# Notebook logs from Chap 5 Exploring and Denoising Your Data Set

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from rfpimp import *

# Named hex colors shared by the plots in this notebook.
bookcolors = {
    'crimson': '#a50026',
    'red': '#d73027',
    'redorange': '#f46d43',
    'orange': '#fdae61',
    'yellow': '#fee090',
    'sky': '#e0f3f8',
    'babyblue': '#abd9e9',
    'lightblue': '#74add1',
    'blue': '#4575b4',
    'purple': '#313695',
}

# Load the rent data set; the 'created' column is parsed as datetimes.
df = pd.read_csv("data/rent.csv", parse_dates=['created'])

# Restrict to the numeric columns, then split into features (X) and the
# target column (y = price).
df_num = df[['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']]
X = df_num.drop(columns='price')
y = df_num['price']

In [None]:
import numpy as np

# Mean price; not referenced again within this cell.
m = np.mean(y)

# Histogram of price with values limited to the range [0, 20000] so the
# x-axis is not stretched by anything beyond that range.
plt.hist(
    x=np.clip(y, 0, 20_000),
    bins=70,
    color=bookcolors['blue'],
)
plt.ylim(bottom=0, top=2000)  # cap the y-axis so the tail is visible
plt.xlabel(xlabel="price")
plt.show()

In [None]:
# Natural log of every price.
y_log = np.log(y)

# Mean of the log prices; not referenced again within this cell.
m = np.mean(y_log)

# Histogram of log(price), with the x-axis restricted to [6.8, 10].
plt.hist(
    x=y_log,
    bins=200,
    color=bookcolors['blue'],
)
plt.xlim(left=6.8, right=10)  # zoom in
plt.xlabel(xlabel="log(price)")
plt.show()

In [None]:
# Pick out the one-bedroom, one-bathroom rows that fall inside a narrow
# latitude/longitude window. Latitude bounds are exclusive, longitude bounds
# are inclusive. The copy makes df_local independent of df so the column
# assignments below do not touch the original frame.
df_local = df.loc[
    df['latitude'].gt(40.764)
    & df['latitude'].lt(40.7678)
    & df['longitude'].between(-73.9844, -73.9842)
    & df['bathrooms'].eq(1)
    & df['bedrooms'].eq(1)
].copy()

# Store the bathroom count as an integer and add a log-transformed price.
df_local['bathrooms'] = df_local['bathrooms'].astype(int)
df_local['log(price)'] = np.log(df_local['price'])

In [None]:
# Train a random forest to predict log(price) from the numeric features and
# report its out-of-bag R^2 score.
X, y = df_num.drop('price', axis=1), df_num['price']

# Apply the natural log to each price. This must be a plain Python
# assignment: a leading '!' would hand the line to the system shell in
# IPython (and is a syntax error in plain Python), leaving y_log unset here.
y_log = np.log(y)

# oob_score=True makes the fitted model expose oob_score_, the score computed
# from out-of-bag samples; n_jobs=-1 lets training use all available cores.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
rf.fit(X, y_log)
log_oob_r2 = rf.oob_score_
print(f"OOB R^2 score for log(price) {log_oob_r2:.4f}")