# Notebook sniff from Chap 5 Exploring and Denoising Your Data Set

In [None]:
import pandas as pd
# Load the rent data set from CSV into a DataFrame.
df = pd.read_csv("data/rent.csv")
print(df.shape) # print (rows, columns)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Work with just these five columns; `price` is the column later cells
# split off as the prediction target.
df_num = df[['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']]

In [None]:
# Features are every column except `price`; `price` itself is the target.
X_train, y_train = df_num.drop(columns='price'), df_num['price']

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with 100 trees; it is only constructed here, not fit.
rf = RandomForestRegressor(n_estimators=100,
                           n_jobs=-1)        # train w/all CPU cores

In [None]:
# Fit the forest to the training features and target.
rf.fit(X_train, y_train)

In [None]:
# score() is evaluated on the same data the model was fit to, so this is a
# training score, not a validation score.
r2 = rf.score(X_train, y_train)
print(format(r2, ".4f"))

In [None]:
# Refit with out-of-bag scoring switched on so the fitted forest exposes
# `oob_score_`, then record and report that score for the unfiltered data.
rf = RandomForestRegressor(
    oob_score=True,      # get error estimate
    n_estimators=100,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the estimator itself
noisy_oob_r2 = rf.oob_score_
print(f"OOB score {noisy_oob_r2:.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np

# Estimate validation error on the unfiltered data: draw 7 independent random
# 80/20 train/validation splits, fit a fresh forest on each, and average the
# per-split mean absolute error (MAE).
X, y = df_num.drop('price', axis=1), df_num['price']

errors = []
print("Validation MAE trials:", end='')
for i in range(7):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_predicted = rf.predict(X_test)
    e = mean_absolute_error(y_test, y_predicted)
    print(f" ${e:.0f}", end='')   # per-trial MAE, all on one output line
    errors.append(e)
print()                           # terminate the per-trial line
noisy_avg_mae = np.mean(errors)
print(f"Average validation MAE ${noisy_avg_mae:.0f}")

In [None]:
# Our usual book color palette: color name -> hex RGB string.
bookcolors = dict(
    crimson='#a50026', red='#d73027',
    redorange='#f46d43', orange='#fdae61',
    yellow='#fee090', sky='#e0f3f8',
    babyblue='#abd9e9', lightblue='#74add1',
    blue='#4575b4', purple='#313695',
)

# Histogram of the `bedrooms` column of df_num.
# NOTE(review): `plt` is not imported in any visible cell -- presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.
fig, ax = plt.subplots()
ax.hist(df_num['bedrooms'], color=bookcolors['blue'])
ax.set_ylabel('Num Apts')
ax.set_xlabel('Num Bedrooms')
plt.show()

In [None]:
# Book color palette: color name -> hex RGB string.
bookcolors = dict(
    crimson='#a50026',
    red='#d73027',
    redorange='#f46d43',
    orange='#fdae61',
    yellow='#fee090',
    sky='#e0f3f8',
    babyblue='#abd9e9',
    lightblue='#74add1',
    blue='#4575b4',
    purple='#313695',
)

In [None]:
# Rows of df with exactly 1 bathroom and 1 bedroom that fall inside a narrow
# latitude/longitude window (latitude bounds exclusive, longitude inclusive).
df_local = df.loc[
    (df['bathrooms'] == 1) & (df['bedrooms'] == 1)
    & (df['latitude'] > 40.764) & (df['latitude'] < 40.7678)
    & (df['longitude'] >= -73.9844) & (df['longitude'] <= -73.9842)
]

In [None]:
# Filter all records (training/testing): keep only rows whose price lies
# strictly between 1,000 and 10,000.
df_clean = df_num.loc[
    (df_num['price'] > 1_000) & (df_num['price'] < 10_000)
]

In [None]:
# 45-bin histogram of the prices remaining in df_clean.
# NOTE(review): `plt` is not imported in any visible cell -- presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.
fig, ax = plt.subplots()
ax.hist(df_clean['price'], color=bookcolors['blue'], bins=45)
ax.set_ylabel('Num Apts at that price')
ax.set_xlabel('Clipped Price')
plt.show()

In [None]:
# Alternative to dropping rows: clamp every price into the middle 98% of the
# distribution. np.percentile returns the bounds in the order requested, so
# the 1st percentile is the lower bound and the 99th is the upper bound.
lower, upper = np.percentile(df.price, [1, 99])
clipped = np.clip(df.price, lower, upper)

# 45-bin histogram of the clamped prices.
# NOTE(review): `plt` is not imported in any visible cell -- presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.
fig,ax = plt.subplots()
ax.set_xlabel('Prices in middle 98% range')
ax.set_ylabel('Num Apts at that price')
ax.hist(clipped, bins=45, color=bookcolors['blue'])
plt.show()

In [None]:
# Rows whose longitude or latitude is exactly 0.
# NOTE(review): a zero coordinate presumably marks a missing location -- confirm.
df_missing = df_clean.loc[
    (df_clean['longitude'] == 0) | (df_clean['latitude'] == 0)
]

In [None]:
# Drop every row where either coordinate is 0. A row is kept only when BOTH
# coordinates are nonzero: the complement of "lon==0 OR lat==0" is
# "lon!=0 AND lat!=0". Combining the tests with `|` would keep rows that have
# exactly one zero coordinate.
df_clean = df_clean[(df_clean.longitude!=0) & (df_clean.latitude!=0)]

In [None]:
# Keep only rows strictly inside a latitude/longitude bounding box.
# NOTE(review): the box presumably encloses the New York City area -- confirm.
df_clean = df_clean.loc[
    (df_clean.latitude > 40.55) & (df_clean.latitude < 40.94)
    & (df_clean.longitude > -74.1) & (df_clean.longitude < -73.67)
]

In [None]:
# Refit the forest on the cleaned data and report its out-of-bag score.
X = df_clean.drop(columns='price')
y = df_clean['price']
rf = RandomForestRegressor(
    oob_score=True,      # get error estimate
    n_jobs=-1,           # parallelize
    n_estimators=100,
)
rf.fit(X, y)
clean_oob_r2 = rf.oob_score_
print(f"Validation OOB score {clean_oob_r2:.4f}")

In [None]:
def test_MAE(X, y, n_trials=7, outliers=True, verbose=True,
             test_size=0.20, max_price=10_000, n_estimators=100):
    """Return the average validation MAE of a random forest over random splits.

    Each trial draws a fresh random train/validation split of ``(X, y)``,
    fits a new ``RandomForestRegressor`` on the training part, and records
    the mean absolute error of its predictions on the validation part.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split`` and the forest.
        Must support boolean-mask indexing when ``outliers`` is False.
    y : target values aligned with ``X``; must support ``<`` comparison and
        boolean-mask indexing when ``outliers`` is False.
    n_trials : number of random splits to average over.
    outliers : if False, validation rows whose target is ``>= max_price`` are
        dropped before scoring. The model is still trained on every
        training row.
    verbose : if True, print each trial's MAE on a single line.
    test_size : fraction (or count) of rows held out for validation,
        passed straight to ``train_test_split``.
    max_price : cutoff used when ``outliers`` is False.
    n_estimators : number of trees in each forest.

    Returns
    -------
    The mean of the per-trial MAE values (``np.mean`` of the trial list).
    """
    errors = []
    if verbose:
        print("Validation MAE trials:", end='')
    for _ in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)
        rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1)
        rf.fit(X_train, y_train)
        if not outliers:  # test only reasonable apts?
            # Build the mask once, from the unfiltered y_test, and apply it
            # to both features and target so they stay aligned.
            keep = y_test < max_price
            X_test, y_test = X_test[keep], y_test[keep]
        e = mean_absolute_error(y_test, rf.predict(X_test))
        if verbose:
            print(f" ${e:.0f}", end='')
        errors.append(e)
    if verbose:
        print()  # terminate the per-trial line
    return np.mean(errors)
    
# Average validation MAE on the current X, y using test_MAE's default settings.
clean_avg_mae = test_MAE(X, y)
print(f"Average clean validation MAE ${clean_avg_mae:.0f}")

In [None]:
from sklearn.linear_model import Lasso

# Linear baseline: fit a Lasso model on one random 80/20 split and report
# its score on the training part and on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
lm = Lasso(alpha=0.5).fit(X_train, y_train)  # create and fit linear model
print(
    f"LM Training score {lm.score(X_train, y_train):.4f}",
    f"LM Validation score {lm.score(X_test, y_test):.4f}",
    sep="\n",
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 2000 estimators, fit on the current train split and
# scored on both the training part and the held-out part.
gbr = GradientBoostingRegressor(n_estimators=2000).fit(X_train, y_train)
print(
    f"GB Training score {gbr.score(X_train, y_train):.4f}",
    f"GB Validation score {gbr.score(X_test, y_test):.4f}",
    sep="\n",
)