# Notebook code from Chapter 2: A First Taste of Machine Learning

In [None]:
import pandas as pd  # pd is the conventional short alias for pandas

# Read data/rent-ideal.csv into a DataFrame named rent.
rent = pd.read_csv(filepath_or_buffer="data/rent-ideal.csv")

In [None]:
# Show the first five rows of the table.
print(rent.head(n=5))

In [None]:
# Pull out the price column and report its mean, rounded to whole dollars.
prices = rent.price
avg_rent = prices.mean()
print(f"Average rent is ${avg_rent:.0f}")

In [None]:
# Average every column within each distinct bathroom count, then move the
# group key out of the index and back into an ordinary column.
bybaths = rent.groupby(['bathrooms']).mean().reset_index()

# Print only the bathroom count and the average price for that count.
print(bybaths.loc[:, ['bathrooms', 'price']])

In [None]:
import matplotlib.pyplot as plt

# Line plot of price against bathroom count from bybaths, with a marker
# drawn at each point ('-o').
bybaths.plot.line(x='bathrooms', y='price', style='-o')
plt.show()

In [None]:
# Features: the bathrooms column selected with a list, so X stays a
# one-column DataFrame. Target: the price column.
X = rent[['bathrooms']]
y = rent['price']

In [None]:
# Report the Python types of the feature table and the target.
print(*(type(obj) for obj in (X, y)))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create a random forest with default settings and train it on (X, y).
rf = RandomForestRegressor()
rf.fit(X=X, y=y)

In [None]:
# Predict the price for 0 bathrooms and for 1 bathroom.
# The forest was fit on a DataFrame whose single column is 'bathrooms', so the
# query is passed as a DataFrame with that same column name; a bare nested
# list carries no feature names and makes scikit-learn emit a
# "X does not have valid feature names" warning.
print( rf.predict(pd.DataFrame({'bathrooms': [0, 1]})) )

In [None]:
from sklearn.metrics import mean_absolute_error

# Compare the model's predictions for X against the known prices y: print the
# mean absolute error in dollars and as a percentage of the mean price.
predictions = rf.predict(X)
e = mean_absolute_error(y_true=y, y_pred=predictions)
print(f"${e:.0f} average error; {e*100.0/y.mean():.2f}% error")

In [None]:
# Two features this time: bedrooms and bathrooms; the target is still price.
X = rent[['bedrooms','bathrooms']]
y = rent['price']

# Train a fresh default forest on the new feature set.
rf = RandomForestRegressor()
rf.fit(X=X, y=y)

In [None]:
# Mean absolute error of the current model's predictions for X, in dollars
# and as a percentage of the mean price.
predictions = rf.predict(X)
e = mean_absolute_error(y_true=y, y_pred=predictions)
print(f"${e:.0f} average error; {e*100.0/y.mean():.2f}% error")

In [None]:
# Location-only model: latitude and longitude as the features.
X = rent[['latitude','longitude']]
y = rent['price']

rf = RandomForestRegressor()
rf.fit(X=X, y=y)

# Error is measured on the same rows the forest was just fit on.
e = mean_absolute_error(y_true=y, y_pred=rf.predict(X))
print(f"${e:.0f} average error; {e*100.0/y.mean():.2f}% error")

In [None]:
# All four features together: room counts plus location.
X = rent[['bedrooms','bathrooms','latitude','longitude']]
y = rent['price']

rf = RandomForestRegressor()
rf.fit(X=X, y=y)

# Error is measured on the same rows the forest was just fit on.
e = mean_absolute_error(y_true=y, y_pred=rf.predict(X))
print(f"${e:.0f} average error; {e*100.0/y.mean():.2f}% error")

In [None]:
from sklearn.model_selection import train_test_split

X = rent[['bedrooms','bathrooms','latitude','longitude']]
y = rent['price']

# Hold out 20% of the rows as a test set; the remaining 80% are for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train on the training split only.
rf = RandomForestRegressor()
rf.fit(X=X_train, y=y_train)

# Error is measured on the held-out rows. The percentage is taken relative to
# the mean price over all rows (y), not just the test rows.
e = mean_absolute_error(y_true=y_test, y_pred=rf.predict(X_test))
print(f"${e:.0f} average error; {e*100.0/y.mean():.2f}% error")

In [None]:
from sklearn.model_selection import cross_val_score

k = 5  # number of cross-validation folds

# One score per fold. The scorer is the negated mean absolute error, so the
# mean is sign-flipped below to get a positive dollar amount.
cv_err = cross_val_score(
    estimator=RandomForestRegressor(),
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=k,
)
m_err = -cv_err.mean()
std_err = cv_err.std()

# Print the mean error, twice the spread across folds, and the mean error as a
# percentage of the mean price.
print(f"${m_err:.0f} average error +/-${2*std_err:.2f}; {m_err*100.0/y.mean():.2f}% error")

In [None]:
def validate(model, cv=5):
    """Print the cross-validated mean absolute error of ``model``.

    Cross-validates ``model`` over the module-level ``X`` and ``y`` (they are
    read as globals, not passed in) and prints the mean error in dollars,
    twice the standard deviation of the per-fold scores, and the mean error
    as a percentage of ``y.mean()``. Nothing is returned.

    Parameters:
        model: estimator handed straight to ``cross_val_score``.
        cv: value forwarded as ``cross_val_score``'s ``cv`` argument.
            Defaults to 5, the value that used to be hard-coded.
    """
    # The scorer is the negated mean absolute error, so flip the sign of the
    # mean to report a positive dollar amount.
    cv_err = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
    m_err = -cv_err.mean()
    std_err = cv_err.std()
    print(f"${m_err:.0f} average error +/-${2*std_err:.2f}; {m_err*100.0/y.mean():.2f}% error")

In [None]:
# Cross-validate a forest built with 100 trees.
rf = RandomForestRegressor(n_estimators=100)
validate(model=rf)

In [None]:
from rfpimp import *

# Fit a 100-tree forest on all of (X, y).
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X=X, y=y)

# Compute feature importances for the fitted forest against the same data.
# `importances` is not defined in this file; presumably it is supplied by the
# rfpimp star import above.
I = importances(rf, X, y)

In [None]:
# Recompute the importances using the training split, then draw them as a
# horizontal bar chart with the legend turned off.
I = importances(rf, X_train, y_train)
I.plot.barh(legend=False)

In [None]:
from sklearn import linear_model

# Build a linear regression model and fit it on all of (X, y).
lm = linear_model.LinearRegression()
lm.fit(X=X, y=y)

# validate() runs its own cross-validation over X and y and prints the error.
validate(model=lm)

In [None]:
from sklearn import ensemble

# Build a gradient boosting model and fit it on the training split.
gbr = ensemble.GradientBoostingRegressor()
gbr.fit(X=X_train, y=y_train)

# validate() runs its own cross-validation over X and y and prints the error.
validate(model=gbr)