# Notebook cats from Chap 6 Categorically Speaking

In [None]:
from rfpimp import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the rental listings; the 'created' column is parsed as datetimes.
df = pd.read_csv("data/rent.csv", parse_dates=['created'])

# Keep listings priced strictly between 1,000 and 10,000.
df_clean = df[(df['price'] > 1_000) & (df['price'] < 10_000)]

# Drop rows whose longitude and latitude are both exactly zero.
df_clean = df_clean[~((df_clean['longitude'] == 0) & (df_clean['latitude'] == 0))]

# Keep rows with 40.55 < latitude < 40.94 and -74.1 < longitude < -73.67.
df_clean = df_clean[
    (df_clean['latitude'] > 40.55)
    & (df_clean['latitude'] < 40.94)
    & (df_clean['longitude'] > -74.1)
    & (df_clean['longitude'] < -73.67)
]

# The rest of the notebook refers to the cleaned frame as `df`.
df = df_clean

In [None]:
# Baseline: predict price from the four numeric columns only.
numfeatures = ['bathrooms', 'bedrooms', 'longitude', 'latitude']
X = df[numfeatures]
y = df['price']

# oob_score=True is what makes rf.oob_score_ available after fitting.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True).fit(X, y)
oob_baseline = rf.oob_score_

In [None]:
# Report the size of the fitted forest: node count from rfnnodes() and the
# median of the per-tree values returned by rfmaxdepths().
print(f"{rfnnodes(rf):,d} tree nodes and {np.median(rfmaxdepths(rf))} median tree height")

In [None]:
def showimp(rf, X, y):
    """Plot feature importances, treating the coordinates as one feature.

    Calls ``importances(rf, X, y, features=...)`` and hands the result to
    ``plot_importances``. When ``X`` has both a 'latitude' and a 'longitude'
    column, they are passed as a single grouped entry placed after all other
    columns. If either column is absent, every column is passed on its own
    (previously this case raised ``ValueError`` from ``list.remove``).

    Parameters:
        rf: fitted model, forwarded to ``importances``.
        X: feature table; only ``X.columns`` is inspected here.
        y: target values, forwarded to ``importances``.
    """
    geo = ['latitude', 'longitude']
    columns = list(X.columns)
    if all(name in columns for name in geo):
        features = [name for name in columns if name not in geo]
        features.append(geo)
    else:
        # No complete coordinate pair to group; keep each column separate.
        features = columns

    I = importances(rf, X, y, features=features)
    plot_importances(I, color='#4575b4')
    
# Plot importances for the model currently bound to rf, using the current X and y.
showimp(rf, X, y)

In [None]:
# Encode interest_level as 1/2/3; any value that is not one of these three
# keys becomes NaN, as Series.map does for unmapped values.
df['interest_level'] = df['interest_level'].map({
    'low': 1,
    'medium': 2,
    'high': 3,
})

In [None]:
def test(X, y):
    """Fit a 100-tree random forest on (X, y) and print a one-line summary.

    The summary shows the out-of-bag R^2, the node count from ``rfnnodes``
    and the median of ``rfmaxdepths``. Returns ``(fitted_forest, oob_score)``.
    """
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True).fit(X, y)
    oob, n, h = forest.oob_score_, rfnnodes(forest), np.median(rfmaxdepths(forest))
    print(f"OOB R^2 {oob:.5f} using {n:,d} tree nodes with {h} median tree height")
    return forest, oob

# Evaluate the numeric features plus the encoded interest level.
X = df[['interest_level'] + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
# Integer-encode display_address through an ordered categorical. pandas gives
# missing values the code -1, so the +1 shift turns them into 0.
df['display_address_cat'] = (
    df['display_address'].astype('category').cat.as_ordered().cat.codes + 1
)

In [None]:
# Evaluate the numeric features plus the encoded display address.
X = df[['display_address_cat'] + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
# Count rows per manager_id, then attach that count to every row.
manager_ids = df['manager_id']
managers_count = manager_ids.value_counts()
df['mgr_apt_count'] = manager_ids.map(managers_count)

In [None]:
# Evaluate the numeric features plus the address code and manager count.
X = df[['display_address_cat', 'mgr_apt_count'] + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
# Plot importances for the model currently bound to rf, using the current X and y.
showimp(rf, X, y)

In [None]:
# Replace missing text with '' and lower-case it so that later substring
# tests on these columns are case-insensitive.
for text_col in ('description', 'features'):
    df[text_col] = df[text_col].fillna('').str.lower()

In [None]:
# has apartment been renovated?
df['renov'] = df['description'].str.contains("renov")

# One boolean column per amenity keyword. The `features` text has already been
# lower-cased, so compare against the lower-cased keyword; matching the
# literal 'Elevator' could never succeed and left that column all False.
# Column names keep their original spelling because later cells select them.
for w in ['doorman', 'parking', 'garage', 'laundry',
          'Elevator', 'fitness center', 'dishwasher']:
    df[w] = df['features'].str.contains(w.lower())

In [None]:
# Whitespace-separated word count of each description.
df["num_desc_words"] = [len(text.split()) for text in df["description"]]
# Comma-separated piece count of each features string; note that an empty
# string still counts as 1 because ''.split(",") returns [''].
df["num_features"] = [len(text.split(",")) for text in df["features"]]

In [None]:
# Comma-separated piece count of each `photos` string.
df["num_photos"] = [len(entry.split(",")) for entry in df["photos"]]

In [None]:
# Columns derived from the text fields: counts, keyword flags, renovation flag.
textfeatures = (
    ['num_photos', 'num_desc_words', 'num_features']
    + ['doorman', 'parking', 'garage', 'laundry']
    + ['Elevator', 'fitness center', 'dishwasher']
    + ['renov']
)
X = df[textfeatures + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
# Plot importances for the model currently bound to rf, using the current X and y.
showimp(rf, X, y)

In [None]:
# Bedrooms per bathroom; the 1 added to the denominator avoids div by 0.
df["beds_to_baths"] = df["bedrooms"] / (1 + df["bathrooms"])
X = df[['beds_to_baths'] + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
# NOTE(review): this feature is computed from `price`, which is also the
# target y below, so the model is given information about the target.
# Presumably a deliberate demonstration of leakage; confirm before reusing.
bedrooms, price = df["bedrooms"], df["price"]
df["beds_per_price"] = bedrooms / price
X = df[['beds_per_price'] + numfeatures]
y = df['price']
rf, oob = test(X, y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. No random_state is passed, so the split differs
# from run to run.
splits = train_test_split(df, test_size=0.20)
df_test = splits[1]
# Copy before adding a column so the assignment targets an independent frame.
df_train = splits[0].copy()
df_train['beds_per_price'] = df_train.bedrooms / df_train.price

In [None]:
# Build a bedrooms -> beds_per_price lookup from the training rows only.
# dict() keeps the last value for a repeated key, so each bedroom count maps
# to the value of the last training row with that count.
bpmap = dict(zip(df_train["bedrooms"],df_train["beds_per_price"]))
df_test = df_test.copy()
df_test["beds_per_price"] = df_test["bedrooms"].map(bpmap)
# Bedroom counts not present in bpmap map to NaN; fill those with the mean of
# the mapped test values.
avg = np.mean(df_test['beds_per_price'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form warns on recent pandas and
# leaves df_test unchanged when Copy-on-Write is enabled.
df_test['beds_per_price'] = df_test['beds_per_price'].fillna(avg)

In [None]:
X_train, y_train = df_train[['beds_per_price']+numfeatures], df_train['price']
X_test, y_test = df_test[['beds_per_price']+numfeatures], df_test['price']

rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)
# Score on the held-out rows, not on the training set. The forest is built
# without oob_score=True, so this is a validation R^2 rather than an
# out-of-bag score; the printed label says so. The variable keeps its
# original name in case other cells refer to it.
oob_overfit = rf.score(X_test, y_test)
print(f"Validation R^2 {oob_overfit:.5f}")
print(f"{rfnnodes(rf):,d} nodes, {np.median(rfmaxdepths(rf))} median height")

In [None]:
# Reference points keyed by neighborhood name. Each value is
# [latitude, longitude]: the distance loop that consumes this dict subtracts
# element 0 from df.latitude and element 1 from df.longitude. The keys become
# DataFrame column names.
hoods = {
    "hells" : [40.7622, -73.9924],
    "astoria" : [40.7796684, -73.9215888],
    "Evillage" : [40.723163774, -73.984829394],
    "Wvillage" : [40.73578, -74.00357],
    "LowerEast" : [40.715033, -73.9842724],
    "UpperEast" : [40.768163594, -73.959329496],
    "ParkSlope" : [40.672404, -73.977063],
    # NOTE(review): longitude -74.17431 lies outside the range kept by the
    # cleaning filter (longitude > -74.1), unlike every other entry here.
    # Verify these coordinates.
    "Prospect Park" : [40.93704, -74.17431],
    "Crown Heights" : [40.657830702, -73.940162906],
    "financial" : [40.703830518, -74.005666644],
    "brooklynheights" : [40.7022621909, -73.9871760513],
    "gowanus" : [40.673, -73.997]
}

In [None]:
# Add one column per neighborhood holding the manhattan (L1) distance from
# each listing to that neighborhood's reference point, in raw lat/long units.
for hood, (hood_lat, hood_lon) in hoods.items():
    lat_gap = np.abs(df.latitude - hood_lat)
    lon_gap = np.abs(df.longitude - hood_lon)
    df[hood] = lat_gap + lon_gap

In [None]:
# The neighborhood distance columns are named after the keys of `hoods`.
hoodfeatures = list(hoods)
X = df[numfeatures + hoodfeatures]
y = df['price']
rf, oob_hood = test(X, y)

In [None]:
# Refit without the raw coordinates, leaving only the other columns of X.
X = X.drop(columns=['longitude', 'latitude'])
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True).fit(X, y)
print(f"{rf.oob_score_:.4f} score {rfnnodes(rf):,d} tree nodes and {np.median(rfmaxdepths(rf))} median tree height")

In [None]:
# Combine every feature group; y is the price Series bound in an earlier cell.
combined_columns = ['interest_level'] + textfeatures + hoodfeatures + numfeatures
X = df[combined_columns]
rf, oob_combined = test(X, y)

In [None]:
# Plot importances for the model currently bound to rf, using the current X and y.
showimp(rf, X, y)