# Notebook prod-trend from Chap 9 Train, Validate, Test

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from typing import Sequence
from pandas.api.types import is_string_dtype, is_object_dtype, is_categorical_dtype

# Named hex colors used for the plot lines in this notebook.
bookcolors = dict(
    crimson='#a50026',
    red='#d73027',
    redorange='#f46d43',
    orange='#fdae61',
    yellow='#fee090',
    sky='#e0f3f8',
    babyblue='#abd9e9',
    lightblue='#74add1',
    blue='#4575b4',
    purple='#313695',
)

def clean_dates(df):
    """Repair implausible ``YearMade`` values of ``df`` in place.

    Two rules are applied, in this order:

    1. A ``YearMade`` earlier than 1950 is treated as missing (NaN).
    2. A ``YearMade`` later than the year of ``saledate`` is replaced by
       that sale year.

    ``df`` must have a numeric ``YearMade`` column and a datetime
    ``saledate`` column. Nothing is returned; the ``YearMade`` column of
    ``df`` is overwritten.
    """
    sale_year = df['saledate'].dt.year

    # Series.mask builds a fresh (float when needed) column. Writing NaN
    # into an integer column through .loc relies on silent upcasting,
    # which pandas has deprecated for setitem-style assignment.
    year_made = df['YearMade'].mask(df['YearMade'] < 1950)

    # Comparisons against NaN are False, so rows blanked above (or already
    # missing) stay missing rather than being filled with the sale year.
    year_made = year_made.mask(sale_year < year_made, sale_year)

    df['YearMade'] = year_made

def df_split_dates(df,colname):
    """Expand the datetime column ``colname`` of ``df`` into date parts, in place.

    Adds ``saleyear``, ``salemonth``, ``saleday``, ``saledayofweek`` and
    ``saledayofyear``, replaces ``colname`` itself with its int64
    representation, and adds ``age`` = ``saleyear`` - ``YearMade``.
    Returns None.
    """
    dates = df[colname]
    part_for_column = {
        "saleyear": "year",
        "salemonth": "month",
        "saleday": "day",
        "saledayofweek": "dayofweek",
        "saledayofyear": "dayofyear",
    }
    for new_column, part in part_for_column.items():
        df[new_column] = getattr(dates.dt, part)

    # Raw integer timestamp. NOTE: for nanosecond-resolution datetimes this
    # is nanoseconds since 1970-01-01, not seconds.
    df[colname] = dates.astype(np.int64)

    # YearMade may be NaN, in which case age is NaN as well.
    df['age'] = df['saleyear'] - df['YearMade']

def df_normalize_strings(df):
    """Lower-case every string/object column of ``df`` in place.

    After lower-casing, None and the placeholder strings
    'none or unspecified', 'none', '#name?' and '' are turned into np.nan.
    Columns of any other dtype are left untouched. Returns None.
    """
    placeholders = ('none or unspecified', 'none', '#name?', '')
    for name in df.columns:
        column = df[name]
        if not (is_string_dtype(column) or is_object_dtype(column)):
            continue
        cleaned = column.str.lower()
        cleaned = cleaned.fillna(np.nan) # make None -> np.nan
        # Placeholders are compared after lower-casing, hence lower-case here.
        for placeholder in placeholders:
            cleaned = cleaned.replace(placeholder, np.nan)
        df[name] = cleaned

def df_order_product_size(df):
    """Replace the ``ProductSize`` labels of ``df`` with ordinal codes, in place.

    The lookup table keys are lower-case labels, plus np.nan keyed to 0;
    'mini' and 'compact' share code 1. Returns None.
    """
    code_for_label = {
        np.nan: 0,
        'mini': 1,
        'compact': 1,
        'small': 2,
        'medium': 3,
        'large / medium': 4,
        'large': 5,
    }
    coded = df['ProductSize'].map(code_for_label)
    # Assign the bare array, as the original did, so no index alignment occurs.
    df['ProductSize'] = coded.values

# Load the data and prepare it in place. The order of the calls matters:
df = pd.read_feather("data/bulldozer-train-all.feather")
# clean_dates reads saledate through the .dt accessor, so it must run before
# df_split_dates turns saledate into integers.
clean_dates(df)
# Adds the sale* date-part columns and 'age' (computed from the cleaned YearMade).
df_split_dates(df, 'saledate')
# Lower-cases string columns; df_order_product_size looks up lower-case labels.
df_normalize_strings(df)
# Replaces the ProductSize labels with numeric codes (0-5).
df_order_product_size(df)

# Mean sale price per (ProductSize, saleyear). Only SalePrice is averaged:
# taking the mean of every column fails on the string columns with
# pandas >= 2.0, and all the other means were thrown away anyway.
P = (df.groupby(['ProductSize','saleyear'])['SalePrice']
       .mean()
       .reset_index()
       .sort_values('saleyear'))

# One frame per size code; P is already sorted by saleyear and query()
# keeps row order. The variable names are kept as they were, but note the
# coding from df_order_product_size: 5='large', 4='large / medium',
# 3='medium', 2='small' -- the legend labels below follow that coding.
Large = P.query("ProductSize==5")
Med = P.query("ProductSize==4")
Small = P.query("ProductSize==3")
Mini = P.query("ProductSize==2")

# (data, legend label, line color) for each plotted size class.
trends = [
    (Large, "Large", bookcolors['blue']),
    (Med, "Large / Medium", bookcolors['orange']),
    (Small, "Medium", bookcolors['crimson']),
    (Mini, "Small", bookcolors['purple']),
]

fig,ax = plt.subplots(figsize=(7,3))
for trend, label, color in trends:
    ax.plot(trend['saleyear'], trend['SalePrice'], "o-", label=label,
            markersize=3, linewidth=1,
            c=color)
ax.set_ylim(25_000, 55_000)
ax.set_xlabel("Sale year")
ax.set_ylabel("Sale price (dollars)")
ax.legend(loc="center right", fontsize="small", labelspacing=0)