Logo

Alex Romanowski

LinkedIn
Resume
GitHub

Assignment 8

Assignment Overview

In this assignment we were tasked with using machine learning to optimize regression that predicts housing prices given a multitude of factors. We had a training and testing dataset to build our model, then a holdout dataset that we predicted the prices for.

I winsorized my data to reduce the effects of outliers, then used the Ridge estimator in a pipeline with GridSearchCV to tune the model's alpha and k hyperparameters.

My overall result was an R^2 of 0.883 when predicting the prices on my test dataset.

# Consolidated imports — the original cell imported several names up to five
# times (train_test_split, make_pipeline, GridSearchCV, ...). Every name that
# was in scope before is still imported here; only the duplicates are gone.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import argsort
from tqdm import tqdm

# NOTE(review): pandas_profiling is deprecated (see the DeprecationWarning
# below) — migrate to `from ydata_profiling import ProfileReport`.
from pandas_profiling import ProfileReport

from sklearn import linear_model
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

C:\Users\AlexanderRomanowski\AppData\Local\Temp\ipykernel_21548\1978820628.py:3: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.
  from pandas_profiling import ProfileReport

Load data and cleaning

After looking at the key stats for the housing data, it’s clear that there are significant outliers given the high max/min/std dev values. Winsorizing the data to the 1% and 99% tails greatly reduced the outliers and the std dev. After doing this, I noticed that my model became a lot more stable in its R^2 values.

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.linear_model import RidgeCV
from winsorizer_with_missing import winsorizer_with_missing
import warnings
# Load the training data and model log(sale price): the log transform pulls in
# the right-skewed price distribution so the Ridge residuals behave better.
housing = pd.read_csv('input_data2/housing_train.csv')
y = np.log(housing.v_SalePrice)

# Summary stats — the large max / std-dev values here are the outliers that
# motivate winsorizing in the next cell.
housing.describe().T
count mean std min 25% 50% 75% max
v_MS_SubClass 1941.0 58.088614 42.946015 20.0 20.00 50.0 70.00 190.0
v_Lot_Frontage 1620.0 69.301235 23.978101 21.0 58.00 68.0 80.00 313.0
v_Lot_Area 1941.0 10284.770222 7832.295527 1470.0 7420.00 9450.0 11631.00 164660.0
v_Overall_Qual 1941.0 6.113344 1.401594 1.0 5.00 6.0 7.00 10.0
v_Overall_Cond 1941.0 5.568264 1.087465 1.0 5.00 5.0 6.00 9.0
v_Year_Built 1941.0 1971.321999 30.209933 1872.0 1953.00 1973.0 2001.00 2008.0
v_Year_Remod/Add 1941.0 1984.073158 20.837338 1950.0 1965.00 1993.0 2004.00 2009.0
v_Mas_Vnr_Area 1923.0 104.846074 184.982611 0.0 0.00 0.0 168.00 1600.0
v_BsmtFin_SF_1 1940.0 436.986598 457.815715 0.0 0.00 361.5 735.25 5644.0
v_BsmtFin_SF_2 1940.0 49.247938 169.555232 0.0 0.00 0.0 0.00 1474.0
v_Bsmt_Unf_SF 1940.0 567.437629 439.600535 0.0 225.75 474.0 815.00 2153.0
v_Total_Bsmt_SF 1940.0 1053.672165 438.662147 0.0 796.75 989.5 1295.25 6110.0
v_1st_Flr_SF 1941.0 1161.071613 396.945408 334.0 886.00 1085.0 1383.00 5095.0
v_2nd_Flr_SF 1941.0 340.955178 434.242152 0.0 0.00 0.0 717.00 2065.0
v_Low_Qual_Fin_SF 1941.0 4.282329 42.943917 0.0 0.00 0.0 0.00 697.0
v_Gr_Liv_Area 1941.0 1506.309119 524.765289 334.0 1118.00 1436.0 1755.00 5642.0
v_Bsmt_Full_Bath 1939.0 0.415162 0.515395 0.0 0.00 0.0 1.00 2.0
v_Bsmt_Half_Bath 1939.0 0.064982 0.254791 0.0 0.00 0.0 0.00 2.0
v_Full_Bath 1941.0 1.566718 0.552693 0.0 1.00 2.0 2.00 3.0
v_Half_Bath 1941.0 0.378156 0.498675 0.0 0.00 0.0 1.00 2.0
v_Bedroom_AbvGr 1941.0 2.866048 0.827732 0.0 2.00 3.0 3.00 8.0
v_Kitchen_AbvGr 1941.0 1.039155 0.201827 0.0 1.00 1.0 1.00 2.0
v_TotRms_AbvGrd 1941.0 6.465224 1.577696 2.0 5.00 6.0 7.00 15.0
v_Fireplaces 1941.0 0.595569 0.641969 0.0 0.00 1.0 1.00 4.0
v_Garage_Yr_Blt 1834.0 1978.188113 25.729319 1895.0 1960.00 1980.0 2002.00 2207.0
v_Garage_Cars 1940.0 1.769588 0.763399 0.0 1.00 2.0 2.00 4.0
v_Garage_Area 1940.0 472.766495 217.089624 0.0 318.75 478.0 576.00 1488.0
v_Wood_Deck_SF 1941.0 92.458011 127.020523 0.0 0.00 0.0 168.00 1424.0
v_Open_Porch_SF 1941.0 49.157135 70.296277 0.0 0.00 28.0 72.00 742.0
v_Enclosed_Porch 1941.0 22.947965 65.249307 0.0 0.00 0.0 0.00 1012.0
v_3Ssn_Porch 1941.0 2.249871 22.416832 0.0 0.00 0.0 0.00 407.0
v_Screen_Porch 1941.0 16.249871 56.748086 0.0 0.00 0.0 0.00 576.0
v_Pool_Area 1941.0 3.386399 43.695267 0.0 0.00 0.0 0.00 800.0
v_Misc_Val 1941.0 52.553838 616.064459 0.0 0.00 0.0 0.00 17000.0
v_Mo_Sold 1941.0 6.431221 2.745199 1.0 5.00 6.0 8.00 12.0
v_Yr_Sold 1941.0 2006.998454 0.801736 2006.0 2006.00 2007.0 2008.00 2008.0
v_SalePrice 1941.0 182033.238022 80407.100395 13100.0 130000.00 161900.0 215000.00 755000.0
# Winsorize every numeric column (clip to the 1%/99% tails) to damp outliers,
# then drop the target column so it can't leak into the feature matrix.
numeric_cols = list(housing.select_dtypes(include=['int64', 'float64']).columns)
new_house = winsorizer_with_missing(housing, cols=numeric_cols)
new_house = new_house.drop('v_SalePrice', axis=1)
C:\Users\AlexanderRomanowski\Documents\HW\fin 377\asgn-08-ajr423\winsorizer_with_missing.py:45: FutureWarning: Downcasting integer-dtype results in .where is deprecated and will change in a future version. To retain the old behavior, explicitly cast the results to the desired dtype.
  df[cols] = df[cols].clip(lower=df[cols].quantile(low_),
# Re-check the summary stats: maxima are now clipped (e.g. Lot_Area max drops
# from 164,660 to ~30,868) and the std devs shrink accordingly.
new_house.describe().T
count mean std min 25% 50% 75% max
v_MS_SubClass 1941.0 58.088614 42.946015 20.0 20.00 50.0 70.00 190.000
v_Lot_Frontage 1620.0 68.772222 21.368657 21.0 58.00 68.0 80.00 130.000
v_Lot_Area 1941.0 9936.527048 4663.839454 1896.6 7420.00 9450.0 11631.00 30867.700
v_Overall_Qual 1941.0 6.107161 1.356487 3.0 5.00 6.0 7.00 9.000
v_Overall_Cond 1941.0 5.561051 1.033465 3.0 5.00 5.0 6.00 8.000
v_Year_Built 1941.0 1971.415250 29.936818 1900.0 1953.00 1973.0 2001.00 2007.000
v_Year_Remod/Add 1941.0 1984.072643 20.836733 1950.0 1965.00 1993.0 2004.00 2008.000
v_Mas_Vnr_Area 1923.0 100.885450 166.196584 0.0 0.00 0.0 168.00 710.680
v_BsmtFin_SF_1 1940.0 431.173093 428.688416 0.0 0.00 361.5 735.25 1561.660
v_BsmtFin_SF_2 1940.0 46.804820 155.508430 0.0 0.00 0.0 0.00 828.745
v_Bsmt_Unf_SF 1940.0 564.993428 432.532077 0.0 225.75 474.0 815.00 1693.575
v_Total_Bsmt_SF 1940.0 1045.872912 401.483213 0.0 796.75 989.5 1295.25 2035.915
v_1st_Flr_SF 1941.0 1153.883565 360.236856 525.1 886.00 1085.0 1383.00 2135.300
v_2nd_Flr_SF 1941.0 338.057702 426.044845 0.0 0.00 0.0 717.00 1359.800
v_Low_Qual_Fin_SF 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Gr_Liv_Area 1941.0 1498.284905 482.248402 747.7 1118.00 1436.0 1755.00 2827.800
v_Bsmt_Full_Bath 1939.0 0.403816 0.490788 0.0 0.00 0.0 1.00 1.000
v_Bsmt_Half_Bath 1939.0 0.062919 0.242880 0.0 0.00 0.0 0.00 1.000
v_Full_Bath 1941.0 1.570325 0.545672 1.0 1.00 2.0 2.00 3.000
v_Half_Bath 1941.0 0.371458 0.483319 0.0 0.00 0.0 1.00 1.000
v_Bedroom_AbvGr 1941.0 2.862442 0.795047 1.0 2.00 3.0 3.00 5.000
v_Kitchen_AbvGr 1941.0 1.040701 0.197647 1.0 1.00 1.0 1.00 2.000
v_TotRms_AbvGrd 1941.0 6.468315 1.531340 4.0 5.00 6.0 7.00 11.000
v_Fireplaces 1941.0 0.590417 0.625647 0.0 0.00 1.0 1.00 2.000
v_Garage_Yr_Blt 1834.0 1978.200654 24.813501 1920.0 1960.00 1980.0 2002.00 2007.000
v_Garage_Cars 1940.0 1.764433 0.751600 0.0 1.00 2.0 2.00 3.000
v_Garage_Area 1940.0 470.119149 209.319506 0.0 318.75 478.0 576.00 953.405
v_Wood_Deck_SF 1941.0 90.223596 116.616823 0.0 0.00 0.0 168.00 465.400
v_Open_Porch_SF 1941.0 47.604328 62.672975 0.0 0.00 28.0 72.00 265.900
v_Enclosed_Porch 1941.0 21.710974 57.088499 0.0 0.00 0.0 0.00 243.600
v_3Ssn_Porch 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Screen_Porch 1941.0 15.062339 50.325873 0.0 0.00 0.0 0.00 227.000
v_Pool_Area 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Misc_Val 1941.0 16.524987 92.916618 0.0 0.00 0.0 0.00 600.000
v_Mo_Sold 1941.0 6.431221 2.745199 1.0 5.00 6.0 8.00 12.000
v_Yr_Sold 1941.0 2006.998454 0.801736 2006.0 2006.00 2007.0 2008.00 2008.000
# Inspect dtypes and missingness: 44 object columns, and a few very sparse
# ones (v_Alley, v_Pool_QC, v_Misc_Feature) that the preprocessor will drop.
new_house.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 80 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   parcel             1941 non-null   object 
 1   v_MS_SubClass      1941 non-null   int64  
 2   v_MS_Zoning        1941 non-null   object 
 3   v_Lot_Frontage     1620 non-null   float64
 4   v_Lot_Area         1941 non-null   float64
 5   v_Street           1941 non-null   object 
 6   v_Alley            136 non-null    object 
 7   v_Lot_Shape        1941 non-null   object 
 8   v_Land_Contour     1941 non-null   object 
 9   v_Utilities        1941 non-null   object 
 10  v_Lot_Config       1941 non-null   object 
 11  v_Land_Slope       1941 non-null   object 
 12  v_Neighborhood     1941 non-null   object 
 13  v_Condition_1      1941 non-null   object 
 14  v_Condition_2      1941 non-null   object 
 15  v_Bldg_Type        1941 non-null   object 
 16  v_House_Style      1941 non-null   object 
 17  v_Overall_Qual     1941 non-null   int64  
 18  v_Overall_Cond     1941 non-null   int64  
 19  v_Year_Built       1941 non-null   int64  
 20  v_Year_Remod/Add   1941 non-null   int64  
 21  v_Roof_Style       1941 non-null   object 
 22  v_Roof_Matl        1941 non-null   object 
 23  v_Exterior_1st     1941 non-null   object 
 24  v_Exterior_2nd     1941 non-null   object 
 25  v_Mas_Vnr_Type     1923 non-null   object 
 26  v_Mas_Vnr_Area     1923 non-null   float64
 27  v_Exter_Qual       1941 non-null   object 
 28  v_Exter_Cond       1941 non-null   object 
 29  v_Foundation       1941 non-null   object 
 30  v_Bsmt_Qual        1891 non-null   object 
 31  v_Bsmt_Cond        1891 non-null   object 
 32  v_Bsmt_Exposure    1889 non-null   object 
 33  v_BsmtFin_Type_1   1891 non-null   object 
 34  v_BsmtFin_SF_1     1940 non-null   float64
 35  v_BsmtFin_Type_2   1891 non-null   object 
 36  v_BsmtFin_SF_2     1940 non-null   float64
 37  v_Bsmt_Unf_SF      1940 non-null   float64
 38  v_Total_Bsmt_SF    1940 non-null   float64
 39  v_Heating          1941 non-null   object 
 40  v_Heating_QC       1941 non-null   object 
 41  v_Central_Air      1941 non-null   object 
 42  v_Electrical       1940 non-null   object 
 43  v_1st_Flr_SF       1941 non-null   float64
 44  v_2nd_Flr_SF       1941 non-null   float64
 45  v_Low_Qual_Fin_SF  1941 non-null   int64  
 46  v_Gr_Liv_Area      1941 non-null   float64
 47  v_Bsmt_Full_Bath   1939 non-null   float64
 48  v_Bsmt_Half_Bath   1939 non-null   float64
 49  v_Full_Bath        1941 non-null   int64  
 50  v_Half_Bath        1941 non-null   int64  
 51  v_Bedroom_AbvGr    1941 non-null   int64  
 52  v_Kitchen_AbvGr    1941 non-null   int64  
 53  v_Kitchen_Qual     1941 non-null   object 
 54  v_TotRms_AbvGrd    1941 non-null   int64  
 55  v_Functional       1941 non-null   object 
 56  v_Fireplaces       1941 non-null   int64  
 57  v_Fireplace_Qu     1001 non-null   object 
 58  v_Garage_Type      1836 non-null   object 
 59  v_Garage_Yr_Blt    1834 non-null   float64
 60  v_Garage_Finish    1834 non-null   object 
 61  v_Garage_Cars      1940 non-null   float64
 62  v_Garage_Area      1940 non-null   float64
 63  v_Garage_Qual      1834 non-null   object 
 64  v_Garage_Cond      1834 non-null   object 
 65  v_Paved_Drive      1941 non-null   object 
 66  v_Wood_Deck_SF     1941 non-null   float64
 67  v_Open_Porch_SF    1941 non-null   float64
 68  v_Enclosed_Porch   1941 non-null   float64
 69  v_3Ssn_Porch       1941 non-null   int64  
 70  v_Screen_Porch     1941 non-null   int64  
 71  v_Pool_Area        1941 non-null   int64  
 72  v_Pool_QC          13 non-null     object 
 73  v_Fence            365 non-null    object 
 74  v_Misc_Feature     63 non-null     object 
 75  v_Misc_Val         1941 non-null   int64  
 76  v_Mo_Sold          1941 non-null   int64  
 77  v_Yr_Sold          1941 non-null   int64  
 78  v_Sale_Type        1941 non-null   object 
 79  v_Sale_Condition   1941 non-null   object 
dtypes: float64(18), int64(18), object(44)
memory usage: 1.2+ MB
# splitting the data up
# Seeded RandomState makes the (default 75/25) train/test split reproducible.
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(new_house, y, random_state=rng)
# Preprocessing: mean-impute and standardize every numeric column; one-hot
# encode the single categorical we keep (v_Lot_Config); drop everything else.

numer_pipe = make_pipeline(
    SimpleImputer(),     # mean imputation of missing numeric values
    StandardScaler()     # standardize so Ridge's L2 penalty treats features equally
)

# FIX: handle_unknown='ignore' (previously commented out) so a category that
# appears only in the test/holdout data is encoded as all-zeros instead of
# raising at predict time. Fitting behavior is unchanged.
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preproc_pipe = ColumnTransformer(
    [
        # numerical vars
        ("num_impute", numer_pipe, make_column_selector(dtype_include=np.number)),
        # categorical vars
        ("cat_trans", cat_pipe, ['v_Lot_Config'])
    ],
    remainder='drop'
)
#using grid search cv to select the optimal alpha and k value through ridge estimation
# NOTE(review): the step names are swapped relative to their roles — the
# 'feature_create' slot actually holds the SelectKBest feature *selector*
# (tuned via 'feature_create__k' in param_grid), while 'feature_select' is a
# passthrough placeholder. Renaming them would require updating the grid keys
# and any downstream code that parses them.
pipe = Pipeline([('columntransformer',preproc_pipe),
                 ('feature_create',SelectKBest(f_classif, k=5)), 
                 ('feature_select','passthrough'), 
                 ('clf', Ridge())
                ])

# Grid: k = number of features kept by SelectKBest (the initial k=5 above is
# overridden), alpha = Ridge regularization strength. 11 x 10 = 110 candidates.
param_grid = [    {'feature_create__k': [25,26,27,28,29,30,31,32,33,34,35],
                   'clf__alpha': [17,18,19,20,21,22,23,24,25,26]
                  },
]

# 5-fold CV scored on R^2; with refit=True (default) the best candidate is
# refit on all of the training data after the search.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=param_grid,
                           cv=5,
                           scoring='r2'
                          )
# Silence the (many) sklearn warnings emitted during the 110-candidate sweep.
warnings.filterwarnings("ignore")

# FIX: fit the grid search once. GridSearchCV.fit returns self, so `results`
# is the fitted search object — the original called .fit() twice back-to-back,
# running the entire 110-candidate x 5-fold search a second time for nothing.
results = grid_search.fit(X_train, y_train)
results_df = pd.DataFrame(results.cv_results_)

# keep only the columns needed for ranking and plotting
results_df = results_df[['params','mean_test_score','std_test_score']]

# unpack the tuned hyperparameters into their own columns for graphing later
results_df['alpha'] = results_df['params'].apply(lambda x: x.get('clf__alpha'))
# BUG FIX: the SelectKBest step is named 'feature_create' in the pipeline, so
# its k lives under 'feature_create__k'. The old key 'feature_select__k' does
# not exist in params, so .get() returned None for every row (the k column in
# the printed table below is all None).
results_df['k'] = results_df['params'].apply(lambda x: x.get('feature_create__k'))

# best mean CV R^2 and the widest score dispersion across the grid
print(results_df['mean_test_score'].max())
print(results_df['std_test_score'].max())
results_df.sort_values('mean_test_score').tail(15)
0.8873861426042128
0.021320530443543578
params mean_test_score std_test_score alpha k
63 {'clf__alpha': 22, 'feature_create__k': 33} 0.887215 0.018216 22 None
74 {'clf__alpha': 23, 'feature_create__k': 33} 0.887226 0.018223 23 None
85 {'clf__alpha': 24, 'feature_create__k': 33} 0.887234 0.018231 24 None
96 {'clf__alpha': 25, 'feature_create__k': 33} 0.887242 0.018238 25 None
10 {'clf__alpha': 17, 'feature_create__k': 35} 0.887242 0.017957 17 None
107 {'clf__alpha': 26, 'feature_create__k': 33} 0.887248 0.018246 26 None
21 {'clf__alpha': 18, 'feature_create__k': 35} 0.887266 0.017966 18 None
32 {'clf__alpha': 19, 'feature_create__k': 35} 0.887288 0.017974 19 None
43 {'clf__alpha': 20, 'feature_create__k': 35} 0.887307 0.017983 20 None
54 {'clf__alpha': 21, 'feature_create__k': 35} 0.887324 0.017993 21 None
65 {'clf__alpha': 22, 'feature_create__k': 35} 0.887340 0.018003 22 None
76 {'clf__alpha': 23, 'feature_create__k': 35} 0.887354 0.018013 23 None
87 {'clf__alpha': 24, 'feature_create__k': 35} 0.887366 0.018023 24 None
98 {'clf__alpha': 25, 'feature_create__k': 35} 0.887377 0.018034 25 None
109 {'clf__alpha': 26, 'feature_create__k': 35} 0.887386 0.018044 26 None
# Visualize the stability/performance trade-off across the grid: each point is
# one hyperparameter combo, plotting its mean CV R^2 against its dispersion.
ax = sns.scatterplot(data=results_df, x='std_test_score', y='mean_test_score')
ax.set_xlabel('Std Dev Test Scores')
ax.set_ylabel('Mean R2 Test Scores')
ax.set_title('Optimizing R2 & Std Dev')

Text(0.5, 1.0, 'Optimizing R2 & Std Dev')

png

# Grab the winning pipeline. With GridSearchCV's default refit=True,
# best_estimator_ has ALREADY been refit on all of X_train, so the explicit
# best_pipe.fit(X_train, y_train) the original ran here was redundant work
# and has been removed.
best_pipe = grid_search.best_estimator_

# Score the tuned model on the held-out test split (log-price R^2).
y_test_predict = best_pipe.predict(X_test)
test_score = r2_score(y_test, y_test_predict)
print(test_score)
0.8830394937046963

Ok, I’m happy with that R^2, let’s test this puppy out!

#predict values on our holdout set, store them in a df, convert to csv for grading
# Refit the tuned pipeline on ALL training rows (not just the 75% split) so the
# holdout predictions use every available observation.
best_pipe.fit(new_house,y)

holdout = pd.read_csv('input_data2/housing_holdout.csv')
# Drop the id column; the ColumnTransformer only consumed numeric columns plus
# v_Lot_Config anyway.
holdout_x_vals = holdout.drop('parcel',axis=1)

# NOTE(review): the target was log(v_SalePrice), so these predictions are in
# log-price units — confirm the grader expects log prices (no np.exp applied).
y_pred = best_pipe.predict(holdout_x_vals)

df_out = pd.DataFrame({'parcel':holdout['parcel'],
                       'prediction':y_pred})

df_out.to_csv('submission/MY_PREDICTIONS.csv',index=False)