Logo

Alex Romanowski

LinkedIn
Resume
GitHub

Assignment 8

Assignment Overview

In this assignment we were tasked with using machine learning to optimize regression that predicts housing prices given a multitude of factors. We had a training and testing dataset to build our model, then a holdout dataset that we predicted the prices for.

I winsorized my data to reduce the effects of outliers, then used the Ridge estimator in a pipeline with GridSearchCV to tune the model's alpha and k hyperparameters.

My overall result was an R^2 of 0.883 when predicting the prices on my test dataset.

# Consolidated imports — the original cell imported several names up to five
# times (train_test_split, make_pipeline, GridSearchCV, ...). Every name that
# was in scope before is still imported here; only the duplicates are gone.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import argsort
from tqdm import tqdm

# NOTE(review): pandas_profiling is deprecated (see the DeprecationWarning
# below) — migrate to `from ydata_profiling import ProfileReport`.
from pandas_profiling import ProfileReport

from sklearn import linear_model
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

C:\Users\AlexanderRomanowski\AppData\Local\Temp\ipykernel_21548\1978820628.py:3: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.
  from pandas_profiling import ProfileReport

Load data and cleaning

After looking at the key stats for the housing data, it’s clear that there are significant outliers given the high max/min/std dev values. Winsorizing the data to the 1% and 99% tails greatly reduced the outliers and the std dev. After doing this, I noticed that my model became a lot more stable in its R^2 values.

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.linear_model import RidgeCV
from winsorizer_with_missing import winsorizer_with_missing
import warnings
# Load the training data and model log(sale price): the log transform pulls in
# the right-skewed price distribution so the Ridge residuals behave better.
housing = pd.read_csv('input_data2/housing_train.csv')
y = np.log(housing.v_SalePrice)

# Summary stats — the large max / std-dev values here are the outliers that
# motivate winsorizing in the next cell.
housing.describe().T
count mean std min 25% 50% 75% max
v_MS_SubClass 1941.0 58.088614 42.946015 20.0 20.00 50.0 70.00 190.0
v_Lot_Frontage 1620.0 69.301235 23.978101 21.0 58.00 68.0 80.00 313.0
v_Lot_Area 1941.0 10284.770222 7832.295527 1470.0 7420.00 9450.0 11631.00 164660.0
v_Overall_Qual 1941.0 6.113344 1.401594 1.0 5.00 6.0 7.00 10.0
v_Overall_Cond 1941.0 5.568264 1.087465 1.0 5.00 5.0 6.00 9.0
v_Year_Built 1941.0 1971.321999 30.209933 1872.0 1953.00 1973.0 2001.00 2008.0
v_Year_Remod/Add 1941.0 1984.073158 20.837338 1950.0 1965.00 1993.0 2004.00 2009.0
v_Mas_Vnr_Area 1923.0 104.846074 184.982611 0.0 0.00 0.0 168.00 1600.0
v_BsmtFin_SF_1 1940.0 436.986598 457.815715 0.0 0.00 361.5 735.25 5644.0
v_BsmtFin_SF_2 1940.0 49.247938 169.555232 0.0 0.00 0.0 0.00 1474.0
v_Bsmt_Unf_SF 1940.0 567.437629 439.600535 0.0 225.75 474.0 815.00 2153.0
v_Total_Bsmt_SF 1940.0 1053.672165 438.662147 0.0 796.75 989.5 1295.25 6110.0
v_1st_Flr_SF 1941.0 1161.071613 396.945408 334.0 886.00 1085.0 1383.00 5095.0
v_2nd_Flr_SF 1941.0 340.955178 434.242152 0.0 0.00 0.0 717.00 2065.0
v_Low_Qual_Fin_SF 1941.0 4.282329 42.943917 0.0 0.00 0.0 0.00 697.0
v_Gr_Liv_Area 1941.0 1506.309119 524.765289 334.0 1118.00 1436.0 1755.00 5642.0
v_Bsmt_Full_Bath 1939.0 0.415162 0.515395 0.0 0.00 0.0 1.00 2.0
v_Bsmt_Half_Bath 1939.0 0.064982 0.254791 0.0 0.00 0.0 0.00 2.0
v_Full_Bath 1941.0 1.566718 0.552693 0.0 1.00 2.0 2.00 3.0
v_Half_Bath 1941.0 0.378156 0.498675 0.0 0.00 0.0 1.00 2.0
v_Bedroom_AbvGr 1941.0 2.866048 0.827732 0.0 2.00 3.0 3.00 8.0
v_Kitchen_AbvGr 1941.0 1.039155 0.201827 0.0 1.00 1.0 1.00 2.0
v_TotRms_AbvGrd 1941.0 6.465224 1.577696 2.0 5.00 6.0 7.00 15.0
v_Fireplaces 1941.0 0.595569 0.641969 0.0 0.00 1.0 1.00 4.0
v_Garage_Yr_Blt 1834.0 1978.188113 25.729319 1895.0 1960.00 1980.0 2002.00 2207.0
v_Garage_Cars 1940.0 1.769588 0.763399 0.0 1.00 2.0 2.00 4.0
v_Garage_Area 1940.0 472.766495 217.089624 0.0 318.75 478.0 576.00 1488.0
v_Wood_Deck_SF 1941.0 92.458011 127.020523 0.0 0.00 0.0 168.00 1424.0
v_Open_Porch_SF 1941.0 49.157135 70.296277 0.0 0.00 28.0 72.00 742.0
v_Enclosed_Porch 1941.0 22.947965 65.249307 0.0 0.00 0.0 0.00 1012.0
v_3Ssn_Porch 1941.0 2.249871 22.416832 0.0 0.00 0.0 0.00 407.0
v_Screen_Porch 1941.0 16.249871 56.748086 0.0 0.00 0.0 0.00 576.0
v_Pool_Area 1941.0 3.386399 43.695267 0.0 0.00 0.0 0.00 800.0
v_Misc_Val 1941.0 52.553838 616.064459 0.0 0.00 0.0 0.00 17000.0
v_Mo_Sold 1941.0 6.431221 2.745199 1.0 5.00 6.0 8.00 12.0
v_Yr_Sold 1941.0 2006.998454 0.801736 2006.0 2006.00 2007.0 2008.00 2008.0
v_SalePrice 1941.0 182033.238022 80407.100395 13100.0 130000.00 161900.0 215000.00 755000.0
# Winsorize every numeric column (clip to the 1%/99% tails) to damp outliers,
# then drop the target column so it can't leak into the feature matrix.
numeric_cols = list(housing.select_dtypes(include=['int64', 'float64']).columns)
new_house = winsorizer_with_missing(housing, cols=numeric_cols)
new_house = new_house.drop('v_SalePrice', axis=1)
C:\Users\AlexanderRomanowski\Documents\HW\fin 377\asgn-08-ajr423\winsorizer_with_missing.py:45: FutureWarning: Downcasting integer-dtype results in .where is deprecated and will change in a future version. To retain the old behavior, explicitly cast the results to the desired dtype.
  df[cols] = df[cols].clip(lower=df[cols].quantile(low_),
# Re-check the summary stats: maxima are now clipped (e.g. Lot_Area max drops
# from 164,660 to ~30,868) and the std devs shrink accordingly.
new_house.describe().T
count mean std min 25% 50% 75% max
v_MS_SubClass 1941.0 58.088614 42.946015 20.0 20.00 50.0 70.00 190.000
v_Lot_Frontage 1620.0 68.772222 21.368657 21.0 58.00 68.0 80.00 130.000
v_Lot_Area 1941.0 9936.527048 4663.839454 1896.6 7420.00 9450.0 11631.00 30867.700
v_Overall_Qual 1941.0 6.107161 1.356487 3.0 5.00 6.0 7.00 9.000
v_Overall_Cond 1941.0 5.561051 1.033465 3.0 5.00 5.0 6.00 8.000
v_Year_Built 1941.0 1971.415250 29.936818 1900.0 1953.00 1973.0 2001.00 2007.000
v_Year_Remod/Add 1941.0 1984.072643 20.836733 1950.0 1965.00 1993.0 2004.00 2008.000
v_Mas_Vnr_Area 1923.0 100.885450 166.196584 0.0 0.00 0.0 168.00 710.680
v_BsmtFin_SF_1 1940.0 431.173093 428.688416 0.0 0.00 361.5 735.25 1561.660
v_BsmtFin_SF_2 1940.0 46.804820 155.508430 0.0 0.00 0.0 0.00 828.745
v_Bsmt_Unf_SF 1940.0 564.993428 432.532077 0.0 225.75 474.0 815.00 1693.575
v_Total_Bsmt_SF 1940.0 1045.872912 401.483213 0.0 796.75 989.5 1295.25 2035.915
v_1st_Flr_SF 1941.0 1153.883565 360.236856 525.1 886.00 1085.0 1383.00 2135.300
v_2nd_Flr_SF 1941.0 338.057702 426.044845 0.0 0.00 0.0 717.00 1359.800
v_Low_Qual_Fin_SF 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Gr_Liv_Area 1941.0 1498.284905 482.248402 747.7 1118.00 1436.0 1755.00 2827.800
v_Bsmt_Full_Bath 1939.0 0.403816 0.490788 0.0 0.00 0.0 1.00 1.000
v_Bsmt_Half_Bath 1939.0 0.062919 0.242880 0.0 0.00 0.0 0.00 1.000
v_Full_Bath 1941.0 1.570325 0.545672 1.0 1.00 2.0 2.00 3.000
v_Half_Bath 1941.0 0.371458 0.483319 0.0 0.00 0.0 1.00 1.000
v_Bedroom_AbvGr 1941.0 2.862442 0.795047 1.0 2.00 3.0 3.00 5.000
v_Kitchen_AbvGr 1941.0 1.040701 0.197647 1.0 1.00 1.0 1.00 2.000
v_TotRms_AbvGrd 1941.0 6.468315 1.531340 4.0 5.00 6.0 7.00 11.000
v_Fireplaces 1941.0 0.590417 0.625647 0.0 0.00 1.0 1.00 2.000
v_Garage_Yr_Blt 1834.0 1978.200654 24.813501 1920.0 1960.00 1980.0 2002.00 2007.000
v_Garage_Cars 1940.0 1.764433 0.751600 0.0 1.00 2.0 2.00 3.000
v_Garage_Area 1940.0 470.119149 209.319506 0.0 318.75 478.0 576.00 953.405
v_Wood_Deck_SF 1941.0 90.223596 116.616823 0.0 0.00 0.0 168.00 465.400
v_Open_Porch_SF 1941.0 47.604328 62.672975 0.0 0.00 28.0 72.00 265.900
v_Enclosed_Porch 1941.0 21.710974 57.088499 0.0 0.00 0.0 0.00 243.600
v_3Ssn_Porch 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Screen_Porch 1941.0 15.062339 50.325873 0.0 0.00 0.0 0.00 227.000
v_Pool_Area 1941.0 0.000000 0.000000 0.0 0.00 0.0 0.00 0.000
v_Misc_Val 1941.0 16.524987 92.916618 0.0 0.00 0.0 0.00 600.000
v_Mo_Sold 1941.0 6.431221 2.745199 1.0 5.00 6.0 8.00 12.000
v_Yr_Sold 1941.0 2006.998454 0.801736 2006.0 2006.00 2007.0 2008.00 2008.000
# Inspect dtypes and missingness: 44 object columns, and a few very sparse
# ones (v_Alley, v_Pool_QC, v_Misc_Feature) that the preprocessor will drop.
new_house.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 80 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   parcel             1941 non-null   object 
 1   v_MS_SubClass      1941 non-null   int64  
 2   v_MS_Zoning        1941 non-null   object 
 3   v_Lot_Frontage     1620 non-null   float64
 4   v_Lot_Area         1941 non-null   float64
 5   v_Street           1941 non-null   object 
 6   v_Alley            136 non-null    object 
 7   v_Lot_Shape        1941 non-null   object 
 8   v_Land_Contour     1941 non-null   object 
 9   v_Utilities        1941 non-null   object 
 10  v_Lot_Config       1941 non-null   object 
 11  v_Land_Slope       1941 non-null   object 
 12  v_Neighborhood     1941 non-null   object 
 13  v_Condition_1      1941 non-null   object 
 14  v_Condition_2      1941 non-null   object 
 15  v_Bldg_Type        1941 non-null   object 
 16  v_House_Style      1941 non-null   object 
 17  v_Overall_Qual     1941 non-null   int64  
 18  v_Overall_Cond     1941 non-null   int64  
 19  v_Year_Built       1941 non-null   int64  
 20  v_Year_Remod/Add   1941 non-null   int64  
 21  v_Roof_Style       1941 non-null   object 
 22  v_Roof_Matl        1941 non-null   object 
 23  v_Exterior_1st     1941 non-null   object 
 24  v_Exterior_2nd     1941 non-null   object 
 25  v_Mas_Vnr_Type     1923 non-null   object 
 26  v_Mas_Vnr_Area     1923 non-null   float64
 27  v_Exter_Qual       1941 non-null   object 
 28  v_Exter_Cond       1941 non-null   object 
 29  v_Foundation       1941 non-null   object 
 30  v_Bsmt_Qual        1891 non-null   object 
 31  v_Bsmt_Cond        1891 non-null   object 
 32  v_Bsmt_Exposure    1889 non-null   object 
 33  v_BsmtFin_Type_1   1891 non-null   object 
 34  v_BsmtFin_SF_1     1940 non-null   float64
 35  v_BsmtFin_Type_2   1891 non-null   object 
 36  v_BsmtFin_SF_2     1940 non-null   float64
 37  v_Bsmt_Unf_SF      1940 non-null   float64
 38  v_Total_Bsmt_SF    1940 non-null   float64
 39  v_Heating          1941 non-null   object 
 40  v_Heating_QC       1941 non-null   object 
 41  v_Central_Air      1941 non-null   object 
 42  v_Electrical       1940 non-null   object 
 43  v_1st_Flr_SF       1941 non-null   float64
 44  v_2nd_Flr_SF       1941 non-null   float64
 45  v_Low_Qual_Fin_SF  1941 non-null   int64  
 46  v_Gr_Liv_Area      1941 non-null   float64
 47  v_Bsmt_Full_Bath   1939 non-null   float64
 48  v_Bsmt_Half_Bath   1939 non-null   float64
 49  v_Full_Bath        1941 non-null   int64  
 50  v_Half_Bath        1941 non-null   int64  
 51  v_Bedroom_AbvGr    1941 non-null   int64  
 52  v_Kitchen_AbvGr    1941 non-null   int64  
 53  v_Kitchen_Qual     1941 non-null   object 
 54  v_TotRms_AbvGrd    1941 non-null   int64  
 55  v_Functional       1941 non-null   object 
 56  v_Fireplaces       1941 non-null   int64  
 57  v_Fireplace_Qu     1001 non-null   object 
 58  v_Garage_Type      1836 non-null   object 
 59  v_Garage_Yr_Blt    1834 non-null   float64
 60  v_Garage_Finish    1834 non-null   object 
 61  v_Garage_Cars      1940 non-null   float64
 62  v_Garage_Area      1940 non-null   float64
 63  v_Garage_Qual      1834 non-null   object 
 64  v_Garage_Cond      1834 non-null   object 
 65  v_Paved_Drive      1941 non-null   object 
 66  v_Wood_Deck_SF     1941 non-null   float64
 67  v_Open_Porch_SF    1941 non-null   float64
 68  v_Enclosed_Porch   1941 non-null   float64
 69  v_3Ssn_Porch       1941 non-null   int64  
 70  v_Screen_Porch     1941 non-null   int64  
 71  v_Pool_Area        1941 non-null   int64  
 72  v_Pool_QC          13 non-null     object 
 73  v_Fence            365 non-null    object 
 74  v_Misc_Feature     63 non-null     object 
 75  v_Misc_Val         1941 non-null   int64  
 76  v_Mo_Sold          1941 non-null   int64  
 77  v_Yr_Sold          1941 non-null   int64  
 78  v_Sale_Type        1941 non-null   object 
 79  v_Sale_Condition   1941 non-null   object 
dtypes: float64(18), int64(18), object(44)
memory usage: 1.2+ MB
# splitting the data up
# Seeded RandomState makes the (default 75/25) train/test split reproducible.
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(new_house, y, random_state=rng)
# Preprocessing: mean-impute and standardize every numeric column; one-hot
# encode the single categorical we keep (v_Lot_Config); drop everything else.

numer_pipe = make_pipeline(
    SimpleImputer(),     # mean imputation of missing numeric values
    StandardScaler()     # standardize so Ridge's L2 penalty treats features equally
)

# FIX: handle_unknown='ignore' (previously commented out) so a category that
# appears only in the test/holdout data is encoded as all-zeros instead of
# raising at predict time. Fitting behavior is unchanged.
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preproc_pipe = ColumnTransformer(
    [
        # numerical vars
        ("num_impute", numer_pipe, make_column_selector(dtype_include=np.number)),
        # categorical vars
        ("cat_trans", cat_pipe, ['v_Lot_Config'])
    ],
    remainder='drop'
)
#using grid search cv to select the optimal alpha and k value through ridge estimation
# NOTE(review): the step names are swapped relative to their roles — the
# 'feature_create' slot actually holds the SelectKBest feature *selector*
# (tuned via 'feature_create__k' in param_grid), while 'feature_select' is a
# passthrough placeholder. Renaming them would require updating the grid keys
# and any downstream code that parses them.
pipe = Pipeline([('columntransformer',preproc_pipe),
                 ('feature_create',SelectKBest(f_classif, k=5)), 
                 ('feature_select','passthrough'), 
                 ('clf', Ridge())
                ])

# Grid: k = number of features kept by SelectKBest (the initial k=5 above is
# overridden), alpha = Ridge regularization strength. 11 x 10 = 110 candidates.
param_grid = [    {'feature_create__k': [25,26,27,28,29,30,31,32,33,34,35],
                   'clf__alpha': [17,18,19,20,21,22,23,24,25,26]
                  },
]

# 5-fold CV scored on R^2; with refit=True (default) the best candidate is
# refit on all of the training data after the search.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=param_grid,
                           cv=5,
                           scoring='r2'
                          )
# Silence the (many) sklearn warnings emitted during the 110-candidate sweep.
warnings.filterwarnings("ignore")

# FIX: fit the grid search once. GridSearchCV.fit returns self, so `results`
# is the fitted search object — the original called .fit() twice back-to-back,
# running the entire 110-candidate x 5-fold search a second time for nothing.
results = grid_search.fit(X_train, y_train)
results_df = pd.DataFrame(results.cv_results_)

# keep only the columns needed for ranking and plotting
results_df = results_df[['params','mean_test_score','std_test_score']]

# unpack the tuned hyperparameters into their own columns for graphing later
results_df['alpha'] = results_df['params'].apply(lambda x: x.get('clf__alpha'))
# BUG FIX: the SelectKBest step is named 'feature_create' in the pipeline, so
# its k lives under 'feature_create__k'. The old key 'feature_select__k' does
# not exist in params, so .get() returned None for every row (the k column in
# the printed table below is all None).
results_df['k'] = results_df['params'].apply(lambda x: x.get('feature_create__k'))

# best mean CV R^2 and the widest score dispersion across the grid
print(results_df['mean_test_score'].max())
print(results_df['std_test_score'].max())
results_df.sort_values('mean_test_score').tail(15)
0.8873861426042128
0.021320530443543578
params mean_test_score std_test_score alpha k
63 {'clf__alpha': 22, 'feature_create__k': 33} 0.887215 0.018216 22 None
74 {'clf__alpha': 23, 'feature_create__k': 33} 0.887226 0.018223 23 None
85 {'clf__alpha': 24, 'feature_create__k': 33} 0.887234 0.018231 24 None
96 {'clf__alpha': 25, 'feature_create__k': 33} 0.887242 0.018238 25 None
10 {'clf__alpha': 17, 'feature_create__k': 35} 0.887242 0.017957 17 None
107 {'clf__alpha': 26, 'feature_create__k': 33} 0.887248 0.018246 26 None
21 {'clf__alpha': 18, 'feature_create__k': 35} 0.887266 0.017966 18 None
32 {'clf__alpha': 19, 'feature_create__k': 35} 0.887288 0.017974 19 None
43 {'clf__alpha': 20, 'feature_create__k': 35} 0.887307 0.017983 20 None
54 {'clf__alpha': 21, 'feature_create__k': 35} 0.887324 0.017993 21 None
65 {'clf__alpha': 22, 'feature_create__k': 35} 0.887340 0.018003 22 None
76 {'clf__alpha': 23, 'feature_create__k': 35} 0.887354 0.018013 23 None
87 {'clf__alpha': 24, 'feature_create__k': 35} 0.887366 0.018023 24 None
98 {'clf__alpha': 25, 'feature_create__k': 35} 0.887377 0.018034 25 None
109 {'clf__alpha': 26, 'feature_create__k': 35} 0.887386 0.018044 26 None
# Visualize the stability/performance trade-off across the grid: each point is
# one hyperparameter combo, plotting its mean CV R^2 against its dispersion.
ax = sns.scatterplot(data=results_df, x='std_test_score', y='mean_test_score')
ax.set_xlabel('Std Dev Test Scores')
ax.set_ylabel('Mean R2 Test Scores')
ax.set_title('Optimizing R2 & Std Dev')

Text(0.5, 1.0, 'Optimizing R2 & Std Dev')

png

# Grab the winning pipeline. With GridSearchCV's default refit=True,
# best_estimator_ has ALREADY been refit on all of X_train, so the explicit
# best_pipe.fit(X_train, y_train) the original ran here was redundant work
# and has been removed.
best_pipe = grid_search.best_estimator_

# Score the tuned model on the held-out test split (log-price R^2).
y_test_predict = best_pipe.predict(X_test)
test_score = r2_score(y_test, y_test_predict)
print(test_score)
0.8830394937046963

Ok, I’m happy with that R^2, let’s test this puppy out!

#predict values on our holdout set, store them in a df, convert to csv for grading
# Refit the tuned pipeline on ALL training rows (not just the 75% split) so the
# holdout predictions use every available observation.
best_pipe.fit(new_house,y)

holdout = pd.read_csv('input_data2/housing_holdout.csv')
# Drop the id column; the ColumnTransformer only consumed numeric columns plus
# v_Lot_Config anyway.
holdout_x_vals = holdout.drop('parcel',axis=1)

# NOTE(review): the target was log(v_SalePrice), so these predictions are in
# log-price units — confirm the grader expects log prices (no np.exp applied).
y_pred = best_pipe.predict(holdout_x_vals)

df_out = pd.DataFrame({'parcel':holdout['parcel'],
                       'prediction':y_pred})

df_out.to_csv('submission/MY_PREDICTIONS.csv',index=False)