Assignment Overview
In this assignment we were tasked with using machine learning to build a regression model that predicts housing prices from a multitude of factors. We had a training and testing dataset to build our model, then a holdout dataset that we predicted prices for.
I winsorized my data to reduce the effect of outliers, then used a Ridge estimator in a pipeline with GridSearchCV to optimize my model's alpha and k values.
My overall result was an R^2 of 0.883 when predicting the prices on my test dataset.
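For reference, ridge regression minimizes the usual least-squares loss plus an L2 penalty on the coefficients; the alpha tuned below is exactly the penalty weight:

$$\hat{\beta} = \arg\min_{\beta}\; \lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_2^2$$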
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport  # pandas_profiling is deprecated in favor of ydata-profiling
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
After looking at the key stats for the housing data, it’s clear that there are significant outliers, given the extreme max values and large standard deviations. Winsorizing the data at the 1% and 99% tails greatly reduced the outliers and the standard deviations. After doing this, I noticed that my model’s R^2 values became a lot more stable.
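The clipping itself is done by a course-provided helper, winsorizer_with_missing. As a minimal sketch of the idea (my own, assuming 1%/99% defaults — the real helper's signature may differ), it boils down to clipping each column at its tail quantiles, with NaNs passing through untouched:

# rough sketch of the helper's core logic; pandas' DataFrame.clip leaves
# NaNs as NaN, hence the "with missing" in the helper's name
def winsorize_sketch(df, cols, low=0.01, high=0.99):
    out = df.copy()
    out[cols] = out[cols].clip(lower=out[cols].quantile(low),
                               upper=out[cols].quantile(high),
                               axis=1)
    return out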
from sklearn.feature_selection import SelectKBest, f_classif
from winsorizer_with_missing import winsorizer_with_missing
import warnings
# load the training data; the target is the log of sale price
housing = pd.read_csv('input_data2/housing_train.csv')
y = np.log(housing.v_SalePrice)
housing.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| v_MS_SubClass | 1941.0 | 58.088614 | 42.946015 | 20.0 | 20.00 | 50.0 | 70.00 | 190.0 |
| v_Lot_Frontage | 1620.0 | 69.301235 | 23.978101 | 21.0 | 58.00 | 68.0 | 80.00 | 313.0 |
| v_Lot_Area | 1941.0 | 10284.770222 | 7832.295527 | 1470.0 | 7420.00 | 9450.0 | 11631.00 | 164660.0 |
| v_Overall_Qual | 1941.0 | 6.113344 | 1.401594 | 1.0 | 5.00 | 6.0 | 7.00 | 10.0 |
| v_Overall_Cond | 1941.0 | 5.568264 | 1.087465 | 1.0 | 5.00 | 5.0 | 6.00 | 9.0 |
| v_Year_Built | 1941.0 | 1971.321999 | 30.209933 | 1872.0 | 1953.00 | 1973.0 | 2001.00 | 2008.0 |
| v_Year_Remod/Add | 1941.0 | 1984.073158 | 20.837338 | 1950.0 | 1965.00 | 1993.0 | 2004.00 | 2009.0 |
| v_Mas_Vnr_Area | 1923.0 | 104.846074 | 184.982611 | 0.0 | 0.00 | 0.0 | 168.00 | 1600.0 |
| v_BsmtFin_SF_1 | 1940.0 | 436.986598 | 457.815715 | 0.0 | 0.00 | 361.5 | 735.25 | 5644.0 |
| v_BsmtFin_SF_2 | 1940.0 | 49.247938 | 169.555232 | 0.0 | 0.00 | 0.0 | 0.00 | 1474.0 |
| v_Bsmt_Unf_SF | 1940.0 | 567.437629 | 439.600535 | 0.0 | 225.75 | 474.0 | 815.00 | 2153.0 |
| v_Total_Bsmt_SF | 1940.0 | 1053.672165 | 438.662147 | 0.0 | 796.75 | 989.5 | 1295.25 | 6110.0 |
| v_1st_Flr_SF | 1941.0 | 1161.071613 | 396.945408 | 334.0 | 886.00 | 1085.0 | 1383.00 | 5095.0 |
| v_2nd_Flr_SF | 1941.0 | 340.955178 | 434.242152 | 0.0 | 0.00 | 0.0 | 717.00 | 2065.0 |
| v_Low_Qual_Fin_SF | 1941.0 | 4.282329 | 42.943917 | 0.0 | 0.00 | 0.0 | 0.00 | 697.0 |
| v_Gr_Liv_Area | 1941.0 | 1506.309119 | 524.765289 | 334.0 | 1118.00 | 1436.0 | 1755.00 | 5642.0 |
| v_Bsmt_Full_Bath | 1939.0 | 0.415162 | 0.515395 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
| v_Bsmt_Half_Bath | 1939.0 | 0.064982 | 0.254791 | 0.0 | 0.00 | 0.0 | 0.00 | 2.0 |
| v_Full_Bath | 1941.0 | 1.566718 | 0.552693 | 0.0 | 1.00 | 2.0 | 2.00 | 3.0 |
| v_Half_Bath | 1941.0 | 0.378156 | 0.498675 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
| v_Bedroom_AbvGr | 1941.0 | 2.866048 | 0.827732 | 0.0 | 2.00 | 3.0 | 3.00 | 8.0 |
| v_Kitchen_AbvGr | 1941.0 | 1.039155 | 0.201827 | 0.0 | 1.00 | 1.0 | 1.00 | 2.0 |
| v_TotRms_AbvGrd | 1941.0 | 6.465224 | 1.577696 | 2.0 | 5.00 | 6.0 | 7.00 | 15.0 |
| v_Fireplaces | 1941.0 | 0.595569 | 0.641969 | 0.0 | 0.00 | 1.0 | 1.00 | 4.0 |
| v_Garage_Yr_Blt | 1834.0 | 1978.188113 | 25.729319 | 1895.0 | 1960.00 | 1980.0 | 2002.00 | 2207.0 |
| v_Garage_Cars | 1940.0 | 1.769588 | 0.763399 | 0.0 | 1.00 | 2.0 | 2.00 | 4.0 |
| v_Garage_Area | 1940.0 | 472.766495 | 217.089624 | 0.0 | 318.75 | 478.0 | 576.00 | 1488.0 |
| v_Wood_Deck_SF | 1941.0 | 92.458011 | 127.020523 | 0.0 | 0.00 | 0.0 | 168.00 | 1424.0 |
| v_Open_Porch_SF | 1941.0 | 49.157135 | 70.296277 | 0.0 | 0.00 | 28.0 | 72.00 | 742.0 |
| v_Enclosed_Porch | 1941.0 | 22.947965 | 65.249307 | 0.0 | 0.00 | 0.0 | 0.00 | 1012.0 |
| v_3Ssn_Porch | 1941.0 | 2.249871 | 22.416832 | 0.0 | 0.00 | 0.0 | 0.00 | 407.0 |
| v_Screen_Porch | 1941.0 | 16.249871 | 56.748086 | 0.0 | 0.00 | 0.0 | 0.00 | 576.0 |
| v_Pool_Area | 1941.0 | 3.386399 | 43.695267 | 0.0 | 0.00 | 0.0 | 0.00 | 800.0 |
| v_Misc_Val | 1941.0 | 52.553838 | 616.064459 | 0.0 | 0.00 | 0.0 | 0.00 | 17000.0 |
| v_Mo_Sold | 1941.0 | 6.431221 | 2.745199 | 1.0 | 5.00 | 6.0 | 8.00 | 12.0 |
| v_Yr_Sold | 1941.0 | 2006.998454 | 0.801736 | 2006.0 | 2006.00 | 2007.0 | 2008.00 | 2008.0 |
| v_SalePrice | 1941.0 | 182033.238022 | 80407.100395 | 13100.0 | 130000.00 | 161900.0 | 215000.00 | 755000.0 |
# pull the numeric column names to winsorize
housingwin = housing.select_dtypes(include=['int64', 'float64']).columns.tolist()
# winsorize, then drop the target so it isn't used as a feature
new_house = winsorizer_with_missing(housing, cols=housingwin)
new_house = new_house.drop('v_SalePrice', axis=1)
new_house.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| v_MS_SubClass | 1941.0 | 58.088614 | 42.946015 | 20.0 | 20.00 | 50.0 | 70.00 | 190.000 |
| v_Lot_Frontage | 1620.0 | 68.772222 | 21.368657 | 21.0 | 58.00 | 68.0 | 80.00 | 130.000 |
| v_Lot_Area | 1941.0 | 9936.527048 | 4663.839454 | 1896.6 | 7420.00 | 9450.0 | 11631.00 | 30867.700 |
| v_Overall_Qual | 1941.0 | 6.107161 | 1.356487 | 3.0 | 5.00 | 6.0 | 7.00 | 9.000 |
| v_Overall_Cond | 1941.0 | 5.561051 | 1.033465 | 3.0 | 5.00 | 5.0 | 6.00 | 8.000 |
| v_Year_Built | 1941.0 | 1971.415250 | 29.936818 | 1900.0 | 1953.00 | 1973.0 | 2001.00 | 2007.000 |
| v_Year_Remod/Add | 1941.0 | 1984.072643 | 20.836733 | 1950.0 | 1965.00 | 1993.0 | 2004.00 | 2008.000 |
| v_Mas_Vnr_Area | 1923.0 | 100.885450 | 166.196584 | 0.0 | 0.00 | 0.0 | 168.00 | 710.680 |
| v_BsmtFin_SF_1 | 1940.0 | 431.173093 | 428.688416 | 0.0 | 0.00 | 361.5 | 735.25 | 1561.660 |
| v_BsmtFin_SF_2 | 1940.0 | 46.804820 | 155.508430 | 0.0 | 0.00 | 0.0 | 0.00 | 828.745 |
| v_Bsmt_Unf_SF | 1940.0 | 564.993428 | 432.532077 | 0.0 | 225.75 | 474.0 | 815.00 | 1693.575 |
| v_Total_Bsmt_SF | 1940.0 | 1045.872912 | 401.483213 | 0.0 | 796.75 | 989.5 | 1295.25 | 2035.915 |
| v_1st_Flr_SF | 1941.0 | 1153.883565 | 360.236856 | 525.1 | 886.00 | 1085.0 | 1383.00 | 2135.300 |
| v_2nd_Flr_SF | 1941.0 | 338.057702 | 426.044845 | 0.0 | 0.00 | 0.0 | 717.00 | 1359.800 |
| v_Low_Qual_Fin_SF | 1941.0 | 0.000000 | 0.000000 | 0.0 | 0.00 | 0.0 | 0.00 | 0.000 |
| v_Gr_Liv_Area | 1941.0 | 1498.284905 | 482.248402 | 747.7 | 1118.00 | 1436.0 | 1755.00 | 2827.800 |
| v_Bsmt_Full_Bath | 1939.0 | 0.403816 | 0.490788 | 0.0 | 0.00 | 0.0 | 1.00 | 1.000 |
| v_Bsmt_Half_Bath | 1939.0 | 0.062919 | 0.242880 | 0.0 | 0.00 | 0.0 | 0.00 | 1.000 |
| v_Full_Bath | 1941.0 | 1.570325 | 0.545672 | 1.0 | 1.00 | 2.0 | 2.00 | 3.000 |
| v_Half_Bath | 1941.0 | 0.371458 | 0.483319 | 0.0 | 0.00 | 0.0 | 1.00 | 1.000 |
| v_Bedroom_AbvGr | 1941.0 | 2.862442 | 0.795047 | 1.0 | 2.00 | 3.0 | 3.00 | 5.000 |
| v_Kitchen_AbvGr | 1941.0 | 1.040701 | 0.197647 | 1.0 | 1.00 | 1.0 | 1.00 | 2.000 |
| v_TotRms_AbvGrd | 1941.0 | 6.468315 | 1.531340 | 4.0 | 5.00 | 6.0 | 7.00 | 11.000 |
| v_Fireplaces | 1941.0 | 0.590417 | 0.625647 | 0.0 | 0.00 | 1.0 | 1.00 | 2.000 |
| v_Garage_Yr_Blt | 1834.0 | 1978.200654 | 24.813501 | 1920.0 | 1960.00 | 1980.0 | 2002.00 | 2007.000 |
| v_Garage_Cars | 1940.0 | 1.764433 | 0.751600 | 0.0 | 1.00 | 2.0 | 2.00 | 3.000 |
| v_Garage_Area | 1940.0 | 470.119149 | 209.319506 | 0.0 | 318.75 | 478.0 | 576.00 | 953.405 |
| v_Wood_Deck_SF | 1941.0 | 90.223596 | 116.616823 | 0.0 | 0.00 | 0.0 | 168.00 | 465.400 |
| v_Open_Porch_SF | 1941.0 | 47.604328 | 62.672975 | 0.0 | 0.00 | 28.0 | 72.00 | 265.900 |
| v_Enclosed_Porch | 1941.0 | 21.710974 | 57.088499 | 0.0 | 0.00 | 0.0 | 0.00 | 243.600 |
| v_3Ssn_Porch | 1941.0 | 0.000000 | 0.000000 | 0.0 | 0.00 | 0.0 | 0.00 | 0.000 |
| v_Screen_Porch | 1941.0 | 15.062339 | 50.325873 | 0.0 | 0.00 | 0.0 | 0.00 | 227.000 |
| v_Pool_Area | 1941.0 | 0.000000 | 0.000000 | 0.0 | 0.00 | 0.0 | 0.00 | 0.000 |
| v_Misc_Val | 1941.0 | 16.524987 | 92.916618 | 0.0 | 0.00 | 0.0 | 0.00 | 600.000 |
| v_Mo_Sold | 1941.0 | 6.431221 | 2.745199 | 1.0 | 5.00 | 6.0 | 8.00 | 12.000 |
| v_Yr_Sold | 1941.0 | 2006.998454 | 0.801736 | 2006.0 | 2006.00 | 2007.0 | 2008.00 | 2008.000 |
new_house.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 80 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 parcel 1941 non-null object
1 v_MS_SubClass 1941 non-null int64
2 v_MS_Zoning 1941 non-null object
3 v_Lot_Frontage 1620 non-null float64
4 v_Lot_Area 1941 non-null float64
5 v_Street 1941 non-null object
6 v_Alley 136 non-null object
7 v_Lot_Shape 1941 non-null object
8 v_Land_Contour 1941 non-null object
9 v_Utilities 1941 non-null object
10 v_Lot_Config 1941 non-null object
11 v_Land_Slope 1941 non-null object
12 v_Neighborhood 1941 non-null object
13 v_Condition_1 1941 non-null object
14 v_Condition_2 1941 non-null object
15 v_Bldg_Type 1941 non-null object
16 v_House_Style 1941 non-null object
17 v_Overall_Qual 1941 non-null int64
18 v_Overall_Cond 1941 non-null int64
19 v_Year_Built 1941 non-null int64
20 v_Year_Remod/Add 1941 non-null int64
21 v_Roof_Style 1941 non-null object
22 v_Roof_Matl 1941 non-null object
23 v_Exterior_1st 1941 non-null object
24 v_Exterior_2nd 1941 non-null object
25 v_Mas_Vnr_Type 1923 non-null object
26 v_Mas_Vnr_Area 1923 non-null float64
27 v_Exter_Qual 1941 non-null object
28 v_Exter_Cond 1941 non-null object
29 v_Foundation 1941 non-null object
30 v_Bsmt_Qual 1891 non-null object
31 v_Bsmt_Cond 1891 non-null object
32 v_Bsmt_Exposure 1889 non-null object
33 v_BsmtFin_Type_1 1891 non-null object
34 v_BsmtFin_SF_1 1940 non-null float64
35 v_BsmtFin_Type_2 1891 non-null object
36 v_BsmtFin_SF_2 1940 non-null float64
37 v_Bsmt_Unf_SF 1940 non-null float64
38 v_Total_Bsmt_SF 1940 non-null float64
39 v_Heating 1941 non-null object
40 v_Heating_QC 1941 non-null object
41 v_Central_Air 1941 non-null object
42 v_Electrical 1940 non-null object
43 v_1st_Flr_SF 1941 non-null float64
44 v_2nd_Flr_SF 1941 non-null float64
45 v_Low_Qual_Fin_SF 1941 non-null int64
46 v_Gr_Liv_Area 1941 non-null float64
47 v_Bsmt_Full_Bath 1939 non-null float64
48 v_Bsmt_Half_Bath 1939 non-null float64
49 v_Full_Bath 1941 non-null int64
50 v_Half_Bath 1941 non-null int64
51 v_Bedroom_AbvGr 1941 non-null int64
52 v_Kitchen_AbvGr 1941 non-null int64
53 v_Kitchen_Qual 1941 non-null object
54 v_TotRms_AbvGrd 1941 non-null int64
55 v_Functional 1941 non-null object
56 v_Fireplaces 1941 non-null int64
57 v_Fireplace_Qu 1001 non-null object
58 v_Garage_Type 1836 non-null object
59 v_Garage_Yr_Blt 1834 non-null float64
60 v_Garage_Finish 1834 non-null object
61 v_Garage_Cars 1940 non-null float64
62 v_Garage_Area 1940 non-null float64
63 v_Garage_Qual 1834 non-null object
64 v_Garage_Cond 1834 non-null object
65 v_Paved_Drive 1941 non-null object
66 v_Wood_Deck_SF 1941 non-null float64
67 v_Open_Porch_SF 1941 non-null float64
68 v_Enclosed_Porch 1941 non-null float64
69 v_3Ssn_Porch 1941 non-null int64
70 v_Screen_Porch 1941 non-null int64
71 v_Pool_Area 1941 non-null int64
72 v_Pool_QC 13 non-null object
73 v_Fence 365 non-null object
74 v_Misc_Feature 63 non-null object
75 v_Misc_Val 1941 non-null int64
76 v_Mo_Sold 1941 non-null int64
77 v_Yr_Sold 1941 non-null int64
78 v_Sale_Type 1941 non-null object
79 v_Sale_Condition 1941 non-null object
dtypes: float64(18), int64(18), object(44)
memory usage: 1.2+ MB
# split the data into training and testing sets (75/25 by default)
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(new_house, y, random_state=rng)
# the numeric pipe imputes missing values (mean by default) and standardizes;
# the ColumnTransformer below then drops all non-numeric columns except v_Lot_Config
numer_pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler()
)
# one-hot encode the categorical column; handle_unknown='ignore' keeps prediction
# from failing if the holdout contains a category not seen in training
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
preproc_pipe = ColumnTransformer(
    [
        # numerical vars
        ("num_impute", numer_pipe, make_column_selector(dtype_include=np.number)),
        # categorical vars
        ("cat_trans", cat_pipe, ['v_Lot_Config']),
    ],
    remainder='drop',
)
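Before searching over k, it's worth confirming how many columns the preprocessor actually emits (the 36 numeric columns plus the one-hot dummies for v_Lot_Config), since k can't usefully exceed that width. This quick check is my addition, not part of the original run:

# sanity check (added): count the columns that survive preprocessing
print(preproc_pipe.fit_transform(X_train).shape[1])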
# use GridSearchCV to select the optimal alpha and k values for the ridge model
# (note: SelectKBest is scored here with f_classif, which is designed for
# classification targets; f_regression is the analog for a continuous y)
pipe = Pipeline([('columntransformer', preproc_pipe),
                 ('feature_create', SelectKBest(f_classif, k=5)),
                 ('feature_select', 'passthrough'),
                 ('clf', Ridge())
                 ])
param_grid = [{'feature_create__k': [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
               'clf__alpha': [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]}]
# 11 k values x 10 alphas x 5 folds = 550 fits
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=param_grid,
                           cv=5,
                           scoring='r2')
warnings.filterwarnings("ignore")
# fit once; GridSearchCV.fit returns the fitted search object itself
results = grid_search.fit(X_train, y_train)
results_df = pd.DataFrame(results.cv_results_)
# keep the columns of interest from the CV results
results_df = results_df[['params', 'mean_test_score', 'std_test_score']]
# break the alpha and k values out of the params dict for graphing later
results_df['alpha'] = results_df['params'].apply(lambda x: x.get('clf__alpha'))
results_df['k'] = results_df['params'].apply(lambda x: x.get('feature_create__k'))
# the best mean CV R^2, and the largest std dev across all configurations
print(results_df['mean_test_score'].max())
print(results_df['std_test_score'].max())
results_df.sort_values('mean_test_score').tail(15)
0.8873861426042128
0.021320530443543578
| | params | mean_test_score | std_test_score | alpha | k |
|---|---|---|---|---|---|
| 63 | {'clf__alpha': 22, 'feature_create__k': 33} | 0.887215 | 0.018216 | 22 | 33 |
| 74 | {'clf__alpha': 23, 'feature_create__k': 33} | 0.887226 | 0.018223 | 23 | 33 |
| 85 | {'clf__alpha': 24, 'feature_create__k': 33} | 0.887234 | 0.018231 | 24 | 33 |
| 96 | {'clf__alpha': 25, 'feature_create__k': 33} | 0.887242 | 0.018238 | 25 | 33 |
| 10 | {'clf__alpha': 17, 'feature_create__k': 35} | 0.887242 | 0.017957 | 17 | 35 |
| 107 | {'clf__alpha': 26, 'feature_create__k': 33} | 0.887248 | 0.018246 | 26 | 33 |
| 21 | {'clf__alpha': 18, 'feature_create__k': 35} | 0.887266 | 0.017966 | 18 | 35 |
| 32 | {'clf__alpha': 19, 'feature_create__k': 35} | 0.887288 | 0.017974 | 19 | 35 |
| 43 | {'clf__alpha': 20, 'feature_create__k': 35} | 0.887307 | 0.017983 | 20 | 35 |
| 54 | {'clf__alpha': 21, 'feature_create__k': 35} | 0.887324 | 0.017993 | 21 | 35 |
| 65 | {'clf__alpha': 22, 'feature_create__k': 35} | 0.887340 | 0.018003 | 22 | 35 |
| 76 | {'clf__alpha': 23, 'feature_create__k': 35} | 0.887354 | 0.018013 | 23 | 35 |
| 87 | {'clf__alpha': 24, 'feature_create__k': 35} | 0.887366 | 0.018023 | 24 | 35 |
| 98 | {'clf__alpha': 25, 'feature_create__k': 35} | 0.887377 | 0.018034 | 25 | 35 |
| 109 | {'clf__alpha': 26, 'feature_create__k': 35} | 0.887386 | 0.018044 | 26 | 35 |
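For reference, the fitted search also exposes the winner directly (not printed in the original run); per the bottom row of the table it should be alpha = 26 with k = 35:

# the best combination by mean CV R^2 (bottom row of the table above)
print(grid_search.best_params_)  # expect {'clf__alpha': 26, 'feature_create__k': 35}
print(grid_search.best_score_)   # expect ~0.8874, matching the max printed above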
# visualize the tradeoff between mean R2 and its std dev across the grid
sns.scatterplot(data=results_df, x='std_test_score', y='mean_test_score')
plt.xlabel('Std Dev Test Scores')
plt.ylabel('Mean R2 Test Scores')
plt.title('Optimizing R2 & Std Dev')

# pull the best estimator; GridSearchCV already refit it on the full training
# data (refit=True by default), so this explicit fit is just belt-and-suspenders
best_pipe = grid_search.best_estimator_
best_pipe.fit(X_train, y_train)
# use the trained model to predict the test set and check the R^2
y_test_predict = best_pipe.predict(X_test)
test_score = r2_score(y_test, y_test_predict)
print(test_score)
0.8830394937046963
Ok, I’m happy with that R^2, let’s test this puppy out!
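One caveat before the holdout step: the target is log(SalePrice), so the R^2 above and the predictions below are on the log scale; np.exp would recover dollar-scale prices if they were needed. A quick illustrative check (my addition, not part of the grading pipeline):

# optional sanity check: convert a few log-scale predictions back to dollars
print(np.exp(y_test_predict[:5]))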
# refit on ALL the training data, then predict the holdout set,
# store the predictions in a df, and convert to csv for grading
best_pipe.fit(new_house, y)
holdout = pd.read_csv('input_data2/housing_holdout.csv')
holdout_x_vals = holdout.drop('parcel', axis=1)
y_pred = best_pipe.predict(holdout_x_vals)
df_out = pd.DataFrame({'parcel': holdout['parcel'],
                       'prediction': y_pred})
df_out.to_csv('submission/MY_PREDICTIONS.csv', index=False)