In [1]:
#Import all the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
from datetime import date
import warnings

warnings.filterwarnings('ignore')
init_notebook_mode(connected=True)
In [2]:
#Part1 Data preprocessing
#Read in Data and Look at Summary Information
train = pd.read_csv(r"C:\SPS\HousePrices\train.csv")
In [3]:
#shape
print("shape of the train data set:",train.shape)

#summary
train.head()
train.describe()

#get a general idea about the missing value and date type
train.info()
shape of the train data set: (1460, 81)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 non-null object
MasVnrArea       1452 non-null float64
ExterQual        1460 non-null object
ExterCond        1460 non-null object
Foundation       1460 non-null object
BsmtQual         1423 non-null object
BsmtCond         1423 non-null object
BsmtExposure     1422 non-null object
BsmtFinType1     1423 non-null object
BsmtFinSF1       1460 non-null int64
BsmtFinType2     1422 non-null object
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
Heating          1460 non-null object
HeatingQC        1460 non-null object
CentralAir       1460 non-null object
Electrical       1459 non-null object
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int64
Functional       1460 non-null object
Fireplaces       1460 non-null int64
FireplaceQu      770 non-null object
GarageType       1379 non-null object
GarageYrBlt      1379 non-null float64
GarageFinish     1379 non-null object
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
GarageQual       1379 non-null object
GarageCond       1379 non-null object
PavedDrive       1460 non-null object
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64
EnclosedPorch    1460 non-null int64
3SsnPorch        1460 non-null int64
ScreenPorch      1460 non-null int64
PoolArea         1460 non-null int64
PoolQC           7 non-null object
Fence            281 non-null object
MiscFeature      54 non-null object
MiscVal          1460 non-null int64
MoSold           1460 non-null int64
YrSold           1460 non-null int64
SaleType         1460 non-null object
SaleCondition    1460 non-null object
SalePrice        1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [4]:
#missing data map
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.shape[0]).sort_values(ascending=False)
missing_table = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_table.head(20)
Out[4]:
Total Percent
PoolQC 1453 0.995205
MiscFeature 1406 0.963014
Alley 1369 0.937671
Fence 1179 0.807534
FireplaceQu 690 0.472603
LotFrontage 259 0.177397
GarageCond 81 0.055479
GarageType 81 0.055479
GarageYrBlt 81 0.055479
GarageFinish 81 0.055479
GarageQual 81 0.055479
BsmtExposure 38 0.026027
BsmtFinType2 38 0.026027
BsmtFinType1 37 0.025342
BsmtCond 37 0.025342
BsmtQual 37 0.025342
MasVnrArea 8 0.005479
MasVnrType 8 0.005479
Electrical 1 0.000685
Utilities 0 0.000000
In [5]:
#delete those variables contain more than 20% of the missing data, since we know they are not really important when buy a house
train = train.drop(missing_table[missing_table['Percent'] > 0.2].index,1)
In [6]:
#impute median for numerical variables and most frequent for categorical variables
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

train = DataFrameImputer().fit_transform(train)
train.isnull().sum().max()
Out[6]:
0
In [7]:
#Part 2 Exploratory Data Analysis
train["SalePrice"].describe()
Out[7]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
In [8]:
#histogram of the price
price_his = go.Histogram(x=train["SalePrice"],name='price')
log_price_his = go.Histogram(x=np.log(train["SalePrice"]),name='log price')
fig = tools.make_subplots(rows=1, cols=2)
fig.append_trace(price_his, 1, 1)
fig.append_trace(log_price_his, 1, 2)
fig = go.Figure(data=fig)
py.iplot(fig)
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]

Out[8]:
In [9]:
#skewness and kurtosis
price = [train['SalePrice'].skew(),train['SalePrice'].kurt()]
log_price = [np.log(train['SalePrice']).skew(),np.log(train['SalePrice']).kurt()]
price_dic = {'SalePrice': price, 'Log_SalePrice': log_price}
dev_table = pd.DataFrame(price_dic,index=["Skewness","Kurtosis"])
dev_table
Out[9]:
SalePrice Log_SalePrice
Skewness 1.882876 0.121335
Kurtosis 6.536282 0.809532
In [10]:
#Extract numerical variables 
num_var = train.select_dtypes(include = ['float64', 'int64']).iloc[:,1:]
corr = num_var.corr()
corr_list = corr["SalePrice"].sort_values(axis=0,ascending=False).iloc[1:]
In [11]:
#Correlation matrix (heatmap)
data = [go.Heatmap(z=corr.values.tolist(),
                   x=corr.index,
                   y=corr.index,
                   colorscale='Blackbody')]
py.iplot(data)
Out[11]:
In [12]:
#Scatter plots
sns.set()
sns.pairplot(train[corr_list.index[:9]], size = 2.5)
plt.show();
In [13]:
#Extract categorical variables 
cat_var = train.select_dtypes(exclude = ['float64', 'int64']).iloc[:,1:]
cat_var.shape
Out[13]:
(1460, 37)
In [14]:
#Relationship with categorical features

plt.figure(figsize = (24, 12))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice',  data = train)
plt.xticks(rotation=45)
Out[14]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24]),
 <a list of 25 Text xticklabel objects>)
In [15]:
plt.figure(figsize = (20, 12))
sns.boxplot(x = 'OverallQual', y = 'SalePrice',  data = train)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x259eddf4e80>
In [16]:
plt.figure(figsize = (24, 12))
sns.boxplot(x = 'YearBuilt', y = 'SalePrice',  data = train)
xt = plt.xticks(rotation=90)
In [ ]: