#Import all the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
from datetime import date
import warnings
# Notebook-wide setup: silence all warnings and switch plotly to its
# offline notebook renderer.
warnings.filterwarnings('ignore')
init_notebook_mode(connected=True)

# Part 1: Data preprocessing
# Read in the data and look at summary information.
train = pd.read_csv(r"C:\SPS\HousePrices\train.csv")
print("shape of the train data set:", train.shape)
train.head()
train.describe()
# Get a general idea of the missing values and the data types.
train.info()

# Missing-data table: one row per column, holding the number and the
# fraction of missing entries, ordered from most to least missing.
total = train.isnull().sum().sort_values(ascending=False)
# Derive the fraction from the already-sorted counts so both columns share
# exactly the same index order (no second, independent sort).
percent = total / train.shape[0]
missing_table = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_table.head(20)

# Drop every variable with more than 20% missing data; the author judged
# these columns to be of little importance when buying a house.
# `axis` is passed by keyword: pandas no longer accepts it positionally.
too_sparse = missing_table[missing_table['Percent'] > 0.2].index
train = train.drop(too_sparse, axis=1)
# Next: impute the median for numerical variables and the most frequent
# value for categorical variables.
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for a pandas DataFrame.

    ``fit`` records one fill value per column: the most frequent value for
    object, string and categorical columns, and the median for every other
    column. ``transform`` fills missing entries with those recorded values.
    """

    def __init__(self):
        pass

    @staticmethod
    def _is_label_dtype(dtype):
        """Return True when a column of this dtype is filled by frequency."""
        return (
            dtype == np.dtype('O')
            or isinstance(dtype, pd.api.types.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    @staticmethod
    def _most_frequent(column):
        """Return the most frequent non-missing value, or NaN if none exists."""
        if not column.notna().any():
            # An all-missing column has no usable value_counts() entry;
            # NaN as the fill value leaves the column unchanged.
            return np.nan
        return column.value_counts().index[0]

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Args:
            X: DataFrame whose columns are inspected.
            y: Ignored; accepted for transformer API compatibility.

        Returns:
            This imputer, with ``self.fill`` set to a Series indexed by
            ``X.columns``.
        """
        fill_values = [
            self._most_frequent(X[name])
            if self._is_label_dtype(X[name].dtype)
            else X[name].median()
            for name in X
        ]
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries filled from ``self.fill``.

        Args:
            X: DataFrame to fill; ``fit`` must have been called first.
            y: Ignored; accepted for transformer API compatibility.
        """
        return X.fillna(self.fill)
# Fill the remaining missing values, then check the largest per-column
# count of missing entries that is left.
train = DataFrameImputer().fit_transform(train)
train.isnull().sum().max()

# Part 2: Exploratory Data Analysis
train["SalePrice"].describe()

# Histograms of the sale price and of its natural logarithm, side by side.
sale_price = train["SalePrice"]
log_sale_price = np.log(sale_price)  # computed once, reused below
price_his = go.Histogram(x=sale_price, name='price')
log_price_his = go.Histogram(x=log_sale_price, name='log price')
fig = tools.make_subplots(rows=1, cols=2)
fig.append_trace(price_his, 1, 1)
fig.append_trace(log_price_his, 1, 2)
# Render with the offline `iplot` imported at the top of the file, matching
# the init_notebook_mode() call, so no Plotly cloud credentials are needed.
iplot(fig)

# Skewness and kurtosis of the raw and the log-transformed price.
price = [sale_price.skew(), sale_price.kurt()]
log_price = [log_sale_price.skew(), log_sale_price.kurt()]
price_dic = {'SalePrice': price, 'Log_SalePrice': log_price}
dev_table = pd.DataFrame(price_dic, index=["Skewness", "Kurtosis"])
dev_table
# Numerical variables: keep the float64/int64 columns and skip the first of
# them (presumably an identifier column -- confirm against the dataset).
num_var = train.select_dtypes(include=['float64', 'int64']).iloc[:, 1:]
corr = num_var.corr()
# Correlations with SalePrice, strongest first. iloc[1:] drops the top
# entry, which is expected to be SalePrice's correlation with itself.
corr_list = corr["SalePrice"].sort_values(axis=0, ascending=False).iloc[1:]

# Correlation matrix (heatmap).
data = [go.Heatmap(z=corr.values.tolist(),
                   x=corr.index,
                   y=corr.index,
                   colorscale='Blackbody')]
# Render with the offline `iplot` imported at the top of the file, matching
# the init_notebook_mode() call, so no Plotly cloud credentials are needed.
iplot(data)

# Scatter plots of the nine variables most correlated with SalePrice.
sns.set()
# NOTE: `size` is the older seaborn name for `height`; kept so the call
# still works on the seaborn version this script was written against.
sns.pairplot(train[corr_list.index[:9]], size=2.5)
plt.show()
# Categorical variables: every column that is not float64/int64, minus the
# first such column.
# NOTE(review): the `.iloc[:, 1:]` mirrors the numeric extraction, but here
# it may be discarding a genuine feature rather than an id column -- confirm
# against the dataset schema.
non_numeric = train.select_dtypes(exclude=['float64', 'int64'])
cat_var = non_numeric.iloc[:, 1:]
cat_var.shape

# Relationship between SalePrice and selected features, one box plot per
# figure: (feature, figure size, x tick rotation or None to leave as is).
boxplot_specs = (
    ('Neighborhood', (24, 12), 45),
    ('OverallQual', (20, 12), None),
    ('YearBuilt', (24, 12), 90),
)
for feature, fig_size, tick_rotation in boxplot_specs:
    plt.figure(figsize=fig_size)
    sns.boxplot(x=feature, y='SalePrice', data=train)
    if tick_rotation is not None:
        xt = plt.xticks(rotation=tick_rotation)