|
|
|
@ -26,9 +26,21 @@ sns.distplot(data_train['SalePrice'])
|
|
|
|
|
# Skewness and kurtosis of the SalePrice distribution.
skewness = data_train['SalePrice'].skew()
print("Skewness: %f" % skewness)
kurtosis = data_train['SalePrice'].kurt()
print("Kurtosis: %f" % kurtosis)
|
|
|
|
|
# CentralAir: box plot of SalePrice grouped by the CentralAir column.
var = 'CentralAir'
data = data_train[['SalePrice', var]]
# Open a fresh figure so this plot gets its own axes instead of being drawn
# on (and sharing them with) whatever figure is currently active.
plt.figure()
# NOTE: despite its name, `fig` holds the Axes object returned by sns.boxplot.
fig = sns.boxplot(x=var, y="SalePrice", data=data)
# Pin the price axis to a fixed range (same range as the OverallQual plot).
fig.set_ylim(0, 800000)
|
|
|
|
|
# OverallQual: box plot of SalePrice grouped by the OverallQual column.
var = 'OverallQual'
data = data_train[['SalePrice', var]]
# Open a fresh figure; without it this box plot would be drawn over the axes
# of the previous plot and the two sets of categories would be mixed together.
plt.figure()
# NOTE: despite its name, `fig` holds the Axes object returned by sns.boxplot.
fig = sns.boxplot(x=var, y="SalePrice", data=data)
# Pin the price axis to a fixed range (same range as the CentralAir plot).
fig.set_ylim(0, 800000)
# Display every figure created so far.
plt.show()
|
|
|
|
|
# Pairwise correlation matrix of the training data.
# Only numeric/boolean columns are kept: calling corr() on a frame that still
# contains string columns raises in pandas >= 2.0, while older versions
# silently dropped them. Selecting the columns explicitly behaves the same on
# every version.
corrmat = data_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(20, 9))
# Cap the colour scale at 0.8 so strong correlations stand out.
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)
# plt.show()

# Number of features shown in the reduced correlation matrix. Looking only at
# the columns most correlated with SalePrice tells us which ones matter most
# for the later analysis.
k = 10
# Labels of the k columns with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Correlation coefficients among those columns; np.corrcoef expects one
# variable per row, hence the transpose.
cm = np.corrcoef(data_train[cols].values.T)
|
|
|
|
@ -38,7 +50,7 @@ hm = sns.heatmap(cm, cbar=True, annot=True, \
|
|
|
|
|
# Apply seaborn's default theme before drawing the pair plot.
sns.set()
# SalePrice plus the features plotted against each other.
cols = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
]
# `size` was renamed to `height` in seaborn 0.9; the old keyword is deprecated.
sns.pairplot(data_train[cols], height=2.5)

# plt.show()
|
|
|
|
|
# Build the feature matrix: the selected predictor columns (SalePrice itself
# is not included) as a NumPy array.
cols = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
]
x = data_train[cols].values
|
|
|
|
|