# Import the required modules
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# seaborn is a plotting layer built on top of matplotlib.
# NOTE(review): the original comment says importing seaborn overrides
# matplotlib's default style; that holds for old seaborn releases, newer
# ones presumably only restyle after sns.set() -- confirm for your version.
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import preprocessing
from sklearn import linear_model, svm, gaussian_process
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Suppress every warning for the remainder of the run.
warnings.filterwarnings('ignore')

# Load the training set and keep a handle on the target column.
data_train = pd.read_csv("train.csv")
sale_price = data_train['SalePrice']

# Summary statistics and distribution plot of the target.
print(sale_price.describe())
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the minimum seaborn version is settled.
sns.distplot(sale_price)

# Skewness and kurtosis of the target.
print("Skewness: %f" % sale_price.skew())
print("Kurtosis: %f" % sale_price.kurt())

# Box plots of SalePrice against CentralAir and OverallQual.
# Each plot gets its own figure: seaborn draws on the *current* axes, so
# without plt.figure() the two box plots (and any earlier plot) would be
# painted on top of one another.
for var in ('CentralAir', 'OverallQual'):
    data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
    plt.figure()
    fig = sns.boxplot(x=var, y="SalePrice", data=data)  # an Axes, despite the name
    fig.axis(ymin=0, ymax=800000)  # same price range on both plots

plt.show()

# Correlation matrix over the numeric (and boolean) columns only.
# DataFrame.corr() no longer drops non-numeric columns silently in
# pandas >= 2.0 and raises instead; train.csv presumably contains text
# columns (CentralAir is plotted as a category), so filter explicitly.
corrmat = data_train.select_dtypes(include=['number', 'bool']).corr()

# Heatmap of the full correlation matrix, drawn on a dedicated wide figure.
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

# Number of features shown in the zoomed-in correlation matrix: the k
# columns most correlated with SalePrice, which tells us which features
# are worth using in the later analysis.
k = 10
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(data_train[cols].values.T)

sns.set(font_scale=1.25)
# Open a new figure so this heatmap is not drawn over whatever the current
# axes already holds.
plt.figure()
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
# Reset seaborn to its default theme, undoing the font_scale change above.
sns.set()

# Pairwise scatter plots of the target and the selected features.
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars',
        'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
# `height` (inches per facet) replaced the old `size` keyword in
# seaborn 0.9; current releases reject `size`.
sns.pairplot(data_train[cols], height=2.5)

# Build the feature matrix and the target vector.
cols = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
        'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
x = data_train[cols].values
y = data_train['SalePrice'].values

# Standardise features and target. StandardScaler needs a 2-D input, so the
# target is reshaped to a column and flattened back afterwards: if it stayed
# (n, 1), `y_pred - y_test` would broadcast (n,) against (n, 1) into an
# (n, n) matrix and the reported cost would be meaningless.
# NOTE(review): the scalers are fitted before the split, so test-set
# statistics leak into training; acceptable for a quick comparison only.
x_scaled = preprocessing.StandardScaler().fit_transform(x)
y_scaled = preprocessing.StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()

X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y_scaled, test_size=0.33, random_state=42)

# Candidate regressors to compare on the held-out split.
clfs = {
    'svm': svm.SVR(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=400),
    'BayesianRidge': linear_model.BayesianRidge(),
}

for clf in clfs:
    try:
        clfs[clf].fit(X_train, y_train)
        y_pred = clfs[clf].predict(X_test)
        # Mean absolute error (in standardised target units); a plain mean
        # of signed errors lets over- and under-predictions cancel out.
        print(clf + " cost:" + str(mean_absolute_error(y_test, y_pred)))
    except Exception as e:
        # Best effort: one failing model must not abort the comparison.
        print(clf + " Error:")
        print(str(e))

# Train the final model on the raw (unscaled) features and target.
cols = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
    'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
]
x = data_train[cols].values
y = data_train['SalePrice'].values

# Same 67/33 split and seed as the model comparison.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.33)

# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestRegressor(n_estimators=400).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Alias under which the fitted forest is used for the test-set predictions.
rfr = clf

# Load the test set and report missing values in the model's feature columns.
data_test = pd.read_csv("test.csv")
print(data_test[cols].isnull().sum())

# Select the features in exactly the order used for training (`cols`).
# The forest was fitted on a bare NumPy array, so columns are matched by
# position; assembling them in any other order silently feeds the model
# the wrong features.
# The fill values are hard-coded constants -- presumably the column means;
# TODO confirm where they came from.
data_test_x = data_test[cols].fillna(
    {'GarageCars': 1.766118, 'TotalBsmtSF': 1046.117970})
print(data_test_x.isnull().sum())

# Predict and write the submission file (Id, SalePrice).
x = data_test_x.values
y_te_pred = rfr.predict(x)
prediction = pd.DataFrame(y_te_pred, columns=['SalePrice'])
result = pd.concat([data_test['Id'], prediction], axis=1)
result.to_csv('./Predictions.csv', index=False)

# Display any figures that are still pending.
plt.show()