# Exploratory analysis and baseline regression for a house-price data set.
#
# The script reads "train.csv" and "test.csv" from the working directory,
# plots a few views of the SalePrice target, compares three regressors on
# standardised data, then fits a random forest on the raw features and
# writes its predictions for the test set to "./Predictions.csv".

# Import the required modules.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# seaborn is a plotting layer on top of matplotlib.
# NOTE(review): depending on the seaborn version, merely importing it may
# replace matplotlib's default plotting style.
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import gaussian_process, linear_model, preprocessing, svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

data_train = pd.read_csv("train.csv")

# --- Target distribution ---------------------------------------------------
print(data_train['SalePrice'].describe())
plt.figure()
sns.distplot(data_train['SalePrice'])

# Skewness and kurtosis of the target.
print("Skewness: %f" % data_train['SalePrice'].skew())
print("Kurtosis: %f" % data_train['SalePrice'].kurt())

# --- SalePrice against CentralAir and OverallQual ---------------------------
# Each box plot gets its own figure; drawing them all on the current axes
# would stack them on top of the distribution plot above.
for var in ('CentralAir', 'OverallQual'):
    plt.figure()
    data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
    fig = sns.boxplot(x=var, y="SalePrice", data=data)
    fig.set_ylim(0, 800000)
plt.show()

# --- Correlation heat maps --------------------------------------------------
# Restrict to numeric columns explicitly: recent pandas versions raise
# instead of silently skipping non-numeric columns in DataFrame.corr().
corrmat = data_train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

# Show the k columns with the largest correlation to SalePrice (SalePrice
# itself included) so the most relevant features can be picked for modelling.
k = 10
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(data_train[cols].values.T)
sns.set(font_scale=1.25)
plt.figure()  # separate figure so this map does not overdraw the full one
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
sns.set()

# --- Pairwise scatter plots of the selected columns -------------------------
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars',
        'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
# The facet size is left at its default (2.5); the keyword that sets it was
# renamed between seaborn releases, so it is not passed explicitly.
sns.pairplot(data_train[cols])

# --- Model comparison on standardised data ----------------------------------
# Feature columns; this order is the order the models are trained on.
cols = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
        'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
x = data_train[cols].values
y = data_train['SalePrice'].values

x_scaled = preprocessing.StandardScaler().fit_transform(x)
# StandardScaler needs a 2-D input; flatten the result again so the target
# is 1-D and `y_pred - y_test` below is element-wise rather than an
# accidental (n, n) broadcast.
y_scaled = preprocessing.StandardScaler().fit_transform(
    y.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y_scaled, test_size=0.33, random_state=42)

clfs = {
    'svm': svm.SVR(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=400),
    'BayesianRidge': linear_model.BayesianRidge(),
}
for name, model in clfs.items():
    # Best effort: a failure in one model is reported and the loop goes on.
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Mean absolute error in standardised units. A plain sum of signed
        # errors would let over- and under-predictions cancel out.
        print(name + " cost:" + str(np.mean(np.abs(y_pred - y_test))))
    except Exception as e:
        print(name + " Error:")
        print(str(e))

# --- Final model on the raw (unscaled) features -----------------------------
# Same split parameters as above; only the training part is used here.
X_train, _, y_train, _ = train_test_split(
    x, y, test_size=0.33, random_state=42)
rfr = RandomForestRegressor(n_estimators=400)
rfr.fit(X_train, y_train)

# --- Predict on the test set -------------------------------------------------
data_test = pd.read_csv("test.csv")
# Select the features in exactly the training column order: the model is fed
# a bare array, so a different order would silently feed values into the
# wrong feature positions.
# NOTE(review): the fill values are hard-coded constants carried over from
# the original script; presumably they are column means - confirm.
data_test_x = data_test[cols].fillna(
    {'GarageCars': 1.766118, 'TotalBsmtSF': 1046.117970})
y_te_pred = rfr.predict(data_test_x.values)

prediction = pd.DataFrame(y_te_pred, columns=['SalePrice'])
result = pd.concat([data_test['Id'], prediction], axis=1)
result.to_csv('./Predictions.csv', index=False)

plt.show()