"""Explore house-price training data, compare regressors, and write predictions.

The script runs top to bottom:

1. Load ``train.csv`` and plot the distribution and correlations of
   ``SalePrice``.
2. Compare three regressors on standardized data using a hold-out split.
3. Fit a random forest on the unscaled features and write predictions for
   ``test.csv`` to ``./Predictions.csv``.
"""

# Import the required modules.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# seaborn is used for plotting and wraps matplotlib.
# Original author's note: once seaborn is imported, matplotlib's default
# plotting style is overridden by seaborn's style.
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import gaussian_process, linear_model, preprocessing, svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Feature columns fed to every model. The order matters: the models are
# fitted on plain arrays, so prediction inputs must use this exact order.
FEATURES = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
            'FullBath', 'TotRmsAbvGrd', 'YearBuilt']

# Replacement values for missing test-set entries, kept from the original
# script. NOTE(review): presumably these are column means -- confirm.
TEST_FILL_VALUES = {'GarageCars': 1.766118, 'TotalBsmtSF': 1046.117970}

# Shared seed so the split and the random forests are reproducible.
RANDOM_STATE = 42

# ---------------------------------------------------------------------------
# Exploratory analysis
# ---------------------------------------------------------------------------
data_train = pd.read_csv("train.csv")
print(data_train['SalePrice'].describe())
# NOTE(review): `distplot` is deprecated in recent seaborn releases; confirm
# the installed version still provides it before upgrading.
sns.distplot(data_train['SalePrice'])

# Skewness and kurtosis of the target.
print("Skewness: %f" % data_train['SalePrice'].skew())
print("Kurtosis: %f" % data_train['SalePrice'].kurt())

# Correlate numeric columns only: calling corr() on a frame that still holds
# string columns raises on newer pandas versions.
corrmat = data_train.select_dtypes(include=['number', 'bool']).corr()
_, corr_ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=corr_ax)

# Show the 10 features most correlated with SalePrice so that the strongly
# related columns are easy to spot for the later analysis.
k = 10
top_cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(data_train[top_cols].values.T)
sns.set(font_scale=1.25)
# Draw on a fresh figure; otherwise this heatmap is painted over the full
# correlation matrix above.
_, top_ax = plt.subplots()
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
            annot_kws={'size': 10}, yticklabels=top_cols.values,
            xticklabels=top_cols.values, ax=top_ax)
sns.set()

# NOTE(review): `size` is the older name of pairplot's `height` argument;
# confirm the installed seaborn version still accepts it.
sns.pairplot(data_train[['SalePrice'] + FEATURES], size=2.5)

# ---------------------------------------------------------------------------
# Model comparison on standardized data
# ---------------------------------------------------------------------------
# Get the data.
x = data_train[FEATURES].values
y = data_train['SalePrice'].values
x_scaled = preprocessing.StandardScaler().fit_transform(x)
# StandardScaler needs a 2-D input; flatten the result again so the target is
# 1-D and `y_pred - y_test` below is element-wise instead of broadcasting to
# an (n, n) matrix.
y_scaled = preprocessing.StandardScaler().fit_transform(
    y.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y_scaled, test_size=0.33, random_state=RANDOM_STATE)

clfs = {
    'svm': svm.SVR(),
    'RandomForestRegressor': RandomForestRegressor(
        n_estimators=400, random_state=RANDOM_STATE),
    'BayesianRidge': linear_model.BayesianRidge(),
}
for name, model in clfs.items():
    # Best effort: a failing model is reported and the comparison continues.
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Mean squared error: squaring keeps positive and negative errors
        # from cancelling each other out.
        print(name + " cost:" + str(np.mean((y_pred - y_test) ** 2)))
    except Exception as e:
        print(name + " Error:")
        print(str(e))

# ---------------------------------------------------------------------------
# Final model on unscaled data and test-set predictions
# ---------------------------------------------------------------------------
X_train, _, y_train, _ = train_test_split(
    x, y, test_size=0.33, random_state=RANDOM_STATE)

rfr = RandomForestRegressor(n_estimators=400, random_state=RANDOM_STATE)
rfr.fit(X_train, y_train)

data_test = pd.read_csv("test.csv")
# Report how many values are missing per feature before imputing.
print(data_test[FEATURES].isnull().sum())

# Select the columns in the training order and fill the two columns listed in
# TEST_FILL_VALUES. Building the matrix in any other column order would make
# the forest read the wrong feature in each position.
data_test_x = data_test[FEATURES].fillna(TEST_FILL_VALUES)
y_te_pred = rfr.predict(data_test_x.values)

result = pd.DataFrame({'Id': data_test['Id'], 'SalePrice': y_te_pred})
result.to_csv('./Predictions.csv', index=False)

plt.show()