ADD file via upload

main
pezajhpu8 7 months ago
parent b38166952d
commit 8961bcea23

@@ -0,0 +1,184 @@
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier  # estimator
from sklearn.ensemble import RandomForestClassifier  # estimator
# pipeline
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.preprocessing import StandardScaler  # preprocessor / transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  # pipeline
from sklearn.metrics import accuracy_score  # accuracy computation
# cross-validation
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
# automatic hyperparameter search
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt  # Python plotting library
import seaborn as sns
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import neighbors
import operator
def run_KNN(X, X_train, y_train, K):  # samples to predict, training features, training labels, number of neighbours
    dataSize = X_train.shape[0]
    y_predict = []
    for x in X:
        diff = np.tile(x, (dataSize, 1)) - X_train  # broadcast x over the training set and subtract
        squaredDist = np.sum(diff ** 2, axis=1)  # axis=1: sum over each row
        distance = squaredDist ** 0.5
        # sort the distances in ascending order, take the K nearest samples
        # and count how often each class appears among them
        nearIds = distance.argsort()  # indices that sort the distance array
        classesCount = {}
        for i in range(K):
            y = y_train[nearIds[i]]  # class of the i-th nearest neighbour
            classesCount[y] = classesCount.get(y, 0) + 1  # 0 is the default count
        # print(classesCount)
        # sort the class counts in descending order (by the second tuple element)
        sortClassesCount = sorted(classesCount.items(), key=operator.itemgetter(1), reverse=True)
        y_predict.append(sortClassesCount[0][0])  # predict the most frequent class
    return y_predict
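# Minimal usage sketch of run_KNN with hypothetical toy arrays (illustration only):
#   X_tr = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
#   y_tr = np.array(['A', 'A', 'B', 'B'])
#   run_KNN(np.array([[0.1, 0.0]]), X_tr, y_tr, K=3)   # -> ['B']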
sns.set_style("whitegrid")
filename = 'iris/iris.data'
data = pd.read_csv(filename, usecols=[0, 1, 2, 3], header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width"])
target = pd.read_csv(filename, usecols=[4], header=None, names=["type"])
test = pd.read_csv(filename, header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width", "type"])
pipe = make_pipeline(
    StandardScaler(),       # preprocessor / transformer (feature scaling)
    KNeighborsClassifier()  # estimator
)
x = data
y = target
# split the iris dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# find the best k
k_range = range(1, 31)
k_error = []
best_k = 1
min_error = float('inf')
# try k = 1..30 and record the cross-validation error for each value
for k in k_range:
    # 10-fold cross-validation on the training set
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train,
                             np.ravel(y_train), cv=10, scoring='accuracy')
    error = 1 - scores.mean()
    k_error.append(error)
    if error < min_error:
        best_k = k
        min_error = error
# plot the error (y axis) against the value of k (x axis)
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()
print("minimum error:", min_error)
print("best k:", best_k)
# print(type(iris_x[0]))
# print(iris_x[:2])
# print(iris_y[:2])
# print(x_test)
# print(y_test)
# rebuild the pipeline with the selected k and fit it on the training set
pipe = Pipeline(steps=[('standardscaler', StandardScaler()),
                       ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=best_k))])
pipe.fit(x_train, np.ravel(y_train))  # np.ravel flattens the (n, 1) label frame to 1-D
ans = pipe.predict(x_test)
# ans1 = run_KNN(x_test.values, x_train.values, np.ravel(y_train), best_k)
print(type(y_test))
y_test = y_test.values
for i in range(0, len(ans)):
    print(" ", y_test[i][0], ans[i])  # true label vs. predicted label
print("scikit-learn KNN accuracy:", accuracy_score(np.ravel(y_test), ans))
# print("hand-written KNN accuracy:", accuracy_score(np.ravel(y_test), ans1))
# kn=KNeighborsClassifier()
# ra=RandomForestClassifier(random_state=0)
# # print(ra.predict(x_test))
# # print(y_test)
# StandardScaler().fit(x).transform(x)
# ra.fit(x_train,y_train)
# print(ra.predict(x_test))
# print(y_test)
# cross-validation with linear regression
X, y = make_regression(n_samples=1000, random_state=0)
lr = LinearRegression()
result = cross_validate(lr, X, y)
print(result['test_score']) # r_squared score is high because dataset is easy
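# cross_validate returns a dict with 'fit_time', 'score_time' and 'test_score'
# arrays (one entry per fold, 5 folds by default); for LinearRegression the
# default score is R^2, which is why the values printed above are close to 1.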
print("1.查看数据集直方图")
print("2.查看数据集波形图")
print("3.查看数据集特征关系图")
print("4.查看数据集箱形图")
print("0.退出")
while(1):
a=input()
if a=='1':
test.hist(bins=15)#绘制测试集各类花瓣直方图
plt.show()
elif a=='2':
test.plot.area(stacked=False)#波形图
plt.show()
elif a=='3':
sns.pairplot(test,hue="type",height=3)
plt.show()
elif a=='4':
fig,axes=plt.subplots(2,2,figsize=(10,8))
sns.boxplot(y=test["sepal length"],x=test["type"],ax=axes[0,0])
sns.boxplot(y=test["sepal width"],x=test["type"],ax=axes[0,1])
sns.boxplot(y=test["petal length"],x=test["type"],ax=axes[1,0])
sns.boxplot(y=test["petal width"],x=test["type"],ax=axes[1,1])
plt.show()
elif a=='0':
break
# pairwise feature relationship plot
# data.plot(kind="kde")  # KDE plot
# sns.heatmap(data.corr(), annot=True, cmap="YlGnBu")
#
# print(data.describe())
# # automatic hyperparameter search (RandomizedSearchCV)
# x, y = fetch_california_housing(return_X_y=True)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# param_distributions = {'n_estimators': randint(1, 5),
#                        'max_depth': randint(5, 10)}
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
#                             n_iter=5,
#                             param_distributions=param_distributions,
#                             random_state=0)
# search.fit(x_train, y_train)
# RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
#                    param_distributions={'max_depth': ...,
#                                         'n_estimators': ...},
#                    random_state=0)
# search.best_params_
# {'max_depth': 9, 'n_estimators': 4}
# print(search.score(x_test, y_test))
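# A runnable sketch of the same randomized-search workflow, using the synthetic
# make_regression data already imported above instead of the California housing
# set so it finishes quickly; the parameter ranges are illustrative assumptions.
# Xs, ys = make_regression(n_samples=200, n_features=10, random_state=0)
# sketch_search = RandomizedSearchCV(
#     estimator=RandomForestRegressor(random_state=0),
#     param_distributions={'n_estimators': randint(1, 5),
#                          'max_depth': randint(5, 10)},
#     n_iter=5, random_state=0)
# sketch_search.fit(Xs, ys)
# print(sketch_search.best_params_)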
# predict the data classes with KNN