import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier  # estimator
from sklearn.ensemble import RandomForestClassifier  # estimator
# pipelines
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.preprocessing import StandardScaler  # preprocessor / transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  # pipeline
from sklearn.metrics import accuracy_score  # accuracy computation
# cross-validation
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
# automatic parameter search
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt  # Python plotting library
import seaborn as sns
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import neighbors
import operator
def run_KNN(X, X_train, y_train, K):  # X: samples to classify; X_train/y_train: training features/labels; K: number of nearest neighbours
    dataSize = X_train.shape[0]
    y_predict = []
    for x in X:
        diff = np.tile(x, (dataSize, 1)) - X_train  # broadcast x over the training set and subtract
        squaredDist = np.sum(diff**2, axis=1)  # axis=1 sums over each row
        distance = squaredDist ** 0.5
        # sort the distances in ascending order, take the K nearest samples and count each class
        nearIds = distance.argsort()  # indices that would sort the distances
        classesCount = {}
        for i in range(K):
            y = y_train[nearIds[i]]  # class of the i-th nearest sample
            classesCount[y] = classesCount.get(y, 0) + 1  # 0 is the default count
        # print(classesCount)
        # sort the dict by value in descending order:
        # itemgetter(1) sorts on the count, reverse=True makes it descending
        sortClassesCount = sorted(classesCount.items(), key=operator.itemgetter(1), reverse=True)
        y_predict.append(sortClassesCount[0][0])  # predict the most frequent class among the K neighbours
    return y_predict
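# A minimal sketch of calling run_KNN on made-up toy data (the arrays below
# are purely illustrative, not drawn from the iris file):
# toy_train = np.array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0]])
# toy_labels = np.array(['a', 'a', 'b'])
# run_KNN(np.array([[0.05, 0.05]]), toy_train, toy_labels, 2)  # -> ['a']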
sns.set_style("whitegrid")
filename = 'iris/iris.data'  # forward slash avoids backslash-escape issues in the path
data = pd.read_csv(filename, usecols=[0, 1, 2, 3], header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width"])
target = pd.read_csv(filename, usecols=[4], header=None, names=["type"])
test = pd.read_csv(filename, header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width", "type"])
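# The file read above is the UCI iris data set: four numeric feature columns
# followed by a string class label and no header row, e.g.
# 5.1,3.5,1.4,0.2,Iris-setosa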
pipe = make_pipeline(
    StandardScaler(),  # preprocessor / transformer (feature scaling)
    KNeighborsClassifier()  # estimator
)
x = data
y = target
# split the iris dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# search for the best value of k
k_range = range(1, 31)
k_error = []
index = 0
min_error = 1
# loop over k = 1 to 30 and record the cross-validation error for each
for k in k_range:
    # 10-fold cross-validation of a k-NN classifier on the training set
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             x_train, np.ravel(y_train), cv=10, scoring='accuracy')
    k_error.append(1 - scores.mean())
    # remember the k with the smallest error seen so far
    if k == 1 or k_error[k - 1] < min_error:
        index = k
        min_error = k_error[k - 1]
# plot k values on the x-axis against the cross-validation error on the y-axis
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()
print("minimum error:", min_error)
print("best k:", index)
# print(type(iris_x[0]))
# print(iris_x[:2])
# print(iris_y[:2])
# print(x_test)
# print(y_test)
# rebuild the pipeline with the best k and fit it on the training set
pipe = Pipeline(steps=[('standardscaler', StandardScaler()),
                       ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=index))])
pipe.fit(x_train, np.ravel(y_train))  # np.ravel flattens the label column
ans = pipe.predict(x_test)
# ans1 = run_KNN(x_test.values, x_train.values, np.ravel(y_train), index)
print(type(y_test))
y_test = y_test.values
for i in range(0, len(ans)):
    print(" ", y_test[i][0], ans[i])
print("scikit-learn KNN model accuracy:", accuracy_score(np.ravel(y_test), ans))  # accuracy
# print("hand-rolled KNN model accuracy:", accuracy_score(np.ravel(y_test), ans1))  # accuracy
# kn=KNeighborsClassifier()
# ra=RandomForestClassifier(random_state=0)
# # print(ra.predict(x_test))
# # print(y_test)
# StandardScaler().fit(x).transform(x)
# ra.fit(x_train,y_train)
# print(ra.predict(x_test))
# print(y_test)
# cross-validation with linear regression
X, y = make_regression(n_samples=1000, random_state=0)
lr = LinearRegression()
result = cross_validate(lr, X, y)  # defaults to 5-fold cross-validation
print(result['test_score'])  # the R^2 scores are high because the dataset is easy
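# Besides 'test_score', cross_validate also returns per-fold timings; these
# keys are part of its documented return value:
# print(result['fit_time'])    # seconds spent fitting each fold
# print(result['score_time'])  # seconds spent scoring each fold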
print("1.查看数据集直方图")
print("2.查看数据集波形图")
print("3.查看数据集特征关系图")
print("4.查看数据集箱形图")
print("0.退出")
while(1):
a=input()
if a=='1':
test.hist(bins=15)#绘制测试集各类花瓣直方图
plt.show()
elif a=='2':
test.plot.area(stacked=False)#波形图
plt.show()
elif a=='3':
sns.pairplot(test,hue="type",height=3)
plt.show()
elif a=='4':
fig,axes=plt.subplots(2,2,figsize=(10,8))
sns.boxplot(y=test["sepal length"],x=test["type"],ax=axes[0,0])
sns.boxplot(y=test["sepal width"],x=test["type"],ax=axes[0,1])
sns.boxplot(y=test["petal length"],x=test["type"],ax=axes[1,0])
sns.boxplot(y=test["petal width"],x=test["type"],ax=axes[1,1])
plt.show()
elif a=='0':
break
# pairwise feature relationship plot
# data.plot(kind="kde")  # KDE plot of each feature
# sns.heatmap(data.corr(), annot=True, cmap="YlGnBu")  # feature correlation heat map
#
# print(data.describe())
# # automatic parameter search
# x, y = fetch_california_housing(return_X_y=True)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# param_distributions = {'n_estimators': randint(1, 5),
#                        'max_depth': randint(5, 10)}
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
#                             n_iter=5,
#                             param_distributions=param_distributions,
#                             random_state=0)
# search.fit(x_train, y_train)
# print(search.best_params_)  # e.g. {'max_depth': 9, 'n_estimators': 4}
# print(search.score(x_test, y_test))
# use KNN to predict the class of the data