"""KNN classification of the iris dataset.

A hand-rolled brute-force K-nearest-neighbours classifier (run_KNN) plus a
scikit-learn StandardScaler+KNN pipeline, with 10-fold cross-validation used
to sweep k = 1..30 and find the value with the lowest validation error.
"""
import operator
from collections import Counter

import matplotlib.pyplot as plt  # Python visualisation library
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from scipy.stats import randint
from sklearn import datasets, neighbors
from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler


def run_KNN(X, X_train, y_train, K):
    """Predict a class for each sample in X via brute-force K-nearest-neighbours.

    Parameters
    ----------
    X : array-like of shape (n_queries, n_features)
        Samples to classify.
    X_train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : array-like of shape (n_train,)
        Labels of the training samples.
    K : int
        Number of nearest neighbours that vote on the prediction.

    Returns
    -------
    list
        Predicted label for each row of X (majority vote among the K
        nearest training samples, ties broken by first-seen order —
        identical to a stable descending sort on vote counts).
    """
    n_train = X_train.shape[0]
    y_predict = []
    for x in X:
        # Euclidean distance from x to every training sample:
        # broadcast x over the training set, then reduce row-wise.
        diff = np.tile(x, (n_train, 1)) - X_train
        distance = np.sum(diff ** 2, axis=1) ** 0.5
        # Training-sample indices sorted by increasing distance.
        near_ids = distance.argsort()
        # Count the classes of the K closest samples and take the majority.
        votes = Counter(y_train[near_ids[i]] for i in range(K))
        y_predict.append(votes.most_common(1)[0][0])
    return y_predict


sns.set_style("whitegrid")

# Raw string so the Windows-style backslash is explicit (same value as the
# original non-raw 'iris\iris.data', where \i happens not to be an escape).
filename = r'iris\iris.data'
data = pd.read_csv(filename, usecols=[0, 1, 2, 3], header=None,
                   names=["sepal length", "sepal width",
                          "petal length", "petal width"])
target = pd.read_csv(filename, usecols=[4], header=None, names=["type"])
test = pd.read_csv(filename, header=None,
                   names=["sepal length", "sepal width",
                          "petal length", "petal width", "type"])

# Preprocessor/transformer (feature scaling) followed by the KNN estimator.
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

x = data
y = target

# Split the iris dataset: 70% train / 30% test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Sweep k = 1..30 and record the 10-fold cross-validation error for each k.
k_range = range(1, 31)
k_error = []
index = 0          # k with the lowest cross-validation error seen so far
best_error = 1     # its error (renamed from 'min', which shadowed the builtin)

for k in k_range:
    # NOTE(review): this fit of the default pipeline (n_neighbors=5) is
    # repeated unchanged every iteration and its result is never used for
    # the k selection below; kept for fidelity to the original script.
    # (A discarded `Pipeline(steps=[...])` expression that built and
    # immediately dropped a pipeline was removed here as dead code.)
    pipe.fit(x_train, np.ravel(y_train))
    # 10-fold cross-validation accuracy for this k; labels flattened with
    # np.ravel for consistency with the pipe.fit call above.
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             x_train, np.ravel(y_train),
                             cv=10, scoring='accuracy')
    k_error.append(1 - scores.mean())
    # NOTE(review): the source file was truncated in the middle of the
    # `elif` condition below; the branch bodies are reconstructed as
    # "remember the k with the lowest CV error" — confirm against the
    # original file.
    if k == 1:
        index = k
        best_error = k_error[0]
    elif k > 1 and k_error[k - 1] < best_error:
        best_error = k_error[k - 1]
        index = k