def run_KNN(X, X_train, y_train, K):
    """Classify each sample in ``X`` by a majority vote of its nearest neighbours.

    For every query sample the Euclidean distance to every training sample
    is ranked, the labels of the ``K`` closest training samples are counted,
    and the most frequent label is predicted. When several labels share the
    highest count, the one met first while walking the neighbours from
    nearest to farthest wins.

    Args:
        X: Samples to classify, one per row (array-like or DataFrame).
        X_train: Training samples, one per row (array-like or DataFrame).
        y_train: Training labels, aligned with the rows of ``X_train`` by
            position (list, tuple, array, Series or one-column DataFrame).
        K: Number of nearest neighbours that vote.

    Returns:
        A list holding one predicted label per row of ``X``.

    Raises:
        ValueError: If ``X_train`` is empty or ``K`` is not between 1 and
            the number of training samples.
    """
    # Work on plain arrays so that every access below is positional.
    # Iterating a DataFrame yields column names, and indexing a Series with
    # an integer looks up the index *label*, which no longer matches the row
    # position once the data has been shuffled or split.
    train = np.asarray(X_train, dtype=float)
    if train.ndim == 1:
        # A flat vector is read as one single-feature sample per entry.
        train = train[:, np.newaxis]
    queries = np.asarray(X, dtype=float)

    if isinstance(y_train, (list, tuple)):
        # Plain sequences are already positional; keep them so the returned
        # labels stay the caller's own objects.
        labels = y_train
    else:
        labels = np.asarray(y_train)
        if labels.ndim == 2 and labels.shape[1] == 1:
            # A single-column frame / column vector holds one label per row.
            labels = labels[:, 0]

    dataSize = train.shape[0]
    if dataSize == 0:
        raise ValueError("X_train must contain at least one sample")
    if not 1 <= K <= dataSize:
        raise ValueError(f"K must be between 1 and {dataSize}, got {K}")

    y_predict = []
    for x in queries:
        # Broadcasting subtracts the query from every training row at once.
        # The square root is skipped because it does not change the ranking.
        squaredDist = np.sum((train - x) ** 2, axis=1)

        # A stable sort keeps equally distant samples in training order,
        # which makes the prediction deterministic.
        nearIds = np.argsort(squaredDist, kind="stable")[:K]

        # Count how often each label occurs among the K nearest samples.
        classesCount = {}
        for idx in nearIds:
            label = labels[idx]
            classesCount[label] = classesCount.get(label, 0) + 1

        # max() returns the first label that reaches the highest count.
        y_predict.append(max(classesCount, key=classesCount.get))
    return y_predict
pd.read_csv(filename,usecols=[0,1,2,3],header=None,names=["sepal length","sepal width","petal length","petal width"]) +target = pd.read_csv(filename,usecols=[4],header=None,names=["type"]) +test=pd.read_csv(filename,header=None,names=["sepal length","sepal width","petal length","petal width","type"]) + +pipe=make_pipeline( + StandardScaler(),#预处理器/转化器(特征缩放) + KNeighborsClassifier() #估计器 +) + +x = data +y = target +#划分鸢尾花数据集 +x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3) +#求取k值 +k_range = range(1, 31) +k_error = [] +index=0 +min=1 +#循环,取k=1到k=31,查看误差效果 +for k in k_range: + pipe.fit(x_train,np.ravel(y_train)) + Pipeline(steps=[('standardscaler', StandardScaler()),('kneighborsclassfier', KNeighborsClassifier(n_neighbors=k))]) + #10折交叉验证 + scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train, y_train, cv=10, scoring='accuracy') + k_error.append(1 - scores.mean()) + if k==1: + index=k; + elif k>1 and k_error[k-1]