import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier  # estimator
from sklearn.ensemble import RandomForestClassifier  # estimator
# pipelines
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.preprocessing import StandardScaler  # preprocessor / transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  # pipeline
from sklearn.metrics import accuracy_score  # accuracy computation
# cross-validation
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
# automatic parameter search
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt  # Python plotting library
import seaborn as sns
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import neighbors
import operator
def run_KNN(X, X_train, y_train, K):  # X: samples to classify; X_train/y_train: training features/labels; K: number of nearest neighbours
    dataSize = X_train.shape[0]
    y_predict = []
    for x in X:
        diff = np.tile(x, (dataSize, 1)) - X_train  # broadcast x over the training set and subtract
        squaredDist = np.sum(diff**2, axis=1)  # axis=1 sums over each row
        distance = squaredDist ** 0.5
        # sort the distances in ascending order, take the K nearest samples and count each class
        nearIds = distance.argsort()  # indices that would sort the distances
        classesCount = {}
        for i in range(K):
            y = y_train[nearIds[i]]  # class of the i-th nearest sample
            classesCount[y] = classesCount.get(y, 0) + 1  # 0 is the default count
        # print(classesCount)
        # sort the dict by value in descending order:
        # itemgetter(1) sorts on the count, reverse=True makes it descending
        sortClassesCount = sorted(classesCount.items(), key=operator.itemgetter(1), reverse=True)
        y_predict.append(sortClassesCount[0][0])  # predict the most frequent class among the K neighbours
    return y_predict
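# A minimal sketch of calling run_KNN on made-up toy data (the arrays below
# are purely illustrative, not drawn from the iris file):
# toy_train = np.array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0]])
# toy_labels = np.array(['a', 'a', 'b'])
# run_KNN(np.array([[0.05, 0.05]]), toy_train, toy_labels, 2)  # -> ['a']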
sns.set_style("whitegrid")
filename = 'iris/iris.data'  # forward slash avoids backslash-escape issues in the path
data = pd.read_csv(filename, usecols=[0, 1, 2, 3], header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width"])
target = pd.read_csv(filename, usecols=[4], header=None, names=["type"])
test = pd.read_csv(filename, header=None,
                   names=["sepal length", "sepal width", "petal length", "petal width", "type"])
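# The file read above is the UCI iris data set: four numeric feature columns
# followed by a string class label and no header row, e.g.
# 5.1,3.5,1.4,0.2,Iris-setosa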
pipe = make_pipeline(
    StandardScaler(),  # preprocessor / transformer (feature scaling)
    KNeighborsClassifier()  # estimator
)
x = data
y = target
# split the iris dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# search for the best value of k
k_range = range(1, 31)
k_error = []
index = 0
min_error = 1
# loop over k = 1 to 30 and record the cross-validation error for each
for k in k_range:
    # 10-fold cross-validation of a k-NN classifier on the training set
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             x_train, np.ravel(y_train), cv=10, scoring='accuracy')
    k_error.append(1 - scores.mean())
    # remember the k with the smallest error seen so far
    if k == 1 or k_error[k - 1] < min_error:
        index = k
        min_error = k_error[k - 1]
# plot k values on the x-axis against the cross-validation error on the y-axis
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()
print("minimum error:", min_error)
print("best k:", index)
# print(type(iris_x[0]))
# print(iris_x[:2])
# print(iris_y[:2])
# print(x_test)
# print(y_test)
# rebuild the pipeline with the best k and fit it on the training set
pipe = Pipeline(steps=[('standardscaler', StandardScaler()),
                       ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=index))])
pipe.fit(x_train, np.ravel(y_train))  # np.ravel flattens the label column
ans = pipe.predict(x_test)
# ans1 = run_KNN(x_test.values, x_train.values, np.ravel(y_train), index)
print(type(y_test))
y_test = y_test.values
for i in range(0, len(ans)):
    print(" ", y_test[i][0], ans[i])
print("scikit-learn KNN model accuracy:", accuracy_score(np.ravel(y_test), ans))  # accuracy
# print("hand-rolled KNN model accuracy:", accuracy_score(np.ravel(y_test), ans1))  # accuracy
# kn=KNeighborsClassifier()
# ra=RandomForestClassifier(random_state=0)
# # print(ra.predict(x_test))
# # print(y_test)
# StandardScaler().fit(x).transform(x)
# ra.fit(x_train,y_train)
# print(ra.predict(x_test))
# print(y_test)
# cross-validation with linear regression
X, y = make_regression(n_samples=1000, random_state=0)
lr = LinearRegression()
result = cross_validate(lr, X, y)  # defaults to 5-fold cross-validation
print(result['test_score'])  # the R^2 scores are high because the dataset is easy
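# Besides 'test_score', cross_validate also returns per-fold timings; these
# keys are part of its documented return value:
# print(result['fit_time'])    # seconds spent fitting each fold
# print(result['score_time'])  # seconds spent scoring each fold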
print("1.查看数据集直方图")
print("2.查看数据集波形图")
print("3.查看数据集特征关系图")
print("4.查看数据集箱形图")
print("0.退出")
while(1):
a=input()
if a=='1':
test.hist(bins=15)#绘制测试集各类花瓣直方图
plt.show()
elif a=='2':
test.plot.area(stacked=False)#波形图
plt.show()
elif a=='3':
sns.pairplot(test,hue="type",height=3)
plt.show()
elif a=='4':
fig,axes=plt.subplots(2,2,figsize=(10,8))
sns.boxplot(y=test["sepal length"],x=test["type"],ax=axes[0,0])
sns.boxplot(y=test["sepal width"],x=test["type"],ax=axes[0,1])
sns.boxplot(y=test["petal length"],x=test["type"],ax=axes[1,0])
sns.boxplot(y=test["petal width"],x=test["type"],ax=axes[1,1])
plt.show()
elif a=='0':
break
# pairwise feature relationship plot
# data.plot(kind="kde")  # KDE plot of each feature
# sns.heatmap(data.corr(), annot=True, cmap="YlGnBu")  # feature correlation heat map
#
# print(data.describe())
# # automatic parameter search
# x, y = fetch_california_housing(return_X_y=True)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# param_distributions = {'n_estimators': randint(1, 5),
#                        'max_depth': randint(5, 10)}
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
#                             n_iter=5,
#                             param_distributions=param_distributions,
#                             random_state=0)
# search.fit(x_train, y_train)
# print(search.best_params_)  # e.g. {'max_depth': 9, 'n_estimators': 4}
# print(search.score(x_test, y_test))
# use KNN to predict the class of the data