You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
6.6 KiB

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier#估计器
from sklearn.ensemble import RandomForestClassifier#估计器
#管道
from sklearn.model_selection import train_test_split#数据训练集、测试集划分
from sklearn.preprocessing import StandardScaler#预处理器、转化器
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline#管道
from sklearn.metrics import accuracy_score#准确性计算
#交叉验证
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
#自动参数搜索
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt #python可视化库
import seaborn as sns
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import neighbors
import operator
def run_KNN(X, X_train, y_train, K):
    """Classify each row of X by majority vote among its K nearest training rows.

    Distances are Euclidean. When several classes share the highest vote
    count, the class whose first vote came from the closest neighbour wins.

    Args:
        X: Samples to classify, one per row. Any array-like that numpy can
            convert to a numeric 2-D array (ndarray, nested list, DataFrame).
        X_train: Training samples, one per row, with the same columns as X.
        y_train: Training labels, one per training row. A single-column
            DataFrame or an (n, 1) array is flattened to 1-D.
        K: Number of nearest neighbours that vote; 1 <= K <= len(X_train).

    Returns:
        A list with one predicted label per row of X.

    Raises:
        ValueError: If K is outside 1..len(X_train), or if y_train does not
            supply exactly one label per training row.
    """
    # Convert up front: iterating a DataFrame yields column names, not rows,
    # and integer-indexing one performs a column lookup.
    queries = np.asarray(X, dtype=float)
    train = np.asarray(X_train, dtype=float)
    labels = np.asarray(y_train).ravel()

    n_train = train.shape[0]
    if labels.shape[0] != n_train:
        raise ValueError(
            "y_train must hold one label per training row "
            f"(got {labels.shape[0]} labels for {n_train} rows)"
        )
    if not 1 <= K <= n_train:
        raise ValueError(
            f"K must be between 1 and the number of training rows ({n_train}), got {K}"
        )

    y_predict = []
    for x in queries:
        # Broadcasting subtracts x from every training row; no np.tile needed.
        distance = np.sqrt(np.sum((train - x) ** 2, axis=1))
        # Stable sort keeps equidistant neighbours in training order.
        nearest = distance.argsort(kind="stable")[:K]

        # Tally votes in nearest-first order; dicts preserve insertion order,
        # so max() returns the earliest-seen class among tied counts.
        votes = {}
        for label in labels[nearest]:
            votes[label] = votes.get(label, 0) + 1
        y_predict.append(max(votes, key=votes.get))
    return y_predict
# Global seaborn theme for every figure drawn later in the script.
sns.set_style("whitegrid")

# A forward slash works on both Windows and POSIX. The previous 'iris\iris.data'
# relied on the invalid escape sequence '\i' and was Windows-only.
filename = 'iris/iris.data'
feature_names = ["sepal length", "sepal width", "petal length", "petal width"]

# Parse the CSV once and derive the feature/label views from it instead of
# reading the same file three times.
test = pd.read_csv(filename, header=None, names=feature_names + ["type"])
data = test[feature_names]   # the four numeric feature columns
target = test[["type"]]      # single-column DataFrame holding the class label

# Scaling followed by a KNN classifier with scikit-learn's default settings.
pipe = make_pipeline(
    StandardScaler(),        # transformer: feature scaling
    KNeighborsClassifier(),  # estimator
)

x = data
y = target
# Hold out 30% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Choose the neighbour count k by 10-fold cross-validation on the training split.
k_range = range(1, 31)  # candidate values k = 1 .. 30
k_error = []            # mean cross-validation error, one entry per candidate k

# The labels are a single-column DataFrame; flatten them once to 1-D.
y_train_flat = np.ravel(y_train)

# Leave `pipe` fitted on the training split as before, but fit it once
# instead of on every loop iteration (the fit does not depend on k).
pipe.fit(x_train, y_train_flat)

for k in k_range:
    # Score the same scaler + KNN combination that the final model uses, so
    # the selected k matches how the classifier is actually applied.
    candidate = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    scores = cross_val_score(candidate, x_train, y_train_flat, cv=10, scoring='accuracy')
    k_error.append(1 - scores.mean())

# Smallest error over all candidates, k = 1 included; argmin returns the
# first (smallest) k on ties. The old hand-rolled tracking never compared
# the k = 1 error and shadowed the builtin `min`.
best_pos = int(np.argmin(k_error))
index = k_range[best_pos]
min_error = k_error[best_pos]

# Plot error against k.
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()
print("最小误差为:", min_error)
print("最小k值为", index)
# print(type(iris_x[0]))
# print(iris_x[:2])
# print(iris_y[:2])
# print(x_test)
# print(y_test)
# Final model: scaling + KNN using the neighbour count selected above (`index`).
knn = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('kneighborsclassfier', KNeighborsClassifier(n_neighbors=index)),
])
# Fit on the training split only. The previous code fitted the default-k
# pipeline on the test split and then scored it on that same data, which
# inflates the accuracy and ignores the tuned k.
knn.fit(x_train, np.ravel(y_train))  # ravel: labels are a single-column DataFrame
ans = knn.predict(x_test)

# Optional cross-check against the hand-written run_KNN (disabled in the original):
#   ans1 = run_KNN(x_test, x_train, y_train, index)
#   accuracy_score(np.ravel(y_test), ans1)

print(type(y_test))
y_test = y_test.values  # (n, 1) array of true labels
# Print each true label next to its prediction.
for actual, predicted in zip(y_test[:, 0], ans):
    print(" ", actual, predicted)
# accuracy_score takes (y_true, y_pred).
print("scikit-learn knn模型预估准确度为", accuracy_score(np.ravel(y_test), ans))
# kn=KNeighborsClassifier()
# ra=RandomForestClassifier(random_state=0)
# # print(ra.predict(x_test))
# # print(y_test)
# StandardScaler().fit(iris_x).transform(iris_x)
# ra.fit(x_train,y_train)
# print(ra.predict(x_test))
# print(y_test)
# Cross-validated linear regression on a synthetic regression problem.
# Note: this rebinds the module-level name `y` used earlier for the iris labels.
X, y = make_regression(
    n_samples=1000,
    random_state=0,
)
lr = LinearRegression()
# cross_validate is called with its default cv and scoring settings.
result = cross_validate(estimator=lr, X=X, y=y)
# r_squared score is high because the dataset is easy.
print(result["test_score"])
# Interactive menu: print the options once, then read one choice per line
# from stdin and show the matching figure until the user enters '0'.
for menu_line in (
    "1.查看数据集直方图",
    "2.查看数据集波形图",
    "3.查看数据集特征关系图",
    "4.查看数据集箱形图",
    "0.退出",
):
    print(menu_line)


def _draw_histograms():
    """Histogram of every numeric column, 15 bins each."""
    test.hist(bins=15)


def _draw_area():
    """Unstacked area plot of the feature columns."""
    test.plot.area(stacked=False)


def _draw_pairplot():
    """Pairwise feature relationships, coloured by class."""
    sns.pairplot(test, hue="type", height=3)


def _draw_boxplots():
    """One box plot per feature, grouped by class, on a 2x2 grid."""
    _, grid = plt.subplots(2, 2, figsize=(10, 8))
    columns = ("sepal length", "sepal width", "petal length", "petal width")
    # grid.flat walks the axes row by row: [0,0], [0,1], [1,0], [1,1].
    for column, axis in zip(columns, grid.flat):
        sns.boxplot(y=test[column], x=test["type"], ax=axis)


_PLOTTERS = {
    '1': _draw_histograms,
    '2': _draw_area,
    '3': _draw_pairplot,
    '4': _draw_boxplots,
}

while True:
    a = input()
    if a == '0':
        break
    draw = _PLOTTERS.get(a)
    # Unrecognised input is ignored and the loop waits for the next line.
    if draw is not None:
        draw()
        plt.show()
#特征两两关系图
# data.plot(kind="kde")#KDE图
# sns.heatmap(data.corr(),annot=True,cmap="YlGnBu")
#
# print(data.describe())
# #自动参数搜索
# x,y=fetch_california_housing(return_X_y=True);
# x_train,x_test,y_train,y_test=train_test_split(iris_x,iris_y,test_size=0.3)
# param_distributions = {'n_estimators': randint(1, 5),
# 'max_depth': randint(5, 10)}
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
# n_iter=5,
# param_distributions=param_distributions,
# random_state=0)
# search.fit(x_train, y_train)
# RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
# param_distributions={'max_depth': ...,
# 'n_estimators': ...},
# random_state=0)
# search.best_params_
# {'max_depth': 9, 'n_estimators': 4}
# print(search.score(x_test, y_test))
# 使用KNN预测数据类别