|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
|
|
|
from sklearn import datasets
|
|
|
from sklearn.neighbors import KNeighborsClassifier#估计器
|
|
|
from sklearn.ensemble import RandomForestClassifier#估计器
|
|
|
# Pipeline
|
|
|
from sklearn.model_selection import train_test_split#数据训练集、测试集划分
|
|
|
from sklearn.preprocessing import StandardScaler#预处理器、转化器
|
|
|
from sklearn.pipeline import make_pipeline
|
|
|
from sklearn.pipeline import Pipeline#管道
|
|
|
from sklearn.metrics import accuracy_score#准确性计算
|
|
|
# Cross-validation
|
|
|
from sklearn.datasets import make_regression
|
|
|
from sklearn.model_selection import cross_validate
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
# Automatic hyper-parameter search
|
|
|
from sklearn.datasets import fetch_california_housing
|
|
|
from sklearn.model_selection import RandomizedSearchCV
|
|
|
from scipy.stats import randint
|
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
|
|
|
|
import matplotlib.pyplot as plt #python可视化库
|
|
|
import seaborn as sns
|
|
|
from sklearn.model_selection import cross_val_score
|
|
|
from matplotlib.colors import ListedColormap
|
|
|
from sklearn import neighbors
|
|
|
import operator
|
|
|
|
|
|
def run_KNN(X, X_train, y_train, K):
    """Classify each row of ``X`` by majority vote among its ``K`` nearest training rows.

    Distances are Euclidean.  All inputs are converted to NumPy arrays first,
    so NumPy arrays, nested lists and pandas objects are handled alike and
    every lookup below is positional (a pandas object with a shuffled index,
    e.g. the output of ``train_test_split``, would otherwise be indexed by
    label).

    Args:
        X: Samples to classify, one sample per row.
        X_train: Training samples, one sample per row.
        y_train: Training labels; flattened to one label per training row.
        K: Number of nearest neighbours that vote (``1 <= K <= len(X_train)``).

    Returns:
        A list with one predicted label per row of ``X``.  When several
        classes share the highest vote count, the class whose first voter is
        closest to the query wins.

    Raises:
        ValueError: If the number of labels differs from the number of
            training rows, or if ``K`` is outside ``1..len(X_train)``.
    """
    queries = np.asarray(X, dtype=float)
    train = np.asarray(X_train, dtype=float)
    labels = np.asarray(y_train).ravel()

    n_train = train.shape[0]
    if labels.shape[0] != n_train:
        raise ValueError(
            "y_train has %d labels but X_train has %d rows"
            % (labels.shape[0], n_train)
        )
    if not 1 <= K <= n_train:
        raise ValueError(
            "K must be between 1 and %d (number of training rows), got %r"
            % (n_train, K)
        )

    y_predict = []
    for x in queries:
        # Broadcasting subtracts the query from every training row at once.
        distance = np.sqrt(np.sum((x - train) ** 2, axis=1))

        # A stable sort keeps equidistant neighbours in training order, which
        # makes the result deterministic.
        nearest = np.argsort(distance, kind="stable")[:K]

        # Count the votes; the dict preserves the nearest-first insertion order.
        votes = {}
        for label in labels[nearest]:
            votes[label] = votes.get(label, 0) + 1

        # max() returns the first key with the highest count, i.e. ties go to
        # the class that was seen first among the nearest neighbours.
        y_predict.append(max(votes, key=votes.get))

    return y_predict
|
|
|
|
|
|
sns.set_style("whitegrid")

# Forward slashes work on Windows as well as POSIX systems.  The previous
# literal contained a backslash followed by "i", which is not a valid escape
# sequence and only worked as a Windows path.
filename = 'iris/iris.data'

feature_names = ["sepal length", "sepal width", "petal length", "petal width"]

# Parse the file once: `test` holds the complete table (four features plus the
# class label); `data` and `target` are the feature and label views of it.
test = pd.read_csv(filename, header=None, names=feature_names + ["type"])

data = test[feature_names].copy()

# Kept as a single-column DataFrame because later code indexes it in 2-D.
target = test[["type"]].copy()
|
|
|
|
|
|
# Default classification pipeline: standardise the features, then classify
# with k-nearest neighbours using the estimator's default settings.  The step
# names are the ones make_pipeline would generate (lower-cased class names).
pipe = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),  # preprocessor / transformer
        ("kneighborsclassifier", KNeighborsClassifier()),  # estimator
    ]
)
|
|
|
|
|
|
x = data

y = target

# Split the iris data: 70% for training / model selection, 30% held out.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Search for the best k: for k = 1..30, estimate the error of a
# scaler + KNN pipeline with 10-fold cross-validation on the training split.
k_range = range(1, 31)

k_error = []

# y_train is a single-column frame; scikit-learn expects a 1-D label vector.
train_labels = np.ravel(y_train)

for k in k_range:
    # Cross-validate the same preprocessing that the final model uses, so the
    # scaler is refitted inside every fold.
    candidate = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))

    scores = cross_val_score(candidate, x_train, train_labels, cv=10, scoring='accuracy')

    k_error.append(1 - scores.mean())

# Pick the smallest k with the lowest cross-validated error.  argmin considers
# every candidate, including k = 1, and returns the first minimum.
best_position = int(np.argmin(k_error))

index = k_range[best_position]

min_error = k_error[best_position]

# Plot the error (y axis) against k (x axis).
plt.plot(k_range, k_error)

plt.xlabel('Value of K for KNN')

plt.ylabel('Error')

plt.show()

print("最小误差为:",min_error)

print("最小k值为:",index)
|
|
|
|
|
|
|
|
|
|
|
|
# print(type(iris_x[0]))
|
|
|
# print(iris_x[:2])
|
|
|
# print(iris_y[:2])
|
|
|
|
|
|
# print(x_test)
|
|
|
# print(y_test)
|
|
|
# Final model: scaler + KNN with the k selected above (`index`).  It is fitted
# on the training split only, so the accuracy below is measured on samples the
# model has never seen.
knn=Pipeline(steps=[('standardscaler', StandardScaler()),('kneighborsclassfier', KNeighborsClassifier(n_neighbors=index))])

knn.fit(x_train,np.ravel(y_train))  # ravel: single-column frame -> 1-D labels

ans=knn.predict(x_test)

#ans1=run_KNN(test,x_train,y_train,index)

print(type(y_test))

y_test=y_test.values

# Flatten the (n, 1) label array once for printing and scoring.
y_true=np.ravel(y_test)

# Print each true label next to its prediction.
for actual,predicted in zip(y_true,ans):
    print(" ",actual,predicted)

# accuracy_score takes (y_true, y_pred); reuse the predictions computed above.
print("scikit-learn knn模型预估准确度为:",accuracy_score(y_true,ans))
|
|
|
#print("knn模型预估准确度为:",accuracy_score(ans1,y_test))#准确度
|
|
|
# kn=KNeighborsClassifier()
|
|
|
# ra=RandomForestClassifier(random_state=0)
|
|
|
# # print(ra.predict(x_test))
|
|
|
# # print(y_test)
|
|
|
# StandardScaler().fit(iris_x).transform(iris_x)
|
|
|
# ra.fit(x_train,y_train)
|
|
|
# print(ra.predict(x_test))
|
|
|
# print(y_test)
|
|
|
|
|
|
# Cross-validation demo: linear regression on a synthetic regression problem.
X, y = make_regression(random_state=0, n_samples=1000)

lr = LinearRegression()

# No cv/scoring arguments are given, so cross_validate uses its defaults.
result = cross_validate(estimator=lr, X=X, y=y)

# r_squared score is high because dataset is easy
print(result["test_score"])
|
|
|
print("1.查看数据集直方图")

print("2.查看数据集波形图")

print("3.查看数据集特征关系图")

print("4.查看数据集箱形图")

print("0.退出")


def _show_histograms():
    """Show histograms (15 bins) for the `test` table."""
    test.hist(bins=15)
    plt.show()


def _show_area_plot():
    """Show an unstacked area plot of the `test` table."""
    test.plot.area(stacked=False)
    plt.show()


def _show_pairplot():
    """Show pairwise feature relationships, coloured by the "type" column."""
    sns.pairplot(test, hue="type", height=3)
    plt.show()


def _show_boxplots():
    """Show one box plot per feature, grouped by "type", on a 2x2 grid."""
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    features = ["sepal length", "sepal width", "petal length", "petal width"]
    # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
    for ax, feature in zip(axes.flat, features):
        sns.boxplot(y=test[feature], x=test["type"], ax=ax)
    plt.show()


# Menu choice -> plotting action.
_MENU_ACTIONS = {
    '1': _show_histograms,
    '2': _show_area_plot,
    '3': _show_pairplot,
    '4': _show_boxplots,
}

while True:
    try:
        a = input().strip()  # tolerate surrounding whitespace
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave the menu quietly.
        break

    if a == '0':
        break

    action = _MENU_ACTIONS.get(a)
    if action is None:
        # Tell the user instead of silently waiting for the next line.
        print("无效输入,请输入0-4")
        continue

    action()
|
|
|
|
|
|
|
|
|
# Pairwise feature relationship plots
|
|
|
# data.plot(kind="kde")#KDE图
|
|
|
# sns.heatmap(data.corr(),annot=True,cmap="YlGnBu")
|
|
|
#
|
|
|
# print(data.describe())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# #自动参数搜索
|
|
|
# x,y=fetch_california_housing(return_X_y=True);
|
|
|
# x_train,x_test,y_train,y_test=train_test_split(iris_x,iris_y,test_size=0.3)
|
|
|
# param_distributions = {'n_estimators': randint(1, 5),
|
|
|
# 'max_depth': randint(5, 10)}
|
|
|
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
|
|
|
# n_iter=5,
|
|
|
# param_distributions=param_distributions,
|
|
|
# random_state=0)
|
|
|
# search.fit(x_train, y_train)
|
|
|
# RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
|
|
|
# param_distributions={'max_depth': ...,
|
|
|
# 'n_estimators': ...},
|
|
|
# random_state=0)
|
|
|
# search.best_params_
|
|
|
# {'max_depth': 9, 'n_estimators': 4}
|
|
|
# print(search.score(x_test, y_test))
|
|
|
# Predict sample classes with KNN
|
|
|
|
|
|
|