# Handle the rows whose label value is missing, and drop the features that are irrelevant or unimportant to the prediction

#

# #### 2.1 Handling samples with missing labels
#

#

# In[4]:


# Keep the rows whose label is missing
data_no = data[pd.isnull(data['shot_made_flag'])]
print(data_no.shape)


# In[4]:


# Keep the rows whose label is not missing
data = data[pd.notnull(data['shot_made_flag'])]
print(data.shape)


#

# Comparing the two side by side, the shot coordinates (loc_x, loc_y) and the longitude/latitude features carry essentially the same information, so keeping one of the two groups is enough.

# In[5]:


# Set the figure size
plt.figure(figsize=(10, 10))
# Minutes and seconds are redundant features

# The raw data contains both minutes_remaining and seconds_remaining, so the two can be merged into a single feature.

# In[7]:
data['remain_time'] = data['minutes_remaining']*60 + data['seconds_remaining']
data['remain_time'][:5]


#

# Plotting shows that the shot-zone features trace out roughly the same shape as the coordinates, so we choose the more precise coordinates to represent the shot location.

# In[8]:
import matplotlib.cm as cm
plt.figure(figsize=(20, 10))


# - opponent: the opposing team
# - remain_time: time remaining, in seconds

# In[9]:


# Drop irrelevant features such as the game id and shot id
drops = ['combined_shot_type', 'shot_id', 'team_id', 'team_name', 'shot_zone_area',
         'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon', 'lat',
         'seconds_remaining', 'minutes_remaining', 'shot_distance',
         'game_event_id', 'game_id', 'game_date', 'season']
# #### 3.1 Converting text features to numeric features
# Machine-learning models can only work with numeric data, so the text features in the dataset must be converted to numeric ones; here we use one-hot encoding. Compared with label encoding, one-hot encoding avoids imposing an artificial ordering on the categories, which could otherwise bias training. One-hot encoding turns each category into a binary vector with one bit per category: for the categories male and female, the encodings are 10 and 01. Each category occupies its own bit, which makes the data sparser. A minimal sketch follows.
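# A minimal sketch of one-hot encoding with pandas on a toy column (the column
# name and values here are illustrative, not from the notebook's data):

# In[ ]:


import pandas as pd

toy = pd.DataFrame({'gender': ['male', 'female', 'male']})
print(pd.get_dummies(toy['gender'], prefix='gender'))
# One indicator column per category (gender_female, gender_male);
# exactly one bit is set in each row.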
# In[10]:
a = ['action_type', 'shot_type', 'opponent']  # the remaining text (categorical) columns to encode
for i in a:
# #### 3.2 Saving the processed data

# In[10]:
data.to_csv("./data_processed.csv", encoding="utf-8-sig", mode="w", header=True, index=False)
data = pd.read_csv("data_processed.csv")
data_label = data['shot_made_flag']
# In[11]:


# Read the processed data back in
data = pd.read_csv("data_processed.csv")
# Show basic information about the data


# #### 3.3 Separating features and labels
# Separating the features from the labels makes the data easier to handle when training the models later.

# In[12]:
data_feature = data.drop('shot_made_flag', axis=1)
data_label = data['shot_made_flag']
data_label.shape
# In[13]:


data_label = np.array(data_label)
data_label.shape
# #### 3.4 Standardizing the data

# The features differ greatly in scale, so we standardize them first, rescaling each feature to zero mean and unit variance: z = (x - mean) / std. We use sklearn's preprocessing tools for this.

#

# fit_transform(X, y=None, **fit_params): fits the transformer to X (optionally using fit_params) and returns the transformed X in a single step; a small sketch follows.

#
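# A minimal sketch of fit_transform on a toy array (not the notebook's data),
# showing that it is equivalent to fit followed by transform:

# In[ ]:


import numpy as np
from sklearn.preprocessing import StandardScaler

toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
demo_scaler = StandardScaler()
z = demo_scaler.fit_transform(toy)        # learn mean/std and transform in one step
z2 = demo_scaler.fit(toy).transform(toy)  # the equivalent two-step form
print(np.allclose(z, z2))                 # True
print(z.mean(axis=0), z.std(axis=0))      # ~[0. 0.] and [1. 1.]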
# In[14]:
#### Standardize data_feature here
# In[15]:
#### Answer to the cell above
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_feature = scaler.fit_transform(data_feature)
# In[16]:
data_feature = pd.DataFrame(data_feature)
data_feature.head()
# In[17]:
data_feature.to_csv("./data_feature_standard.csv", encoding="utf-8-sig", mode="w", header=True, index=False)
data_feature = pd.read_csv("data_feature_standard.csv")
# In[18]:
#### Load the data and labels
knn_data = data_feature
knn_label = data_label
# ### 4 Building the training (train) and test (test) sets

#

# To train and test the model, we need to split the original data into a training set and a test set; we use the sklearn function shown below to do the split:
# - random_state: the random seed. Passing the same integer (with the other parameters unchanged) reproduces exactly the same split every time, which is what you want for repeatable experiments; omitting it (leaving it as None) gives a different split on each run.

#

# - stratify preserves the class distribution from before the split. Suppose there are 100 samples, 80 of class A and 20 of class B. With train_test_split(..., test_size=0.25, stratify=y_all), the split comes out as: training, 75 samples, of which 60 are class A and 15 are class B; testing, 25 samples, of which 20 are class A and 5 are class B. With stratify, the A:B ratio is 4:1 in both the training and test sets, matching the pre-split ratio (80:20). stratify is typically used when the class distribution is imbalanced: passing stratify=y splits in proportion to the class frequencies in y. A small sketch follows.
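# A small sketch reproducing the 80/20 example above with synthetic labels
# (the variable names here are illustrative):

# In[ ]:


import numpy as np
from sklearn.model_selection import train_test_split

y_all = np.array([0] * 80 + [1] * 20)  # 80 samples of class A (0), 20 of class B (1)
X_all = np.arange(100).reshape(-1, 1)  # dummy features
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y_all, test_size=0.25, stratify=y_all, random_state=1)
print(np.bincount(y_tr))  # [60 15] -> A:B stays 4:1 in the training set
print(np.bincount(y_te))  # [20  5] -> and in the test set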
# In[19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_label, random_state=2020, test_size=0.25)
# ### 5 Building and training the model

#

# We use sklearn's KNeighborsClassifier to implement the KNN model; it is called as follows:
#

# - fit(X, y): trains the model on the training data X and the labels y.
# - score(X, y): returns the mean accuracy on the given test data and labels.
#
# For intuition, a minimal from-scratch sketch of what predict does is shown below.
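# A minimal from-scratch sketch of KNN prediction for a single query point
# (illustrative only; the notebook itself uses sklearn's implementation):

# In[ ]:


import numpy as np

def knn_predict_one(X_tr, y_tr, x, k=3):
    """Majority vote among the k training points closest to x (Euclidean distance)."""
    dists = np.linalg.norm(X_tr - x, axis=1)  # distance from x to every training point
    nearest = np.argsort(dists)[:k]           # indices of the k closest points
    return np.bincount(y_tr[nearest].astype(int)).argmax()  # most common label wins

X_demo = np.array([[0.0, 0.0], [0.1, 0.1], [1.0, 1.0]])
y_demo = np.array([0, 0, 1])
print(knn_predict_one(X_demo, y_demo, np.array([0.05, 0.0])))  # -> 0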
# In[20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# In[21]:
score = knn.score(X_train, y_train)
print('Training-set accuracy: {:.3}%'.format(score * 100))
score = knn.score(X_test, y_test)
# - fit_params: parameters to pass to the estimator's fit method
# - pre_dispatch: controls the number of jobs dispatched during parallel execution. Reducing it helps avoid the blow-up in memory consumption that occurs when more jobs are dispatched than the CPUs can process. The parameter can be:

#
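# A minimal sketch of how cross_val_score is typically called for a single k
# (cv=10 here is an illustrative fold count, not necessarily the notebook's):

# In[ ]:


from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

knn_k = KNeighborsClassifier(n_neighbors=5)  # illustrative k
scores = cross_val_score(knn_k, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean())  # mean accuracy across the 10 folds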
# In[22]:


# This takes a while to run; please be patient
# Cross-validation
from sklearn.model_selection import cross_val_score
# The plot above shows that accuracy is higher for k between 17 and 21, so we narrow the parameter range to [17, 23] and tune further.
# In[23]:


# Refine the search
k_range = range(17, 24, 2)  # k = 17, 19, 21, 23, matching the stated range [17, 23]
cv_scores = []
# The entry at row 0, column 0 counts the samples whose y_true value is 0 and whose y_pred value is also 0.
#
# The entry at row 0, column 1 counts the samples whose y_true value is 0 but whose y_pred value is 1. A tiny worked example is shown below.
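# A tiny worked example of the layout described above (toy labels, not the
# notebook's data):

# In[ ]:


from sklearn.metrics import confusion_matrix

demo_true = [0, 0, 0, 1, 1]
demo_pred = [0, 0, 1, 1, 0]
print(confusion_matrix(demo_true, demo_pred))
# [[2 1]   row 0: of the true-0 samples, 2 were predicted 0 and 1 was predicted 1
#  [1 1]]  row 1: of the true-1 samples, 1 was predicted 0 and 1 was predicted 1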
# In[24]:


# Test-set evaluation
knn = KNeighborsClassifier(n_neighbors=19)
knn.fit(X_train, y_train)
print('Training-set accuracy: {:.3}%'.format(score * 100))
score = knn.score(X_test, y_test)
print('Test-set accuracy: {:.3}%'.format(score * 100))
# In[25]:


# ROC curve
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
plt.plot([0, 1], [0, 1], 'r--', lw=2)  # chance diagonal (c='r' overrode 'k--' anyway, so this is the same red dashed line)
plt.title('KNN ROC curve')
plt.show()
# In[26]:


# AUC
knn_auc = auc(knn_fpr, knn_tpr)
knn_auc
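# Note: roc_auc_score (imported above) computes the same value directly from the
# labels and scores, e.g. roc_auc_score(y_test, y_score) -- assuming y_score holds
# the positive-class scores used to build knn_fpr and knn_tpr above.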
# Since the traditional machine-learning approach achieves only limited accuracy, we also try training a neural network and see how its results compare.

# ### 6 Neural network
# In[27]:
cnn_data = np.asarray(data_feature)
cnn_label = np.asarray(data_label)
cnn_data.shape
y_train = cnn_label[:20000]
x_test = cnn_data[20000:]
y_test = cnn_label[20000:]
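# The model-building cell below continues past this excerpt; for orientation,
# here is a minimal sketch of a binary-classification MLP of the kind this
# section trains (the layer sizes are hypothetical, not the notebook's):

# In[ ]:


from keras import models, layers

sketch = models.Sequential()
sketch.add(layers.Dense(32, activation='relu', input_shape=(x_train.shape[1],)))
sketch.add(layers.Dense(32, activation='relu'))
sketch.add(layers.Dense(1, activation='sigmoid'))  # one hit probability per shot
sketch.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
sketch.summary()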
# In[28]:
from keras import models
from keras import layers
model = models.Sequential()
# #### 6.1 Model tuning

# We plot the training and validation loss at different epoch counts, and the training and test accuracy at different epoch counts, to see how the number of training epochs affects the model.
# In[29]:


## Loss values
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# We sweep over different sizes for the input and hidden layers as well as different epoch counts, and store the best parameter combination in the best_param list.
# In[30]:


# nums1 = [16,32,64]
# nums2 = [16,32,64]
# epochs = list(range(7, 31))
best_param.append(num2)
best_param.append(epoch)
# In[31]:
print('Best accuracy:', best_result)
print('Best parameters:', best_param)
model = models.Sequential()
# #### 6.2 Model evaluation

# The model is evaluated in the same way as above.
# In[11]:


# Plot the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(x_test)
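# model.predict returns sigmoid probabilities, so they must be thresholded into
# 0/1 class labels before confusion_matrix can be applied, e.g.
# (y_pred > 0.5).astype(int), assuming the model ends in a single sigmoid unit.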