parent
e216e8d960
commit
2aab49daa0
Binary file not shown.
@ -0,0 +1,44 @@
|
||||
# 代码7-6
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import scale, MaxAbsScaler
|
||||
from sklearn.cluster import KMeans
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
# Load the modelling table; the first CSV column becomes the row index.
mode_data1 = pd.read_csv('../tmp/mode_data1.csv', index_col=0)

# Standardise every feature column before clustering.
scale_data = scale(mode_data1)

# Fit K-Means with five clusters; the seed is fixed so runs are repeatable.
kmeans_model = KMeans(n_clusters=5, random_state=1234)
result = kmeans_model.fit(scale_data)

# Cluster label assigned to each input row.
label = result.labels_

# Cluster centres as a table: one row per cluster, one named column per feature.
feature_names = ['新闻动态', '教学资源', '项目与合作', '竞赛', '优秀作品']
center = pd.DataFrame(data=result.cluster_centers_, columns=feature_names)

# Default font size for the figures drawn below.
plt.rcParams['font.size'] = 10
|
||||
# 自定义画雷达图函数
|
||||
def plot(model_center=None, label=None):
    """Draw a radar (polar) chart with one closed polyline per cluster centre.

    Parameters
    ----------
    model_center : array-like, shape (n_clusters, n_features)
        Cluster centres. Row ``i`` is drawn and listed in the legend as
        "用户群{i + 1}". Any number of rows is accepted; line styles repeat
        once the five predefined ones are used up.
    label : sequence of str
        Feature names shown as angular tick labels. Its length sets the
        number of spokes and must match the number of columns in
        ``model_center``.

    Raises
    ------
    TypeError
        If ``model_center`` or ``label`` is not supplied.
    """
    if model_center is None or label is None:
        raise TypeError('plot() requires both model_center and label')

    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    plt.rcParams['font.sans-serif'] = 'SimHei'  # font that can render Chinese text

    centers = np.asarray(model_center)
    n = len(label)  # number of features, i.e. spokes

    # Evenly spaced spoke angles; the first angle is appended again so that
    # each polyline closes on itself.
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    closed_angles = np.concatenate((angles, angles[:1]))

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1, polar=True)
    ax.set_yticklabels([])  # hide radial tick labels
    ax.set_thetagrids(angles * 180 / np.pi, label)  # spoke labels, in degrees
    ax.grid(True)

    sam = ['b-.', 'k-', 'o--', ':', 'p:']  # line styles, reused cyclically
    labels = []
    for i, row in enumerate(centers):
        # Repeat the first value so the outline returns to its start point.
        values = np.concatenate((row, row[:1]))
        ax.plot(closed_angles, values, sam[i % len(sam)])
        labels.append('用户群' + str(i + 1))

    ax.legend(labels, bbox_to_anchor=(0.85, 0.85), loc=3)
|
||||
|
||||
# Standardise the cluster centres once more and draw them as a radar chart.
standardised_centers = scale(result.cluster_centers_)
plot(model_center=standardised_centers, label=center.columns)
# To export the figure, enable: plt.savefig('12.png', dpi=1080)
|
@ -0,0 +1,122 @@
|
||||
|
||||
|
||||
import matplotlib.pyplot as plt

# Number of rows whose userid is missing (a) and present (b).
a = int(tipdm_data['userid'].isnull().sum())
b = len(tipdm_data) - a

plt.rcParams['font.sans-serif'] = 'SimHei'  # font that can render Chinese text
plt.figure(figsize=(4, 4))  # square canvas so the pie is drawn as a circle
label = ['空值', '非空值']  # wedge labels
explode = [0, 0]  # radial offset of each wedge (none)
values = [a, b]
plt.pie(values, explode=explode, labels=label, autopct='%1.1f%%')
plt.title('userid空值与非空值占比')
# Save before show(): a blocking show() closes the figure, so saving
# afterwards would write an empty image.
plt.savefig('./饼图')
plt.show()
|
||||
#############################
|
||||
|
||||
|
||||
|
||||
# Number of rows whose uniqueVisitorId is missing (a) and present (b).
a = int(tipdm_data['uniqueVisitorId'].isnull().sum())
b = len(tipdm_data) - a

plt.figure(figsize=(4, 4))  # square canvas so the pie is drawn as a circle
label = ['空值', '非空值']  # wedge labels
explode = [0, 0]  # radial offset of each wedge (none)
values = [a, b]
plt.pie(values, explode=explode, labels=label, autopct='%1.1f%%')
plt.title('uniqueVisitorId空值与非空值占比')
# Save before show(): a blocking show() closes the figure, so saving
# afterwards would write an empty image.
plt.savefig('./饼图2')
plt.show()
|
||||
|
||||
#############
|
||||
|
||||
|
||||
# Number of click records per user (reallID).
a = pd.DataFrame(con_data['reallID'].value_counts())
a.columns = ['count']

# Number of users for each click count. value_counts() orders this by
# frequency, not by click count.
b = pd.DataFrame(a['count'].value_counts())

# Re-order by click count so that bar position i really is "i + 1 clicks";
# click counts in 1..34 that no user has are filled with 0.
users_per_clicks = b['count'].sort_index()
dianji = [int(v) for v in users_per_clicks.reindex(range(1, 35), fill_value=0)]

# All users with 35 or more clicks are merged into one final bar.
super_35 = [int(v) for v in users_per_clicks[users_per_clicks.index >= 35]]
sum_35 = sum(super_35)
dianji.append(sum_35)

tick_label = ["1", "5", "10", "15", "20", '25', '30', '35']

# Open a fresh figure so the bars are never drawn onto an earlier chart.
plt.figure()
plt.bar(range(len(dianji)), dianji)
plt.xticks([0, 4, 9, 14, 19, 24, 29, 34], tick_label)
plt.title('用户点击网页柱形图')
plt.xlabel('点击网页数(次)')
plt.ylabel('用户数(人)')
# Save before show(): a blocking show() closes the figure, so saving
# afterwards would write an empty image.
plt.savefig('./饼图3')
plt.show()
|
||||
#######################################
|
||||
|
||||
|
||||
# Inspect page-path frequencies (the value is discarded when run as a script).
mode_data['page_path'].value_counts()

# IDs of users that have exactly one click record.
has_single_click = a['count'] == 1
ind_1 = a[has_single_click].index

# One boolean per click record: does it belong to a single-click user?
ind_1_1 = [user_id in ind_1 for user_id in con_data['reallID']]

# How often each page path was visited by those single-click users.
single_click_pages = con_data[ind_1_1]['page_path']
c = pd.DataFrame(single_click_pages.value_counts())
|
||||
|
||||
#########################################################
|
||||
# Work on a copy of the click log in which every missing value is the string '1'.
data = con_data.fillna('1')

# Rows whose date_time does not start with '2021' take their value from the
# sessionid column instead; the column is then parsed as datetimes.
indx = [i[:4] != '2021' for i in data['date_time']]
data.loc[indx, 'date_time'] = data.loc[indx, 'sessionid']
data['date_time'] = pd.to_datetime(data['date_time'])

# o[k - 3] is the mean gap in seconds between consecutive clicks of users
# with exactly k click records, for k = 3..34.
o = []
for k in range(3, 35):
    li = reallid_count[reallid_count['count'] == k]['reallID'].to_list()
    visits = data.loc[data['reallID'].isin(li), ['reallID', 'date_time']]
    if visits.empty:
        # No user with exactly k clicks: leave a gap instead of dividing by zero.
        o.append(float('nan'))
        continue
    # Gap between each click and the same user's previous click, in row order.
    # NOTE(review): assumes each user's rows are in chronological order - confirm.
    gaps = visits.groupby('reallID', sort=False)['date_time'].diff()
    # A user's first click has no previous click (NaT) and is skipped by mean(),
    # so a user with k clicks contributes k - 1 intervals.
    o.append(gaps.dt.total_seconds().mean())

y = o
# Open a fresh figure so the line is never drawn onto an earlier chart.
plt.figure()
plt.plot(range(3, 35), y)
plt.title('平均点击间隔')
plt.xlabel('点击网页次数(次)')
plt.ylabel('平均点击间隔(秒)')
plt.xticks(range(3, 35))
plt.savefig('./饼图4')
|
||||
#############################################################
|
||||
|
||||
# Inspect how many click records user 109664.0 has (the value is discarded
# when run as a script).
data.loc[data['reallID'] == 109664.0, 'date_time'].shape
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue