|
|
import pandas
|
|
|
import requests #发送请求
|
|
|
import pandas as pd
|
|
|
import numpy as np#导入matplotlib库,绘制图像
|
|
|
import matplotlib.pyplot as plt#读取数据集到phone_data中
|
|
|
import seaborn as sns
|
|
|
import jieba
|
|
|
from collections import Counter
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
|
url_dict = {
|
|
|
'全站': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all',
|
|
|
'音乐': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=3&type=all',
|
|
|
'舞蹈': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=129&type=all',
|
|
|
'生活': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=160&type=all',
|
|
|
'美食': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=211&type=all',
|
|
|
}
|
|
|
|
|
|
headers = {
|
|
|
"upgrade-insecure-requests": "1",
|
|
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
|
|
|
"Referer": "https://www.bilibili.com/v/popular/rank/all/",
|
|
|
}
|
|
|
|
|
|
for i in url_dict.items():
|
|
|
url = i[1] # url地址
|
|
|
tab_name = i[0] # tab页名称
|
|
|
title_list = []
|
|
|
play_cnt_list = [] # 播放数
|
|
|
danmu_cnt_list = [] # 弹幕数
|
|
|
coin_cnt_list = [] # 投币数
|
|
|
like_cnt_list = [] # 点赞数
|
|
|
share_cnt_list = [] # 分享数
|
|
|
favorite_cnt_list = [] # 收藏数
|
|
|
author_list = []
|
|
|
video_url = []
|
|
|
try:
|
|
|
r = requests.get(url, headers=headers)
|
|
|
print(r.status_code)#获取目标数据所在数据并转成字典类型
|
|
|
json_data = r.json()
|
|
|
list_data = json_data['data']['list']
|
|
|
|
|
|
for data in list_data:
|
|
|
title_list.append(data['title'])
|
|
|
play_cnt_list.append(data['stat']['view'])
|
|
|
danmu_cnt_list.append(data['stat']['danmaku'])
|
|
|
coin_cnt_list.append(data['stat']['coin'])
|
|
|
like_cnt_list.append(data['stat']['like'])
|
|
|
# dislike_cnt_list.append(data['stat']['dislike'])
|
|
|
share_cnt_list.append(data['stat']['share'])
|
|
|
favorite_cnt_list.append(data['stat']['favorite'])
|
|
|
author_list.append(data['owner']['name'])
|
|
|
# score_list.append(data['score'])
|
|
|
video_url.append('https://www.bilibili.com/video/' + data['bvid'])
|
|
|
# print('*' * 10)
|
|
|
except Exception as e:
|
|
|
print("爬取失败:{}".format(str(e)))
|
|
|
#创建dataframe保存数据
|
|
|
df = pd.DataFrame(
|
|
|
{'视频标题': title_list,
|
|
|
'视频地址': video_url,
|
|
|
'作者': author_list,
|
|
|
'播放数': play_cnt_list,
|
|
|
'弹幕数': danmu_cnt_list,
|
|
|
'投币数': coin_cnt_list,
|
|
|
'点赞数': like_cnt_list,
|
|
|
'分享数': share_cnt_list,
|
|
|
'收藏数': favorite_cnt_list,
|
|
|
})
|
|
|
#print(df.head())
|
|
|
#将数据保存到本地
|
|
|
df.to_csv('B站TOP100-{}.csv'.format(tab_name), index=False,encoding='utf_8_sig') # utf_8_sig修复乱码问题
|
|
|
print('写入成功: ' + 'B站TOP100-{}.csv'.format(tab_name))
|
|
|
data_meishi=pd.read_csv('B站TOP100-美食.csv')
|
|
|
data_quanzhan=pd.read_csv('B站TOP100-全站.csv')
|
|
|
data_shenghuo=pd.read_csv('B站TOP100-生活.csv')
|
|
|
data_wudao=pd.read_csv('B站TOP100-舞蹈.csv')
|
|
|
data_yinyue=pd.read_csv('B站TOP100-音乐.csv')
|
|
|
|
|
|
plt.rcParams['font.family'] = 'SimHei'
|
|
|
|
|
|
#弹幕数与播放数的散点图
|
|
|
plt.figure(figsize=(8, 6)) # 调整图像大小:确认图像没有被显示在可视区域外,尝试设置图像的大小利用plt.figure(figsize=(,))
|
|
|
sns.scatterplot(x='弹幕数', y='播放数', data=data_quanzhan)
|
|
|
plt.show() # 调用 plt.show()显示结果
|
|
|
|
|
|
numeric_data = data_quanzhan.select_dtypes(include=['number'])
|
|
|
cor = numeric_data.corr(method='spearman')
|
|
|
|
|
|
sns.heatmap(cor,
|
|
|
annot=True, # 显示相关系数的数据
|
|
|
center=0.5, # 居中
|
|
|
fmt='.2f', # 只显示两位小数
|
|
|
linewidth=0.5, # 设置每个单元格的距离
|
|
|
linecolor='blue', # 设置间距线的颜色
|
|
|
vmin=0, vmax=1, # 设置数值最小值和最大值
|
|
|
xticklabels=True, yticklabels=True, # 显示x轴和y轴
|
|
|
square=True, # 每个方格都是正方形
|
|
|
cbar=True, # 绘制颜色条
|
|
|
cmap='coolwarm_r', # 设置热力图颜色
|
|
|
)
|
|
|
plt.show() #显示图片
|
|
|
|
|
|
mean_list = [data_quanzhan['播放数'].mean(),data_wudao['播放数'].mean(),data_shenghuo['播放数'].mean(),data_meishi['播放数'].mean(),data_yinyue['播放数'].mean()]
|
|
|
name_list=['全站','舞蹈','生活','美食','音乐']
|
|
|
plt.bar(name_list,mean_list)
|
|
|
plt.ylabel('播放数')
|
|
|
plt.show()
|
|
|
|
|
|
var_list = [data_quanzhan['播放数'].var(),data_wudao['播放数'].var(),data_shenghuo['播放数'].var(),data_meishi['播放数'].var(),data_yinyue['播放数'].var()]
|
|
|
plt.bar(name_list,var_list)
|
|
|
plt.ylabel('播放数方差')
|
|
|
|
|
|
# 文本预处理
|
|
|
data_quanzhan['视频标题'] = data_quanzhan['视频标题'].apply(lambda x: ' '.join(jieba.lcut(x))) # 分词
|
|
|
# 读取停用词
|
|
|
with open('./停用词表.txt', 'r', encoding='utf-8') as f:
|
|
|
stop_words = f.read().split('\n')
|
|
|
data_quanzhan['视频标题'] = data_quanzhan['视频标题'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) # 去除停用词
|
|
|
# 统计单词出现频率
|
|
|
words = []
|
|
|
for comment in data_quanzhan['视频标题']:
|
|
|
words += comment.split()
|
|
|
word_count = Counter(words).most_common(100)
|
|
|
# 转成字典
|
|
|
dct = {k:v for k, v in word_count}
|
|
|
print(dct)
|
|
|
# 生成词云图
|
|
|
font_path = r'C:\Windows\Fonts\msyh.ttc' # 指定微软雅黑字体路径
|
|
|
wordcloud = WordCloud(width=800, height=600, background_color='white',font_path=font_path).generate_from_frequencies(dct)
|
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
|
plt.axis('off')
|
|
|
plt.show()
|
|
|
|
|
|
def panduan(x):
|
|
|
if x in data_meishi['作者'].tolist():
|
|
|
return '美食'
|
|
|
elif x in data_shenghuo['作者'].tolist():
|
|
|
return '生活'
|
|
|
elif x in data_yinyue['作者'].tolist():
|
|
|
return '音乐'
|
|
|
elif x in data_wudao['作者'].tolist():
|
|
|
return '舞蹈'
|
|
|
else:
|
|
|
return '其他'
|
|
|
data_quanzhan['类别'] = data_quanzhan['作者'].apply(panduan)
|
|
|
data_pie = data_quanzhan['类别'].value_counts()
|
|
|
plt.pie(data_pie.values,labels=data_pie.index,autopct="%3.1f%%")
|
|
|
plt.show()
|