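"""Scrape the Bilibili ranking API for five ranking tabs (site-wide, music,
dance, lifestyle, food), save each tab's top-100 list to a CSV file, and then
run a small exploratory analysis: a danmaku-vs-plays scatter plot, a Spearman
correlation heatmap, bar charts of play-count mean and variance per tab, a
word cloud of video titles, and a pie chart of which category each site-wide
uploader also appears in."""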

import requests  # send HTTP requests to the ranking API
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import jieba  # Chinese word segmentation
from collections import Counter
from wordcloud import WordCloud
url_dict = {
    '全站': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all',    # site-wide
    '音乐': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=3&type=all',    # music
    '舞蹈': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=129&type=all',  # dance
    '生活': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=160&type=all',  # lifestyle
    '美食': 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=211&type=all',  # food
}
headers = {
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Referer": "https://www.bilibili.com/v/popular/rank/all/",
}
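# Expected response shape, inferred from the fields accessed below (the
# endpoint is not officially documented, so this may change):
#   {"data": {"list": [{"title": ..., "bvid": ...,
#                       "owner": {"name": ...},
#                       "stat": {"view": ..., "danmaku": ..., "coin": ...,
#                                "like": ..., "share": ..., "favorite": ...}},
#                      ...]}}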
for tab_name, url in url_dict.items():  # tab name -> ranking API URL
    title_list = []
    play_cnt_list = []      # play counts
    danmu_cnt_list = []     # danmaku (bullet comment) counts
    coin_cnt_list = []      # coin counts
    like_cnt_list = []      # like counts
    share_cnt_list = []     # share counts
    favorite_cnt_list = []  # favorite counts
    author_list = []
    video_url = []
    try:
        r = requests.get(url, headers=headers)
        print(r.status_code)
        # Parse the response body into a dict and pull out the ranking list
        json_data = r.json()
        list_data = json_data['data']['list']
        for data in list_data:
            title_list.append(data['title'])
            play_cnt_list.append(data['stat']['view'])
            danmu_cnt_list.append(data['stat']['danmaku'])
            coin_cnt_list.append(data['stat']['coin'])
            like_cnt_list.append(data['stat']['like'])
            # dislike_cnt_list.append(data['stat']['dislike'])
            share_cnt_list.append(data['stat']['share'])
            favorite_cnt_list.append(data['stat']['favorite'])
            author_list.append(data['owner']['name'])
            # score_list.append(data['score'])
            video_url.append('https://www.bilibili.com/video/' + data['bvid'])
            # print('*' * 10)
    except Exception as e:
        print("Scrape failed: {}".format(str(e)))
    # Build a DataFrame holding this tab's data
    df = pd.DataFrame(
        {'视频标题': title_list,       # video title
         '视频地址': video_url,        # video URL
         '作者': author_list,          # uploader
         '播放数': play_cnt_list,      # plays
         '弹幕数': danmu_cnt_list,     # danmaku
         '投币数': coin_cnt_list,      # coins
         '点赞数': like_cnt_list,      # likes
         '分享数': share_cnt_list,     # shares
         '收藏数': favorite_cnt_list,  # favorites
         })
    # print(df.head())
    # Save the data locally; utf_8_sig avoids mojibake when opened in Excel
    df.to_csv('B站TOP100-{}.csv'.format(tab_name), index=False, encoding='utf_8_sig')
    print('Saved: ' + 'B站TOP100-{}.csv'.format(tab_name))
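# ---- Analysis ----
# Everything below assumes the five CSVs written above exist on disk;
# the script reloads them rather than reusing the in-memory frames.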
data_meishi = pd.read_csv('B站TOP100-美食.csv')    # food
data_quanzhan = pd.read_csv('B站TOP100-全站.csv')  # site-wide
data_shenghuo = pd.read_csv('B站TOP100-生活.csv')  # lifestyle
data_wudao = pd.read_csv('B站TOP100-舞蹈.csv')     # dance
data_yinyue = pd.read_csv('B站TOP100-音乐.csv')    # music
plt.rcParams['font.family'] = 'SimHei'  # a CJK font so Chinese labels render
# Scatter plot of danmaku count vs. play count (site-wide tab)
plt.figure(figsize=(8, 6))  # set an explicit figure size so the plot is not clipped
sns.scatterplot(x='弹幕数', y='播放数', data=data_quanzhan)
plt.show()
# Spearman rank correlation between the numeric columns; rank-based
# correlation is less sensitive to the heavy-tailed count distributions
numeric_data = data_quanzhan.select_dtypes(include=['number'])
cor = numeric_data.corr(method='spearman')
sns.heatmap(cor,
            annot=True,          # write the correlation coefficient in each cell
            center=0.5,          # value at the center of the colormap
            fmt='.2f',           # two decimal places
            linewidths=0.5,      # width of the gaps between cells
            linecolor='blue',    # color of the gaps
            vmin=0, vmax=1,      # color scale limits
            xticklabels=True, yticklabels=True,  # show both axis labels
            square=True,         # square cells
            cbar=True,           # draw the color bar
            cmap='coolwarm_r',   # colormap
            )
plt.show()
# Mean play count per tab
mean_list = [data_quanzhan['播放数'].mean(), data_wudao['播放数'].mean(),
             data_shenghuo['播放数'].mean(), data_meishi['播放数'].mean(),
             data_yinyue['播放数'].mean()]
name_list = ['全站', '舞蹈', '生活', '美食', '音乐']
plt.bar(name_list, mean_list)
plt.ylabel('播放数')
plt.show()
# Variance of play count per tab
var_list = [data_quanzhan['播放数'].var(), data_wudao['播放数'].var(),
            data_shenghuo['播放数'].var(), data_meishi['播放数'].var(),
            data_yinyue['播放数'].var()]
plt.bar(name_list, var_list)
plt.ylabel('播放数方差')
plt.show()
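# Variance grows with the square of the scale, so high-traffic tabs dominate
# the chart above. A minimal sketch of a scale-free alternative (coefficient
# of variation), reusing the frames loaded earlier:
cv_list = [d['播放数'].std() / d['播放数'].mean()
           for d in (data_quanzhan, data_wudao, data_shenghuo, data_meishi, data_yinyue)]
plt.bar(name_list, cv_list)
plt.ylabel('播放数变异系数')  # coefficient of variation of play counts
plt.show()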
# Text preprocessing: tokenize video titles with jieba
data_quanzhan['视频标题'] = data_quanzhan['视频标题'].apply(lambda x: ' '.join(jieba.lcut(x)))
# Load the stop-word list (one word per line)
with open('./停用词表.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')
# Drop stop words from the tokenized titles
data_quanzhan['视频标题'] = data_quanzhan['视频标题'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
# Count word frequencies across all titles
words = []
for comment in data_quanzhan['视频标题']:
    words += comment.split()
word_count = Counter(words).most_common(100)
# most_common returns (word, count) pairs; convert them to a dict
dct = dict(word_count)
print(dct)
# Generate the word cloud; a CJK-capable font is required or the words render
# as empty boxes (this path is Windows-specific)
font_path = r'C:\Windows\Fonts\msyh.ttc'  # Microsoft YaHei
wordcloud = WordCloud(width=800, height=600, background_color='white',
                      font_path=font_path).generate_from_frequencies(dct)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
def panduan(x):
    """Classify a site-wide uploader by which category ranking they also appear in."""
    if x in data_meishi['作者'].tolist():
        return '美食'
    elif x in data_shenghuo['作者'].tolist():
        return '生活'
    elif x in data_yinyue['作者'].tolist():
        return '音乐'
    elif x in data_wudao['作者'].tolist():
        return '舞蹈'
    else:
        return '其他'

data_quanzhan['类别'] = data_quanzhan['作者'].apply(panduan)
data_pie = data_quanzhan['类别'].value_counts()
plt.pie(data_pie.values, labels=data_pie.index, autopct="%3.1f%%")
plt.show()
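# To also keep any of these figures on disk, call plt.savefig() before the
# corresponding plt.show(), e.g.:
# plt.savefig('category_pie.png', dpi=150, bbox_inches='tight')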