You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
6.1 KiB

import requests
import re
import time
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
# HTTP headers sent with every request made through `requests.get` in this
# file (passed as `headers=headers`).
# NOTE(review): the 'cookie' value is a hard-coded browser session captured
# at some point in time (it embeds session identifiers such as SESSDATA and
# bili_jct). It looks like a personal credential committed to source —
# consider loading it from an environment variable or a local config file,
# and confirm whether it is still valid.
headers = {
'cookie':'CURRENT_FNVAL=4048; buvid4=CDB22228-76EA-BC93-F037-78FC6CEC077D36275-023090719-X83v1qigvaVs%2BeTu3%2F5T2g%3D%3D; rpdid=|(u)luk)m|)R0J\'uYmR|)Y)J); enable_web_push=DISABLE; header_theme_version=CLOSE; DedeUserID=567151924; DedeUserID__ckMd5=3d51b3cb3879b2e0; PVID=1; buvid3=909EA327-0037-5349-5CF4-5B2C4EF5300103254infoc; b_nut=1726374703; bsource=search_bing; _uuid=A28E1582-10535-12B8-49B2-D1F1026722D1607876infoc; buvid_fp=1ed033837a881c0f6fee6ce1ae293ed0; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY2MzM5MDQsImlhdCI6MTcyNjM3NDY0NCwicGx0IjotMX0.52ZE1RDG5tSqHh-ZOGgEHDzj6W1UyOtiMkcxw_a2WNY; bili_ticket_expires=1726633844; SESSDATA=701189c6%2C1741926705%2C58e19%2A91CjCANFin8L-nRK6CjxH9_BSgRe6HHUSWybFilZklu8yORRObfCV2cnJswJPECKKy1UcSVkMtcE4ydHItZF9lOW43ZFpyelRVVEUzZUVCdlh6S2ltWWJIaTg0MU1DclRIeG8wbE84cE1pSFBkOXA1alNxTkp3bDJuLWNCN2IzV2JXX2p4SGIxaW9BIIEC; bili_jct=98cbd1d0535939dc4a5c474a44d27ad7; sid=7wfo5xb3; home_feed_column=5; browser_resolution=1432-776; CURRENT_QUALITY=80; bp_t_offset_567151924=977713175669506048; b_lsid=811052DA4_191FB79926A',
# Referer of the search page for the URL-encoded keyword used in get_bv.
'Referer': 'https://search.bilibili.com/all?vt=82484714&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5',
# Desktop browser User-Agent string.
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
def get_response(url, timeout=10):
    """Send a GET request to ``url`` using the module-level ``headers``.

    Args:
        url: The URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url=url, headers=headers, timeout=timeout)
def get_bv(page_num, timeout=10):
    """Fetch one page of video search results and return their bvids.

    Queries the Bilibili search endpoint for the hard-coded keyword
    '2024巴黎奥运会' using the module-level ``headers``.

    Args:
        page_num: Page number to request; sent as the ``page`` query
            parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.

    Returns:
        A list with the ``bvid`` of every entry in ``data.result`` of the
        JSON response.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
        KeyError: If the JSON response has no ``data``/``result``/``bvid``
            entry.
    """
    link = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    # NOTE(review): 'w_rid', 'wts' and 'qv_id' look like a request signature,
    # timestamp and query id captured from a browser session; they are
    # presumably tied to these exact parameters and may expire — confirm.
    data = {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': '5654',
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page_num,
        'page_size': '42',
        'pubtime_begin_s': '0',
        'pubtime_end_s': '0',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': '2024巴黎奥运会',
        'qv_id': 'Hz50pRYmKFQYlX2AorY3bJUTNJbLRwnX',
        'source_tag': '3',
        'gaia_vtoken': '',
        'dynamic_offset': '30',
        'web_location': '1430654',
        'w_rid': '1b994979977a17ee8010f012d43fa7b6',
        'wts': '1726501056',
    }
    link_data = requests.get(
        url=link, headers=headers, params=data, timeout=timeout
    ).json()
    return [index['bvid'] for index in link_data['data']['result']]
def get_cid(bvid):
    """Look up the ``cid`` of the video identified by ``bvid``.

    Requests the ``x/web-interface/view`` endpoint through ``get_response``
    and returns ``data.cid`` from the decoded JSON body.
    """
    view_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
    payload = get_response(view_url).json()
    video_info = payload['data']
    return video_info['cid']
def get_danmaku(cid):
    """Download the danmaku XML document for ``cid`` and return it as text.

    The response is decoded as UTF-8 regardless of what the server declares.
    """
    xml_url = f"https://comment.bilibili.com/{cid}.xml"
    reply = get_response(xml_url)
    # Force UTF-8 before reading ``.text`` so the body is decoded with it.
    reply.encoding = 'utf-8'
    xml_text = reply.text
    return xml_text
def parse_danmaku(danmaku_xml):
    """Extract the text of every ``<d ...>text</d>`` element from the XML.

    Args:
        danmaku_xml: The danmaku XML document as a string.

    Returns:
        A list of danmaku strings in document order, with XML character
        entities (``&amp;``, ``&lt;``, ``&#39;`` ...) decoded so the text
        matches what viewers actually typed. An empty list if nothing
        matches.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from html import unescape

    raw_texts = re.findall(r'">(.*?)</d>', danmaku_xml)
    # Text inside XML is entity-escaped; decode it so keyword matching and
    # frequency counting operate on the real characters.
    return [unescape(text) for text in raw_texts]
# Keywords used to decide whether a danmaku is AI-related; a danmaku matches
# when it contains any of these as a substring (see get_ai_danmaku).
ai_keywords = ["AI", "人工智能", "机器学习", "深度学习", "AI技术", "自动驾驶", "智能", "图像识别", "AI应用","智造"]
# Filter the danmaku that are AI-related.
def get_ai_danmaku(danmaku_list_all, ai_keywords):
    """Return the danmaku that contain at least one of ``ai_keywords``.

    Matching is a case-sensitive substring test. Order and duplicates of
    ``danmaku_list_all`` are preserved.
    """
    matched = []
    for text in danmaku_list_all:
        for word in ai_keywords:
            if word in text:
                matched.append(text)
                # One hit is enough; do not add the same danmaku twice.
                break
    return matched
# Count how often each danmaku occurs.
def count_danmaku(danmaku_list):
    """Return a ``Counter`` mapping each danmaku to its number of occurrences."""
    tally = Counter()
    tally.update(danmaku_list)
    return tally
# Get the n most frequent danmaku together with their counts.
def get_top_n_danmaku(danmaku_counter, n=8):
    """Return the ``n`` most common ``(danmaku, count)`` pairs.

    The pairs come from ``danmaku_counter.most_common(n)``, so they are
    ordered from most to least frequent.
    """
    ranked = danmaku_counter.most_common(n)
    return ranked
# Write the results to an Excel file.
def write_to_excel(data, filename='danmaku_AI_top8.xlsx'):
    """Save ``data`` as a two-column Excel sheet at ``filename``.

    ``data`` is turned into a DataFrame with the columns '弹幕内容' (danmaku
    text) and '出现次数' (occurrence count); the row index is not written.
    """
    column_names = ['弹幕内容', '出现次数']
    pd.DataFrame(data, columns=column_names).to_excel(filename, index=False)
# Generate and display the word cloud.
def get_wordcloud(danmaku_counter, font_path='msyh.ttc'):
    """Render a word cloud from danmaku frequencies and show it.

    Args:
        danmaku_counter: Mapping of danmaku text to occurrence count.
        font_path: Path of the font used to draw the words. The default
            'msyh.ttc' is needed so Chinese text renders; pass another font
            path on systems where that file is not available.

    Returns:
        None. If ``danmaku_counter`` is empty, a message is printed and
        nothing is drawn, because ``generate_from_frequencies`` raises
        ``ValueError`` when given no words.
    """
    if not danmaku_counter:
        print("No danmaku to plot; skipping word cloud.")
        return

    wordcloud = WordCloud(
        width=800,                 # image width in pixels
        height=400,                # image height in pixels
        background_color='white',  # background colour
        max_words=100,             # maximum number of words displayed
        colormap='viridis',        # colour map
        font_path=font_path,       # font capable of rendering Chinese text
    ).generate_from_frequencies(danmaku_counter)
    plt.figure(figsize=(10, 5))  # figure size
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")  # hide the axes
    plt.show()
if __name__ == '__main__':
    # Collect the bvids of the first 300 videos from the search results.
    videos = []
    page_num = 1
    while len(videos) < 300:
        bv_list = get_bv(str(page_num))
        if not bv_list:
            # An empty page adds nothing to `videos`, so without this break
            # the loop would keep requesting pages forever.
            print(f"No results on page {page_num}; collected {len(videos)} videos.")
            break
        videos.extend(bv_list)
        page_num += 1
        time.sleep(1)  # avoid sending requests too frequently
    # Keep only the first 300 video ids.
    videos = videos[:300]

    all_danmaku = []
    all_ai_related_danmaku = []
    # Fetch the danmaku of every video; a failure on one video is reported
    # and does not stop the run.
    for bvid in videos:
        try:
            cid = get_cid(bvid)
            danmaku_xml = get_danmaku(cid)
            danmaku_list = parse_danmaku(danmaku_xml)
            # Keep the AI-related danmaku separately.
            ai_related_danmaku = get_ai_danmaku(danmaku_list, ai_keywords)
            all_ai_related_danmaku.extend(ai_related_danmaku)
            all_danmaku.extend(danmaku_list)
        except Exception as e:
            print(f"Error fetching danmaku for video {bvid}: {e}")
        time.sleep(1)  # avoid sending requests too frequently

    # Count how often each AI-related danmaku occurs.
    ai_danmaku_counter = count_danmaku(all_ai_related_danmaku)
    # Take the 8 most frequent danmaku.
    top_8_danmaku = get_top_n_danmaku(ai_danmaku_counter, n=8)
    for dm, _ in top_8_danmaku:
        print(dm)
    # Write them to Excel.
    write_to_excel(top_8_danmaku)
    # Draw the word cloud; an empty frequency table cannot be plotted.
    if ai_danmaku_counter:
        get_wordcloud(ai_danmaku_counter)
    else:
        print("No AI-related danmaku found; skipping word cloud.")