|
|
@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import jieba
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bilibili search API URL
|
|
|
|
|
|
|
|
search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bilibili video-detail API URL, used to obtain a video's cid
|
|
|
|
|
|
|
|
video_info_url = 'https://api.bilibili.com/x/web-interface/view'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bilibili danmaku (bullet-comment) API URL
|
|
|
|
|
|
|
|
danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_bilibili(query, total_results):
    """Search Bilibili for videos matching ``query`` and collect their BV ids.

    Result pages are requested one after another from ``search_url`` until
    ``total_results`` ids have been gathered or the computed number of pages
    is exhausted.

    Args:
        query: Search keyword sent as the ``keyword`` request parameter.
        total_results: Maximum number of video ids to return.

    Returns:
        A list of ``bvid`` values, at most ``total_results`` long. Pages that
        cannot be fetched, parsed, or that report a non-zero ``code`` are
        skipped, so the list may be shorter than requested.
    """
    import os  # function-scope import: only needed for the cookie override below

    num_per_page = 42  # maximum number of videos per result page
    max_attempts = 3  # tries per page while the API keeps answering HTTP 412
    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    pages_needed = (total_results + num_per_page - 1) // num_per_page  # ceiling division
    video_list = []

    # NOTE(review): this is a captured browser session cookie (it contains
    # SESSDATA / bili_jct credentials) committed in source. It is kept only as
    # a backward-compatible fallback; prefer supplying a fresh value through
    # the BILIBILI_COOKIE environment variable and removing this literal.
    default_cookie = 'DedeUserID=1075156086; DedeUserID__ckMd5=7460241d769e1da4; buvid4=9980B4C0-302E-C6A9-122A-0EFE06E4B5F435899-022102715-X83v1qigvaWQdhtSeo%2BvYQ%3D%3D; enable_web_push=DISABLE; buvid3=0DD4B4A8-5B28-59F0-F5EB-9EB31F483AF226299infoc; b_nut=1699086426; _uuid=1FCED779-E59E-F3CA-81A8-817C10CCF3105C25422infoc; header_theme_version=CLOSE; PVID=1; buvid_fp=395bc05f8612d5e47df093ecc1b2bd8e; rpdid=|(J|)Y)JlmJJ0J\'u~|~m|lJ|Y; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; FEED_LIVE_VERSION=V_HEADER_LIVE_NO_POP; SESSDATA=45375f4b%2C1741793139%2Ccfd7b%2A92CjCDJGeSRO5decbO4E62OzjFfKGaAkW2xZVJMSoyMiINCOFoCUntmY4_rMzO8gFMJzYSVkNDTFpKa0h6TWEyS3NNMm5oUkluSnc2OGsySll6MUVBWGFJbmdmZ1VST3pWOWFiek92cnk4S3BhZVRpNFgtLWxacE5iZ3NlX29DdGx3dUJLZWJja253IIEC; bili_jct=f86ff49e42fb369eb9cfa114fa804019; sid=4qsj12c6; home_feed_column=5; browser_resolution=1528-738; bsource=search_bing; bp_t_offset_1075156086=976968501354823680; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MTY5MzMsImlhdCI6MTcyNjQ1NzY3MywicGx0IjotMX0.7WQjSxEb__Z8q6mXZZVKcYfGj_p_EP-8VkK9httVQQA; bili_ticket_expires=1726716873; b_lsid=8197BB1F_191FAD926B7'

    # The headers do not depend on the page, so build them once.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'cookie': os.environ.get('BILIBILI_COOKIE') or default_cookie,
        'origin': 'https://search.bilibili.com',
        'referer': 'https://search.bilibili.com/all',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }

    for page in range(1, pages_needed + 1):
        # NOTE(review): 'w_rid' / 'wts' (and 'qv_id') look like values captured
        # from one browser request; presumably the signature should be
        # recomputed per request -- confirm against the API's signing rules.
        params = {
            '__refresh__': 'true',
            '_extra': '',
            'context': '',
            'page_size': num_per_page,
            'from_source': '',
            'from_spmid': '333.337',
            'platform': 'pc',
            'highlight': '1',
            'single_column': '0',
            'keyword': query,
            'qv_id': '0EnOHi82F62j2usODhMghThN7EvXEZmj',
            'source_tag': '3',
            'dynamic_offset': 30,
            'search_type': 'video',
            'w_rid': '16f27d62ff40f1a5f935a6af26432c81',
            'wts': '1726306000',
            'page': page,  # page number to fetch
        }

        data = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(
                    search_url,
                    params=params,
                    headers=headers,
                    timeout=request_timeout,
                )
            except requests.RequestException as exc:
                # A network failure on one page must not abort the whole search.
                print(f"Page {page} request failed: {exc}")
                break

            print(f"Page {page} HTTP Status Code: {response.status_code}")

            if response.status_code == 412:
                # Retry the SAME page; the original `continue` moved on to the
                # next page and silently dropped this one.
                if attempt < max_attempts:
                    print("请求被阻止,等待1秒重试...")
                    time.sleep(1)
                else:
                    print(f"Page {page} still blocked after {max_attempts} attempts, skipping")
                continue

            try:
                data = response.json()
                print(f"Page {page} Parsed JSON Data:")
                print(data)
            except ValueError:
                # Every JSON decode error raised by requests is a ValueError,
                # whichever JSON backend is installed.
                print(f"Page {page} 无法解析 JSON 数据")
            break

        if not isinstance(data, dict):
            continue

        if data.get('code') != 0:
            print(f"Page {page} Failed to fetch data from Bilibili API")
            continue

        # 'data' or 'result' may be missing/None when a page has no hits.
        videos = (data.get('data') or {}).get('result') or []
        for video in videos:
            video_id = video.get('bvid') if isinstance(video, dict) else None
            if not video_id:
                continue
            video_list.append(video_id)
            if len(video_list) >= total_results:
                break

        if len(video_list) >= total_results:
            break

    return video_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_video_cid(bvid):
    """Look up the ``cid`` of a video through the video-detail API.

    Args:
        bvid: The video's BV id, sent as the ``bvid`` request parameter.

    Returns:
        The value of ``data['cid']`` from the JSON response, or ``None`` when
        the request fails, the status is not 200, the body is not valid JSON,
        or the payload carries no ``cid``.
    """
    params = {'bvid': bvid}
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }

    try:
        # The timeout keeps one stalled connection from freezing the crawl.
        response = requests.get(video_info_url, params=params, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch video info for {bvid}: {exc}")
        return None

    print(f"Video Info HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch video info for {bvid}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Covers json.JSONDecodeError and the requests/simplejson variants.
        print("无法解析视频信息的 JSON 数据")
        return None

    # An error payload may have no 'data' object at all; indexing it directly
    # would raise instead of reporting "not found".
    info = data.get('data') if isinstance(data, dict) else None
    if isinstance(info, dict) and 'cid' in info:
        return info['cid']

    print(f"CID not found for video {bvid}")
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_danmu(cid):
    """Download the danmaku (bullet comments) of one video part.

    Args:
        cid: Identifier sent as the ``oid`` request parameter.

    Returns:
        A list with the text of every ``<d>`` element in the XML response, or
        an empty list when the request fails or the status is not 200.
    """
    params = {'oid': cid}
    headers = {
        'accept': 'application/xml, text/xml, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }

    try:
        # The timeout keeps one stalled connection from freezing the crawl.
        response = requests.get(danmu_url, params=params, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # One unreachable video must not abort the whole run.
        print(f"Failed to fetch danmu for CID {cid}: {exc}")
        return []

    print(f"Danmu HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch danmu for CID {cid}")
        return []

    # errors='replace' keeps a single malformed byte sequence from raising
    # UnicodeDecodeError and discarding every comment of the video.
    content = response.content.decode('utf-8', errors='replace')
    print("Danmu Response Content:")
    print(content)

    soup = BeautifulSoup(content, 'xml')
    danmu_texts = [d.text for d in soup.find_all('d')]

    return danmu_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_and_rank_danmu(danmu_texts, ai_keywords=None, top_n=8):
    """Count AI-related danmaku, print the statistics and export them.

    A danmaku is treated as AI-related when it contains at least one keyword
    as a substring. Two tallies are produced: occurrences per distinct
    AI-related danmaku, and occurrences of danmaku per keyword.

    Args:
        danmu_texts: Iterable of danmaku strings.
        ai_keywords: Optional iterable of keywords to look for. When omitted,
            the built-in list of AI-related terms is used.
        top_n: Number of most frequent AI-related danmaku to report.

    Returns:
        None. Results are printed and passed to ``export_to_excel``.
    """
    if ai_keywords is None:
        ai_keywords = ['人工智能', '机器学习', '深度学习', '自然语言处理', '计算机视觉', '智能算法', '大数据', 'AI', '智能制造', '智能家居', '智能医疗', '物联网', '云计算', '智能服务', '自动化']
    else:
        ai_keywords = list(ai_keywords)

    # Frequency of every distinct danmaku text.
    counter = Counter(danmu_texts)

    ai_counter = Counter()
    keyword_counter = Counter()

    for text, count in counter.items():
        matched = [keyword for keyword in ai_keywords if keyword in text]
        if not matched:
            continue
        # Record the danmaku's own frequency exactly once. Adding it inside
        # the keyword loop multiplied it by the number of keywords it contains.
        ai_counter[text] += count
        for keyword in matched:
            keyword_counter[keyword] += count

    # The top_n most frequent AI-related danmaku.
    ranked_ai_danmu = ai_counter.most_common(top_n)

    print("AI 技术应用关键词的出现次数:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count} 次")

    print(f"\n排名前 {top_n} 的 AI 技术应用弹幕:")
    for text, count in ranked_ai_danmu:
        print(f"弹幕: {text} - 频率: {count} 次")

    # Hand the statistics over for the Excel export.
    export_to_excel(ranked_ai_danmu, keyword_counter)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def export_to_excel(ranked_ai_danmu, keyword_counter):
    """Write the danmaku statistics to ``danmu_statistics.xlsx``.

    Two sheets are written: the ranked AI-related danmaku (raw text, not
    word-segmented) and the per-keyword counts. Afterwards the danmaku frame
    is passed to ``generate_wordcloud``.

    Args:
        ranked_ai_danmu: Sequence of ``(danmaku, frequency)`` pairs.
        keyword_counter: Mapping of keyword to occurrence count.
    """
    danmu_sheet = 'AI 技术应用弹幕'
    keyword_sheet = 'AI 技术关键词'

    # One frame per sheet, keyed by sheet name in the order they are written.
    sheets = {
        danmu_sheet: pd.DataFrame(ranked_ai_danmu, columns=['弹幕', '频率']),
        keyword_sheet: pd.DataFrame(keyword_counter.items(), columns=['关键词', '出现次数']),
    }

    with pd.ExcelWriter('danmu_statistics.xlsx') as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    print("统计结果已导出到 danmu_statistics.xlsx")

    # Word segmentation happens only when the word cloud is built.
    generate_wordcloud(sheets[danmu_sheet])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_wordcloud(df_danmu):
    """Render a word cloud from the danmaku in ``df_danmu['弹幕']``.

    Each danmaku is segmented with jieba, the segments are joined with spaces
    and the resulting text is drawn with ``WordCloud`` and shown through
    matplotlib.

    Args:
        df_danmu: Table-like object whose ``'弹幕'`` column yields the danmaku
            strings.

    Returns:
        None. When there is no text to draw, a message is printed and the
        function returns without building a word cloud.
    """
    # Segment every danmaku so the word cloud sees individual words.
    segmented = [' '.join(jieba.cut(text)) for text in df_danmu['弹幕']]
    text = ' '.join(segmented)

    # WordCloud.generate raises ValueError when it is given no words, which
    # happens whenever no AI-related danmaku was found. Skip instead of crashing.
    if not text.strip():
        print("No danmu text available; skipping word cloud generation")
        return

    # NOTE(review): 'simhei.ttf' is resolved by WordCloud at runtime;
    # presumably the font file must be available on the machine -- confirm.
    wordcloud = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white').generate(text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the pipeline: search videos, gather their danmaku, then rank them.

    Searches for a fixed keyword, resolves each returned BV id to a cid,
    downloads the danmaku for every cid that could be resolved, and passes
    the combined list to ``count_and_rank_danmu``.
    """
    keyword = '2024巴黎奥运会'
    wanted = 300  # total number of videos to crawl

    collected = []
    for bvid in search_bilibili(keyword, wanted):
        cid = get_video_cid(bvid)
        if not cid:
            # No cid could be resolved for this video; nothing to download.
            continue
        collected.extend(fetch_danmu(cid))

    count_and_rank_danmu(collected)


if __name__ == '__main__':
    main()
|