import requests
import json
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import time
import jieba
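# Third-party dependencies assumed by this script: requests, beautifulsoup4
# (plus lxml for the 'xml' parser), pandas (plus openpyxl for .xlsx output),
# wordcloud, and jieba, e.g.:
#   pip install requests beautifulsoup4 lxml pandas openpyxl wordcloud jieba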
# Bilibili search API URL
search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
# Bilibili video detail API URL, used to fetch a video's cid
video_info_url = 'https://api.bilibili.com/x/web-interface/view'
# Bilibili danmaku (bullet-comment) API URL
danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so'
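# NOTE: /x/v1/dm/list.so is the legacy danmaku endpoint that returns plain XML;
# newer clients use a protobuf-based segment API, but the XML variant is much
# simpler to parse with BeautifulSoup.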
def search_bilibili(query, total_results):
    num_per_page = 42  # maximum number of videos per page
    pages_needed = (total_results + num_per_page - 1) // num_per_page  # how many pages to request
    video_list = []
    for page in range(1, pages_needed + 1):
        params = {
            '__refresh__': 'true',
            '_extra': '',
            'context': '',
            'page_size': num_per_page,
            'from_source': '',
            'from_spmid': '333.337',
            'platform': 'pc',
            'highlight': '1',
            'single_column': '0',
            'keyword': query,
            'qv_id': '0EnOHi82F62j2usODhMghThN7EvXEZmj',
            'source_tag': '3',
            'dynamic_offset': 30,
            'search_type': 'video',
            # 'w_rid'/'wts' form a WBI signature captured from one browser
            # session; they expire (see the note after this function)
            'w_rid': '16f27d62ff40f1a5f935a6af26432c81',
            'wts': '1726306000',
            'page': page  # page number
        }
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
            # Session cookie captured from a logged-in browser; SESSDATA and
            # bili_jct expire, so replace this with your own before running
            'cookie': 'DedeUserID=1075156086; DedeUserID__ckMd5=7460241d769e1da4; buvid4=9980B4C0-302E-C6A9-122A-0EFE06E4B5F435899-022102715-X83v1qigvaWQdhtSeo%2BvYQ%3D%3D; enable_web_push=DISABLE; buvid3=0DD4B4A8-5B28-59F0-F5EB-9EB31F483AF226299infoc; b_nut=1699086426; _uuid=1FCED779-E59E-F3CA-81A8-817C10CCF3105C25422infoc; header_theme_version=CLOSE; PVID=1; buvid_fp=395bc05f8612d5e47df093ecc1b2bd8e; rpdid=|(J|)Y)JlmJJ0J\'u~|~m|lJ|Y; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; FEED_LIVE_VERSION=V_HEADER_LIVE_NO_POP; home_feed_column=5; browser_resolution=1528-738; bsource=search_bing; bp_t_offset_1075156086=976968501354823680; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MTY5MzMsImlhdCI6MTcyNjQ1NzY3MywicGx0IjotMX0.7WQjSxEb__Z8q6mXZZVKcYfGj_p_EP-8VkK9httVQQA; bili_ticket_expires=1726716873; b_lsid=A255A8C5_191FF65B3BE; SESSDATA=0e66c2c1%2C1742120673%2Cd251f%2A92CjClS9jPOjTyWfjKmoc1Qved4Vfi9N1Jb4KXprWc3-K-qETxsCKQP47sEElvDz-dK0kSVjNHZTNRUUhDSS1DUUJfVzQ3VlQ2NW44YktqbmpLN2hSR2VGQUVIajlfMFAxeERvWlhlWEQ5M1FkX2gxV19FT2wwYjNIcWMwVVRTcElteFpLbkZvRnBRIIEC; bili_jct=0409648e28f719911ffba1058edc4d6d; sid=gq4mtedj',
            'origin': 'https://search.bilibili.com',
            'referer': 'https://search.bilibili.com/all',
            'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
        }
        response = requests.get(search_url, params=params, headers=headers)
        print(f"Page {page} HTTP Status Code: {response.status_code}")
        if response.status_code == 412:
            print("Request blocked; waiting 1 second and skipping this page...")
            time.sleep(1)
            continue
        try:
            data = response.json()
            print(f"Page {page} Parsed JSON Data:")
            print(data)
        except json.JSONDecodeError:
            print(f"Page {page}: failed to parse JSON data")
            continue
        if data['code'] != 0:
            print(f"Page {page}: failed to fetch data from the Bilibili API")
            continue
        videos = data['data'].get('result', [])
        for video in videos:
            video_id = video['bvid']
            video_list.append(video_id)
            if len(video_list) >= total_results:
                break
        if len(video_list) >= total_results:
            break
    return video_list
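# NOTE: the hard-coded 'w_rid'/'wts' pair in search_bilibili() is a WBI
# signature captured from one browser session. Bilibili derives it from an MD5
# over the sorted query string mixed with a key obtained via the nav API, so a
# captured value is only valid for that exact parameter set and soon expires;
# a long-running crawler would need to re-sign every request.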
def get_video_cid(bvid):
    # Request the video's detail info to get its cid
    params = {
        'bvid': bvid
    }
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }
    response = requests.get(video_info_url, params=params, headers=headers)
    print(f"Video Info HTTP Status Code: {response.status_code}")
    if response.status_code != 200:
        print(f"Failed to fetch video info for {bvid}")
        return None
    try:
        data = response.json()
        # Guard against 'data' being null when the API reports an error
        if data.get('data') and 'cid' in data['data']:
            return data['data']['cid']
        else:
            print(f"CID not found for video {bvid}")
            return None
    except json.JSONDecodeError:
        print("Failed to parse the video info JSON")
        return None
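# NOTE: the top-level 'cid' returned by the view API is assumed to be the cid
# of the video's first part (page 1); multi-part videos also carry a per-part
# cid list under data['pages'], which this script does not use.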
def fetch_danmu(cid):
    params = {
        'oid': cid
    }
    headers = {
        'accept': 'application/xml, text/xml, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }
    response = requests.get(danmu_url, params=params, headers=headers)
    print(f"Danmu HTTP Status Code: {response.status_code}")
    if response.status_code != 200:
        print(f"Failed to fetch danmu for CID {cid}")
        return []
    content = response.content.decode('utf-8')
    print("Danmu Response Content:")
    print(content)
    # The 'xml' parser requires the lxml package
    soup = BeautifulSoup(content, 'xml')
    danmu_texts = [d.text for d in soup.find_all('d')]
    return danmu_texts
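# The XML returned by list.so has, in the legacy format, a shape roughly like:
#   <i>
#     <d p="12.34,1,25,16777215,...">danmu text</d>
#     ...
#   </i>
# where 'p' packs timing/mode/color metadata; fetch_danmu() keeps only the
# text content of each <d> element.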
def count_and_rank_danmu(danmu_texts):
    # Keywords stay in Chinese because they must match Chinese danmu text
    ai_keywords = ['人工智能', '机器学习', '深度学习', '自然语言处理', '计算机视觉', '智能算法', '大数据', 'AI', '智能制造', '智能家居', '智能医疗', '物联网', '云计算', '智能服务', '自动化', 'ai', '机器人']
    top_n = 8
    # Frequency of each distinct danmu
    counter = Counter(danmu_texts)
    # Frequencies of danmu related to AI technology applications
    ai_counter = Counter()
    keyword_counter = Counter()
    for text, count in counter.items():
        matched = False
        # Tally how often each AI keyword occurs
        for keyword in ai_keywords:
            if keyword in text:
                keyword_counter[keyword] += count
                matched = True
        # Count a matching danmu once, even if it contains several keywords
        if matched:
            ai_counter[text] = count
    # Top top_n danmu by frequency
    ranked_ai_danmu = ai_counter.most_common(top_n)
    # Print the occurrence count of each AI keyword
    print("Occurrences of AI technology application keywords:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count}")
    # Print the top_n danmu and their frequencies
    print(f"\nTop {top_n} AI technology application danmu:")
    for text, count in ranked_ai_danmu:
        print(f"Danmu: {text} - Frequency: {count}")
    # Export the statistics to Excel
    export_to_excel(ranked_ai_danmu, keyword_counter)
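# A tiny worked example of the counting logic above (hypothetical danmu):
#   danmu_texts = ['AI真强', 'AI真强', '机器人表演', '今晚吃什么']
#   Counter(danmu_texts) -> {'AI真强': 2, '机器人表演': 1, '今晚吃什么': 1}
#   keyword_counter      -> {'AI': 2, '机器人': 1}
#   ai_counter           -> {'AI真强': 2, '机器人表演': 1}; the last danmu is
#   excluded because it matches no AI keyword.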
def export_to_excel(ranked_ai_danmu, keyword_counter):
    # Build DataFrames without word segmentation, keeping the original danmu
    df_danmu = pd.DataFrame(ranked_ai_danmu, columns=['弹幕', '频率'])
    df_keywords = pd.DataFrame(keyword_counter.items(), columns=['关键词', '出现次数'])
    # Save to an Excel file (pandas writes .xlsx via openpyxl)
    with pd.ExcelWriter('danmu_statistics.xlsx') as writer:
        df_danmu.to_excel(writer, sheet_name='AI 技术应用弹幕', index=False)
        df_keywords.to_excel(writer, sheet_name='AI 技术关键词', index=False)
    print("Statistics exported to danmu_statistics.xlsx")
    # Word segmentation is applied only when generating the word cloud
    generate_wordcloud(df_danmu)
def generate_wordcloud(df_danmu):
    # Segment each danmu into words
    processed_texts = []
    for text in df_danmu['弹幕']:
        words = jieba.cut(text)  # segment with jieba
        processed_texts.append(' '.join(words))  # join the segments into one string
    # Text data for the word cloud
    text = ' '.join(processed_texts)
    # Generate the word cloud
    wordcloud = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white').generate(text)
    # Save the word cloud image
    wordcloud.to_file('词云.png')
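# NOTE: 'simhei.ttf' is assumed to be present in the working directory (or an
# absolute path); WordCloud needs a font that covers CJK glyphs, otherwise
# Chinese words render as empty boxes. Substitute any installed CJK font.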
def main():
    query = '2024巴黎奥运会'
    total_results = 300  # total number of videos to crawl
    video_list = search_bilibili(query, total_results)
    all_danmu_texts = []
    for bvid in video_list:
        cid = get_video_cid(bvid)
        if cid:
            danmu_texts = fetch_danmu(cid)
            all_danmu_texts.extend(danmu_texts)
        time.sleep(0.5)  # brief pause between videos to reduce the risk of 412 blocks
    count_and_rank_danmu(all_danmu_texts)

if __name__ == '__main__':
    main()