# 4.1 KiB
import re
from collections import Counter
from multiprocessing.dummy import Pool
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import requests
from tqdm import tqdm
from wordcloud import WordCloud
# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search term sent to the Bilibili search page
DANMU_KEYWORD = "AI"  # only danmaku containing this substring are kept
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # number of worker threads in the pool
# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}
def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one Bilibili search-result page for ``KEYWORD``.

    Args:
        page: Page number of the search results to request.

    Returns:
        The response body as text, or an empty string if the request failed.
    """
    url = "https://search.bilibili.com/all"
    # KEYWORD contains a space and non-ASCII characters, so let requests
    # build and percent-encode the query string instead of interpolating it.
    params = {"keyword": KEYWORD, "order": "click", "page": page}
    try:
        # Without a timeout a stalled connection would block a pool worker
        # indefinitely.
        response = requests.get(url, params=params, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
    return response.text
def get_bvs(html: str) -> list:
    """Extract video BV ids from the text of a search-result page.

    Every ``bvid:"..."`` occurrence in *html* is matched. Ids are returned in
    first-seen order with duplicates removed, so a video that appears more
    than once on the page is only crawled once.

    Args:
        html: Page text to scan; may be empty.

    Returns:
        A list of unique BV id strings (empty if nothing matched).
    """
    matches = re.findall(r'bvid:"([^"]+)"', html)
    # dict.fromkeys keeps insertion order while dropping repeats.
    return list(dict.fromkeys(matches))
def get_info(vid: str) -> dict:
    """Fetch the title and the cid of every part of one video.

    Args:
        vid: BV id of the video.

    Returns:
        ``{"标题": <title>, "cid": [<cid>, ...]}`` on success, or an empty
        dict if the request failed or the response lacked the expected
        ``data -> View`` structure.
    """
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        # Send the same headers as the search request and bound the wait so
        # a stalled connection cannot hang the crawl.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching info for vid {vid}: {e}")
        return {}

    try:
        view = data["data"]["View"]
        return {
            "标题": view["title"],
            "cid": [part["cid"] for part in view["pages"]],
        }
    except (KeyError, TypeError):
        # "data" missing/null or an unexpected shape: treat as "no info"
        # rather than crashing the whole crawl.
        print(f"Unexpected response structure for vid {vid}")
        return {}
def get_danmu(info: dict) -> list: """获取视频的弹幕""" all_dms = [] for cid in info.get("cid", []): url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}" try: response = requests.get(url) response.encoding = "utf-8" data = re.findall('(.*?)', response.text) dms = [d[1] for d in data if DANMU_KEYWORD in d[1]] # 过滤包含AI的弹幕 all_dms += dms except requests.RequestException as e: print(f"Error fetching danmu for cid {cid}: {e}")
print(f"获取弹幕{len(all_dms)}条!")
return all_dms
def save_danmu(bv: str, danmu_data: list):
    """Append danmaku to the Excel workbook ``./{KEYWORD}弹幕.xlsx``.

    If the workbook already exists, its rows are kept and the new rows are
    added after them; otherwise a new workbook is created.

    Args:
        bv: Unused; kept so existing callers keep working.
        danmu_data: Danmaku texts, written to a single column named ``弹幕``.
    """
    path = Path(f"./{KEYWORD}弹幕.xlsx")
    df = pd.DataFrame(danmu_data, columns=['弹幕'])
    # DataFrame.to_excel has no append mode, so merge with the existing
    # sheet in memory and rewrite the file.
    if path.exists():
        df = pd.concat([pd.read_excel(path), df], ignore_index=True)
    df.to_excel(path, index=False)
def main():
    """Crawl search results, collect matching danmaku, then report, save and plot them."""
    # Fetch all search pages concurrently; the context manager shuts the
    # worker threads down once the map has finished.
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))

    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))
    # Drop ids repeated across pages (order preserved) so no video is
    # counted twice, then keep at most the first 300 videos.
    bvs = list(dict.fromkeys(bvs))[:300]

    all_danmu = []
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        info = get_info(bv)
        if info:
            all_danmu.extend(get_danmu(info))

    # Report the 8 most frequent matching danmaku.
    counter = Counter(all_danmu)
    top_danmu = counter.most_common(8)
    print("AI相关弹幕统计(数量排名前8):")
    for text, count in top_danmu:
        print(f"{text}: {count}")

    # Persist to Excel, then show the word cloud.
    save_danmu(KEYWORD, all_danmu)
    generate_wordcloud(all_danmu)
def generate_wordcloud(danmu_data, font_path=None):
    """Render and display a word cloud built from the danmaku texts.

    Args:
        danmu_data: Iterable of danmaku strings; they are joined with spaces.
        font_path: Optional path to a font file passed to ``WordCloud``.
            NOTE(review): the library's default font presumably lacks CJK
            glyphs, so pass a CJK-capable font for Chinese text — confirm.

    Returns:
        None. When there is no text to plot, a message is printed and
        nothing is drawn.
    """
    text = " ".join(danmu_data)
    # WordCloud.generate raises on input with no words; skip plotting instead
    # of crashing after the crawl has already finished.
    if not text.strip():
        print("No danmaku to plot; skipping word cloud.")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,
    ).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("弹幕词云图")
    plt.show()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()