"""Simplified Bilibili danmaku (bullet-comment) scraper.

For every video ID in ``video_list`` the script looks up the video's ``cid``
through the Bilibili web-interface API, downloads the danmaku XML for that
``cid``, and collects the text of every ``<d>`` element.  The collected
comments are written to ``bilibili_danmaku.txt`` and a short summary is
written to ``danmaku_stats.txt``.
"""

import time

import requests
from bs4 import BeautifulSoup

# Known video IDs related to large language models.
video_list = [
    "BV1pu41137aK",  # LLM popular-science video
    "BV1nN4y1Y7o2",  # LLM technology walkthrough
    "BV1Gu4y1u7rN",  # large-model applications
    "BV1AN41137Jp",  # history of language models
    "BV1tu4y1u7Jh",  # AI large models
]

# Accumulates the danmaku text of every processed video, in processing order.
all_danmakus = []

OUTPUT_FILE = 'bilibili_danmaku.txt'
STATS_FILE = 'danmaku_stats.txt'
SAMPLE_SIZE = 10            # number of danmaku shown / stored as examples
REQUEST_TIMEOUT = 10        # seconds, applied to each HTTP request
REQUEST_DELAY_SECONDS = 1   # pause between videos to avoid hammering the API

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com/'
}


def get_danmaku(bvid: str) -> list:
    """Fetch the danmaku texts of one video.

    Args:
        bvid: The video's BV identifier, e.g. ``"BV1pu41137aK"``.

    Returns:
        A list of non-empty, whitespace-stripped danmaku strings.  An empty
        list is returned when the API reports an error, when an HTTP request
        fails, or when the response cannot be parsed; the reason is printed.
    """
    try:
        print(f"正在获取视频 {bvid} 的弹幕...")

        # 1. Look up the video's cid (the ID the danmaku endpoint is keyed on).
        info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
        response = requests.get(info_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        if data['code'] != 0:
            print(f"  无法获取视频信息: {data.get('message')}")
            return []

        cid = data['data']['cid']
        print(f"  获取到CID: {cid}")

        # 2. Download the danmaku XML for that cid.
        danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        response = requests.get(danmaku_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        # Without this check an HTTP error page would be parsed as XML with no
        # <d> elements and misreported as "0 danmaku fetched successfully".
        response.raise_for_status()
        response.encoding = 'utf-8'

        # Each danmaku is the text content of one <d> element.
        soup = BeautifulSoup(response.text, 'xml')
        stripped = (dm.get_text().strip() for dm in soup.find_all('d'))
        danmaku_list = [text for text in stripped if text]

        print(f"  成功获取 {len(danmaku_list)} 条弹幕")
        return danmaku_list

    except Exception as e:
        # Deliberately broad: fetching is best-effort per video, so a failure
        # on one video is reported and skipped instead of aborting the run.
        print(f"  获取弹幕失败: {e}")
        return []


def _save_danmakus(path, danmakus):
    """Overwrite *path* with one danmaku per line (UTF-8)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dm + '\n' for dm in danmakus)


def _save_stats(path, danmakus):
    """Write the total count and the first ``SAMPLE_SIZE`` danmaku to *path*."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"总弹幕数: {len(danmakus)}\n")
        f.write("前10条弹幕:\n")
        for i, dm in enumerate(danmakus[:SAMPLE_SIZE], start=1):
            f.write(f"{i}. {dm}\n")


def main():
    """Scrape every video in ``video_list`` and write the result files."""
    print("=== 简化版B站弹幕爬虫 ===")

    total = len(video_list)
    for i, bvid in enumerate(video_list, start=1):
        print(f"\n处理第 {i}/{total} 个视频: {bvid}")
        all_danmakus.extend(get_danmaku(bvid))

        # Save after every video so earlier results survive a later crash.
        _save_danmakus(OUTPUT_FILE, all_danmakus)

        # Throttle to avoid sending requests too quickly.
        time.sleep(REQUEST_DELAY_SECONDS)

    print(f"\n=== 完成!共获取 {len(all_danmakus)} 条弹幕 ===")

    # Show a few examples and persist a short summary.
    if all_danmakus:
        print("\n弹幕示例:")
        for i, dm in enumerate(all_danmakus[:SAMPLE_SIZE], start=1):
            print(f"{i}. {dm}")

        _save_stats(STATS_FILE, all_danmakus)
        print(f"\n统计信息已保存到: {STATS_FILE}")
    else:
        print("没有获取到弹幕数据")

    print(f"弹幕数据已保存到: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()