import requests
from bs4 import BeautifulSoup
import time
# Startup banner so the console output shows which tool is running.
print("=== 简化版B站弹幕爬虫 ===")

# BV ids of known Bilibili videos about large language models.
video_list = [
    "BV1pu41137aK",  # popular-science video on large language models
    "BV1nN4y1Y7o2",  # LLM technical walkthrough
    "BV1Gu4y1u7rN",  # large-model applications
    "BV1AN41137Jp",  # development of language models
    "BV1tu4y1u7Jh",  # large AI models
]

# Danmaku text collected so far, in the order the videos are processed.
all_danmakus = []
def get_danmaku(bvid):
    """Fetch the danmaku (bullet comments) of one Bilibili video.

    Two HTTP requests are made: the video-info endpoint is queried for the
    video's ``cid``, then the danmaku XML for that ``cid`` is downloaded and
    the text of every ``<d>`` element is collected.

    Args:
        bvid: The video's BV id, e.g. ``"BV1pu41137aK"``.

    Returns:
        A list of non-empty, whitespace-stripped danmaku strings. An empty
        list is returned when ``bvid`` is empty, when the API reports a
        non-zero ``code``, or when any request or parsing step fails.
        Failures are printed rather than raised.
    """
    # Nothing to look up: skip both network round-trips.
    if not bvid:
        print(" 无法获取视频信息: bvid 为空")
        return []

    try:
        print(f"正在获取视频 {bvid} 的弹幕...")

        # Browser-like headers; presumably needed so the API does not
        # reject the request as coming from a script.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com/',
        }

        # 1. Resolve the cid, the id the danmaku endpoint is keyed by.
        #    `params` lets requests URL-encode the value for us.
        response = requests.get(
            "https://api.bilibili.com/x/web-interface/view",
            params={'bvid': bvid},
            headers=headers,
            timeout=10,
        )
        # Surface HTTP-level failures explicitly instead of trying to
        # decode an error page as JSON.
        response.raise_for_status()
        data = response.json()

        if data.get('code') != 0:
            print(f" 无法获取视频信息: {data.get('message')}")
            return []

        # NOTE(review): for multi-part videos this is presumably the cid of
        # the first part only -- confirm against the API documentation.
        cid = data['data']['cid']
        print(f" 获取到CID: {cid}")

        # 2. Download the danmaku XML for that cid.
        response = requests.get(
            "https://api.bilibili.com/x/v1/dm/list.so",
            params={'oid': cid},
            headers=headers,
            timeout=10,
        )
        # Without this check an error page would parse to zero <d> nodes
        # and be reported as a successful fetch of 0 danmaku.
        response.raise_for_status()
        response.encoding = 'utf-8'

        # Each danmaku is the text content of a <d> element.
        soup = BeautifulSoup(response.text, 'xml')
        stripped = (node.get_text().strip() for node in soup.find_all('d'))
        danmaku_list = [text for text in stripped if text]

        print(f" 成功获取 {len(danmaku_list)} 条弹幕")
        return danmaku_list

    except Exception as e:
        # Deliberately broad: one failing video must not abort the whole
        # crawl, so the error is reported and an empty result returned.
        print(f" 获取弹幕失败: {e}")
        return []
# Main program: fetch each video in turn, checkpointing results to disk.
total_videos = len(video_list)
for index, bvid in enumerate(video_list, start=1):
    print(f"\n处理第 {index}/{total_videos} 个视频: {bvid}")
    all_danmakus.extend(get_danmaku(bvid))

    # Rewrite the full output after every video so that an interrupted run
    # still leaves everything collected so far on disk.
    with open('bilibili_danmaku.txt', 'w', encoding='utf-8') as f:
        f.writelines(dm + '\n' for dm in all_danmakus)

    # Pause between videos to avoid hammering the API; there is nothing to
    # wait for after the last one.
    if index < total_videos:
        time.sleep(1)

print(f"\n=== 完成!共获取 {len(all_danmakus)} 条弹幕 ===")

# Show a short preview and write a small statistics file.
if all_danmakus:
    preview = all_danmakus[:10]

    print("\n弹幕示例:")
    for rank, dm in enumerate(preview, start=1):
        print(f"{rank}. {dm}")

    # Persist the total count together with the same preview.
    with open('danmaku_stats.txt', 'w', encoding='utf-8') as f:
        f.write(f"总弹幕数: {len(all_danmakus)}\n")
        f.write("前10条弹幕:\n")
        f.writelines(
            f"{rank}. {dm}\n" for rank, dm in enumerate(preview, start=1)
        )

    print("\n统计信息已保存到: danmaku_stats.txt")
else:
    print("没有获取到弹幕数据")

print("弹幕数据已保存到: bilibili_danmaku.txt")