You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import time
print("=== 简化版B站弹幕爬虫 ===")

# Accumulator for the danmaku collected from every video processed below.
all_danmakus = []

# BV ids of some known videos about large language models.
video_list = [
    "BV1pu41137aK",  # popular-science introduction to large language models
    "BV1nN4y1Y7o2",  # explanation of LLM technology
    "BV1Gu4y1u7rN",  # applications of large models
    "BV1AN41137Jp",  # history of language models
    "BV1tu4y1u7Jh",  # large AI models
]
def get_danmaku(bvid: str) -> list:
    """Fetch the danmaku (bullet comments) of one Bilibili video.

    Two HTTP requests are made: the video-info endpoint is queried to resolve
    ``bvid`` to the ``cid`` expected by the danmaku endpoint, then the danmaku
    XML for that ``cid`` is downloaded and the text of every ``<d>`` element
    is collected.

    Args:
        bvid: The video's BV identifier, e.g. ``"BV1pu41137aK"``.

    Returns:
        A list of non-empty, whitespace-stripped danmaku strings. An empty
        list is returned when the info API answers with a non-zero ``code``
        or when any step fails; failures are printed, never raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.bilibili.com/'
    }
    try:
        print(f"正在获取视频 {bvid} 的弹幕...")

        # Step 1: resolve the video to its cid (the id the danmaku API uses).
        # `params=` lets requests URL-encode the value instead of splicing it
        # into the query string by hand.
        response = requests.get(
            "https://api.bilibili.com/x/web-interface/view",
            params={"bvid": bvid},
            headers=headers,
            timeout=10,
        )
        # Surface HTTP-level failures explicitly rather than as a confusing
        # JSON decoding error on an error page.
        response.raise_for_status()
        data = response.json()
        if data['code'] != 0:
            print(f" 无法获取视频信息: {data.get('message')}")
            return []
        cid = data['data']['cid']
        print(f" 获取到CID: {cid}")

        # Step 2: download the danmaku XML for that cid.
        response = requests.get(
            "https://api.bilibili.com/x/v1/dm/list.so",
            params={"oid": cid},
            headers=headers,
            timeout=10,
        )
        # Without this check an HTTP error body would be parsed as XML and
        # reported as a "successful" fetch of zero danmaku.
        response.raise_for_status()
        response.encoding = 'utf-8'

        # Parse the XML; each danmaku is the text content of a <d> element.
        soup = BeautifulSoup(response.text, 'xml')
        stripped = (node.get_text().strip() for node in soup.find_all('d'))
        danmaku_list = [text for text in stripped if text]

        print(f" 成功获取 {len(danmaku_list)} 条弹幕")
        return danmaku_list
    except Exception as e:
        # Deliberately broad: this is a best-effort scraper, so one failing
        # video is reported and skipped instead of aborting the whole run.
        print(f" 获取弹幕失败: {e}")
        return []
# Main program: fetch every video in turn, checkpointing the results to disk.
total_videos = len(video_list)
for index, bvid in enumerate(video_list, start=1):
    print(f"\n处理第 {index}/{total_videos} 个视频: {bvid}")
    all_danmakus.extend(get_danmaku(bvid))

    # Rewrite the complete output file after every video so the danmaku
    # collected so far are on disk even if a later iteration is interrupted.
    with open('bilibili_danmaku.txt', 'w', encoding='utf-8') as f:
        f.writelines(f"{dm}\n" for dm in all_danmakus)

    # Pause between videos to avoid sending requests too quickly; after the
    # last video there is nothing left to throttle, so skip the wait.
    if index < total_videos:
        time.sleep(1)

print(f"\n=== 完成!共获取 {len(all_danmakus)} 条弹幕 ===")

# Show a short preview and write a small statistics file.
if all_danmakus:
    preview = all_danmakus[:10]

    print("\n弹幕示例:")
    for rank, dm in enumerate(preview, start=1):
        print(f"{rank}. {dm}")

    # Save the total count together with the same first-ten preview.
    with open('danmaku_stats.txt', 'w', encoding='utf-8') as f:
        f.write(f"总弹幕数: {len(all_danmakus)}\n")
        f.write("前10条弹幕:\n")
        f.writelines(
            f"{rank}. {dm}\n" for rank, dm in enumerate(preview, start=1)
        )
    print("\n统计信息已保存到: danmaku_stats.txt")
else:
    print("没有获取到弹幕数据")

print("弹幕数据已保存到: bilibili_danmaku.txt")