parent
84ac1d628d
commit
05d8d516ed
@ -0,0 +1,150 @@
|
||||
import requests
|
||||
import re
|
||||
from lxml import etree
|
||||
import os
|
||||
import time
|
||||
import random
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import OrderedDict
|
||||
|
||||
class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of one Bilibili video to a text file.

    The workflow implemented by :meth:`crawl` is: fetch the video page, take
    the first ``"cid":<digits>`` value found in its HTML, request
    ``http://comment.bilibili.com/<cid>.xml`` and write the text of every
    ``<d>`` element to ``filename``, one comment per line.
    """

    # Number of attempts made to fetch the video page before giving up.
    _CID_ATTEMPTS = 3
    # Seconds to wait between two consecutive attempts.
    _RETRY_DELAY = 1.5

    def __init__(self, bv, filename):
        """Remember the target video and the output path.

        Args:
            bv: Video id, with or without the leading ``"BV"``.
            filename: Path of the text file the comments are written to.
        """
        # Normalise so that both "BVxxxx" and "xxxx" yield the same URL.
        if bv.startswith("BV"):
            bv = bv[2:]
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/117.0.0.0 Safari/537.36",
            "Referer": "https://www.bilibili.com/",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

    def get_video_cid(self):
        """Return the first ``cid`` found in the video page, or ``None``.

        The page is requested up to ``_CID_ATTEMPTS`` times. Every failed
        attempt (request error, non-200 status, or no ``cid`` in the page) is
        followed by a ``_RETRY_DELAY`` pause before the next one; no pause is
        made after the last attempt.

        Returns:
            The cid as a string of digits, or ``None`` if every attempt failed.
        """
        for attempt in range(self._CID_ATTEMPTS):
            if attempt:
                # Back off before *every* retry, including after a non-200.
                time.sleep(self._RETRY_DELAY)
            try:
                response = requests.get(self.video_url, headers=self.headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"[{self.video_url}] 获取 cid 出错: {e}")
                continue
            if response.status_code != 200:
                print(f"[{self.video_url}] 请求失败: {response.status_code}")
                continue
            # Undecodable bytes are dropped; only the ASCII cid is needed.
            html = response.content.decode(errors='ignore')
            match = re.search(r'"cid":([0-9]+)', html)
            if match:
                cid = match.group(1)
                print(f"[{self.video_url}] 获取到 cid={cid}")
                return cid
        return None

    def get_content(self, xml_url):
        """Fetch ``xml_url`` and return the raw response body.

        Args:
            xml_url: URL of the danmaku XML document.

        Returns:
            The response body as ``bytes`` on HTTP 200, otherwise ``None``
            (the failure is printed).
        """
        try:
            response = requests.get(xml_url, headers=self.headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"[{xml_url}] 请求弹幕出错: {e}")
            return None
        if response.status_code == 200:
            return response.content
        print(f"[{xml_url}] 请求失败: {response.status_code}")
        return None

    def extract_danmu(self, content_str):
        """Return the text of every ``<d>`` element in ``content_str``.

        Args:
            content_str: The document to parse, as accepted by ``etree.HTML``.

        Returns:
            A list of comment strings; empty when nothing is found or when
            parsing fails (the error is printed, not raised).
        """
        # Parsing is best-effort: any failure is reported and treated as
        # "no comments" so that one bad document does not stop a batch run.
        try:
            html = etree.HTML(content_str)
            danmu_list = html.xpath("//d/text()")
        except Exception as e:
            print(f"解析弹幕出错: {e}")
            return []
        print(f"解析到 {len(danmu_list)} 条弹幕")
        if danmu_list:
            print("示例前5条:", danmu_list[:5])
        return danmu_list

    def save(self, save_items):
        """Write ``save_items`` to ``self.filename``, one item per line.

        Nothing is written when ``save_items`` is empty. The parent directory
        is created when the path has one.

        Args:
            save_items: Iterable of comment strings (must support ``len``).
        """
        if not save_items:
            print(f"[{self.filename}] 无弹幕内容,跳过保存")
            return
        # dirname() is "" for a bare file name, and makedirs("") raises.
        directory = os.path.dirname(self.filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.filename, 'w', encoding='utf-8') as f:
            # Leaving the with-block flushes and closes the file.
            f.writelines(f"{item}\n" for item in save_items)
        print(f"[写入完成] {self.filename} ({len(save_items)} 条)")

    def crawl(self):
        """Run the full download for this video.

        Returns:
            ``True`` when comments were written to ``self.filename``;
            ``False`` when the cid lookup, the XML request or the parsing
            produced nothing.
        """
        cid = self.get_video_cid()
        if not cid:
            print(f"[跳过] 无效 BV: {self.video_url}")
            return False

        # The last URL path segment is the "BV..." id, used in log messages.
        bv_id = self.video_url.split('/')[-1]

        xml_url = f"http://comment.bilibili.com/{cid}.xml"
        content = self.get_content(xml_url)
        if not content:
            print(f"[跳过] BV号 {bv_id} 弹幕请求失败")
            return False

        danmu_list = self.extract_danmu(content)
        if not danmu_list:
            print(f"[跳过] BV号 {bv_id} 无弹幕")
            return False

        self.save(danmu_list)
        return True
|
||||
|
||||
|
||||
def search_videos(query, max_results=20):
    """Search Bilibili for videos matching ``query`` and return their BV ids.

    Result pages are requested one after another, with a random 1-2 second
    pause between requests, until ``max_results`` unique ids have been
    collected, a page comes back empty, the server answers with a non-200
    status, or too many requests in a row have failed.

    Args:
        query: Search keyword.
        max_results: Maximum number of ids to return. The default is kept
            small to make debugging runs quick.

    Returns:
        A list of unique BV id strings in the order they were first seen,
        at most ``max_results`` long.
    """
    if max_results <= 0:
        return []

    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # Placeholder value; replace with a real cookie before running.
        "cookie": "SESSDATA=your_cookie_here"
    }
    # Give up after this many failed requests in a row; without a bound a
    # persistent error (e.g. no network) would keep the loop running forever.
    max_consecutive_errors = 3

    # Used as an insertion-ordered set so duplicates never count towards the
    # max_results target.
    found = OrderedDict()
    page = 1
    consecutive_errors = 0
    while len(found) < max_results:
        params = {'keyword': query, 'search_type': 'video', 'order': 'totalrank',
                  'page': page, 'pagesize': 20}
        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"搜索请求失败: {response.status_code}")
                break
            # json() raises a ValueError subclass when the body is not JSON.
            results = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"搜索出错: {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                break
            # Pause, then retry the same page.
            time.sleep(random.uniform(1, 2))
            continue
        consecutive_errors = 0

        # "data" or "result" may be missing or null; treat both as "no videos".
        data = results.get("data") if isinstance(results, dict) else None
        videos = (data or {}).get("result") or []
        if not videos:
            break

        for video in videos:
            bvid = video.get('bvid')
            if bvid:
                found.setdefault(bvid, None)
        print(f"搜索到 {len(found)} 个 BV")

        if len(found) >= max_results:
            break
        page += 1
        time.sleep(random.uniform(1, 2))

    return list(found)[:max_results]
|
||||
|
||||
|
||||
def download_danmu(index, bv, filename):
    """Download the danmaku of video ``bv`` into ``filename``.

    Args:
        index: Position of the video in the caller's list; accepted but not
            used by this function.
        bv: Video id handed to ``BiliBiliDanMu``.
        filename: Output path handed to ``BiliBiliDanMu``.

    Returns:
        Whatever ``BiliBiliDanMu.crawl`` returns for this video.
    """
    return BiliBiliDanMu(bv, filename).crawl()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: search every query, then download the danmaku of
    # the videos found into one text file per video under output_dir.
    queries = ["大语言模型"]
    output_dir = './output_danmu_LLM_debug/'
    # Upper bound on the number of videos downloaded in one run.
    max_videos = 20
    os.makedirs(output_dir, exist_ok=True)

    bv_list = []
    for q in queries:
        bv_list.extend(search_videos(q))
    # Different queries can return the same video; keep only the first
    # occurrence so nothing is downloaded twice.
    bv_list = list(OrderedDict.fromkeys(bv_list))
    print(f"总计 {len(bv_list)} 个视频待抓取")

    for i, bv in enumerate(bv_list[:max_videos]):
        print(f"\n抓取第{i+1}个视频: {bv}")
        filename = f"{output_dir}第{i+1}个视频_{bv}.txt"
        download_danmu(i, bv, filename)
|
||||
Loading…
Reference in new issue