You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

170 lines
5.9 KiB

import requests
import re
from lxml import etree
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import OrderedDict
class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of one Bilibili video.

    Given a BV id, the crawler resolves the video's internal ``cid`` from the
    watch page, fetches the XML comment feed from comment.bilibili.com and
    writes one comment per line to ``filename``.
    """

    def __init__(self, bv, filename):
        # Accept both "BVxxxx" and the bare "xxxx" suffix.
        if bv.startswith("BV"):
            bv = bv[2:]
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        # Browser-like headers: Bilibili rejects requests without UA/Referer.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/117.0.0.0 Safari/537.36",
            "Referer": "https://www.bilibili.com/",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

    def get_video_cid(self):
        """Return the video's cid as a string, or None after 3 failed attempts."""
        for attempt in range(3):
            try:
                response = requests.get(self.video_url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    html = response.content.decode(errors='ignore')
                    # The cid is embedded in the page's inline JSON.
                    match = re.search(r'"cid":([0-9]+)', html)
                    if match:
                        return match.group(1)
                else:
                    print(f"请求失败,状态码: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"获取 cid 出错: {e}")
            # Back off between attempts, but not after the final one.
            if attempt < 2:
                print(f"{attempt + 1} 次重试获取 cid...")
                time.sleep(1.5)
        return None

    def get_content(self, xml_url):
        """Fetch the raw XML comment feed; return bytes or None on failure."""
        try:
            response = requests.get(xml_url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.content
            print(f"获取弹幕失败,状态码: {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"弹幕请求出错: {e}")
        return None

    def extract_danmu(self, content_str):
        """Parse the XML feed and return the list of comment texts (<d> nodes)."""
        try:
            html = etree.HTML(content_str)
            return html.xpath("//d/text()")
        except Exception as e:
            print(f"解析弹幕出错: {e}")
            return []

    def save(self, save_items):
        """Write one danmaku per line to ``self.filename`` (UTF-8)."""
        # Only create directories when the path actually has a directory part:
        # os.makedirs('') raises FileNotFoundError for a bare filename.
        parent = os.path.dirname(self.filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines([item + '\n' for item in save_items])
        print(f"弹幕已保存: {self.filename} ({len(save_items)} 条)")

    def crawl(self):
        """Run the full pipeline; return True iff danmaku were saved."""
        cid = self.get_video_cid()
        if not cid:
            print(f"无效 BV: {self.video_url}")
            return False
        xml_url = f"http://comment.bilibili.com/{cid}.xml"
        content = self.get_content(xml_url)
        if not content:
            print(f"BV号 {self.video_url.split('/')[-1]} 弹幕请求失败")
            return False
        danmu_list = self.extract_danmu(content)
        if not danmu_list:
            print(f"BV号 {self.video_url.split('/')[-1]} 无弹幕,跳过保存")
            return False
        self.save(danmu_list)
        return True
def search_videos(query, max_results=300):
    """Search Bilibili for *query* and return up to *max_results* unique BV ids.

    Pages through the web search API (50 results per page) until enough ids
    are collected, the results run out, or the API reports an error.
    """
    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # NOTE(review): placeholder credential — the search API needs a real
        # SESSDATA cookie; confirm before running.
        "cookie": "SESSDATA=your_cookie_here"
    }
    bv_list = []
    page = 1
    failures = 0  # consecutive request/parse failures
    while len(bv_list) < max_results:
        params = {'keyword': query, 'search_type': 'video', 'order': 'totalrank',
                  'page': page, 'pagesize': 50}
        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"搜索请求失败: {response.status_code}")
                break
            results = response.json()
            if results['code'] != 0:
                print(f"搜索失败: {results.get('message', '未知错误')}")
                break
            videos = results['data']['result']
            if not videos:
                break
            bv_list += [v['bvid'] for v in videos]
            failures = 0
            print(f"已抓取 {len(bv_list)} 个视频")
        except Exception as e:
            # The original retried forever on persistent errors (bv_list never
            # grows, loop never exits); give up after 3 failures in a row.
            print(f"请求错误: {e}")
            failures += 1
            if failures >= 3:
                break
            time.sleep(random.uniform(1, 3))
        page += 1
        time.sleep(random.uniform(1, 2))
    # Deduplicate while preserving ranking order, then cap the count.
    return list(OrderedDict.fromkeys(bv_list))[:max_results]
def download_danmu(index, bv, filename):
    """Fetch and save the danmaku of one BV id; return True on success.

    *index* is accepted for symmetry with the thread-pool submission but is
    not used here.
    """
    return BiliBiliDanMu(bv, filename).crawl()
def getthread():
    """Fan the downloads out over a 10-worker thread pool and report totals.

    Reads the module-level ``bv_list`` and ``output_dir`` globals set by the
    ``__main__`` block.
    """
    success = 0
    fail = 0
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {}
        for idx, bv in enumerate(bv_list):
            target = f"{output_dir}{idx + 1}个视频_{bv}.txt"
            pending[pool.submit(download_danmu, idx, bv, target)] = bv
        for done in as_completed(pending):
            bv = pending[done]
            try:
                ok = done.result()
            except Exception as e:
                print(f"BV号 {bv} 抓取出错: {e}")
                fail += 1
            else:
                if ok:
                    success += 1
                else:
                    fail += 1
    print(f"\n抓取完成:成功 {success} 个,失败或无弹幕 {fail} 个。")
if __name__ == '__main__':
    # Keywords to search; each yields up to 300 BV ids.
    queries = ["大语言模型",]
    # NOTE: output_dir and bv_list stay at module level — getthread() reads them.
    output_dir = 'E:/Codes/Software_Engineering/bilibili_danmu_crawl/output/danmu_LLM/'
    os.makedirs(output_dir, exist_ok=True)
    all_bv = []
    for keyword in queries:
        print(f"\n正在搜索关键词: {keyword}")
        found = search_videos(keyword)
        all_bv.extend(found)
        print(f"关键词 '{keyword}' 共获取 {len(found)} 个视频")
    # Deduplicate across keywords while keeping first-seen order.
    bv_list = list(OrderedDict.fromkeys(all_bv))
    print(f"\n总计准备抓取 {len(bv_list)} 个视频弹幕...\n")
    getthread()