You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
170 lines
5.9 KiB
170 lines
5.9 KiB
import requests
|
|
import re
|
|
from lxml import etree
|
|
import os
|
|
import time
|
|
import random
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from collections import OrderedDict
|
|
|
|
class BiliBiliDanMu:
    """Fetch the danmaku (bullet comments) of one Bilibili video and save them to a text file."""

    def __init__(self, bv, filename):
        """
        :param bv: video id, with or without the leading "BV" prefix
        :param filename: output path for the saved danmaku text file
        """
        # Normalize so the URL always carries exactly one "BV" prefix.
        if bv.startswith("BV"):
            bv = bv[2:]
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        # Browser-like headers; Bilibili rejects requests without a UA/Referer.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/117.0.0.0 Safari/537.36",
            "Referer": "https://www.bilibili.com/",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

    def get_video_cid(self):
        """Scrape the video page for its cid (the comment-stream id).

        Retries up to 3 times with a short backoff.
        :return: the cid as a string, or None if all attempts fail.
        """
        for attempt in range(3):
            try:
                response = requests.get(self.video_url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    html = response.content.decode(errors='ignore')
                    # The page embeds JSON like '"cid":12345'; take the first match.
                    match = re.search(r'"cid":([0-9]+)', html)
                    if match:
                        return match.group(1)
                else:
                    print(f"请求失败,状态码: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"获取 cid 出错: {e}")
            # Bug fix: the original `continue` on a bad status skipped the
            # retry message and backoff sleep; now every failed attempt backs
            # off, and we no longer sleep after the final attempt.
            if attempt < 2:
                print(f"第 {attempt + 1} 次重试获取 cid...")
                time.sleep(1.5)
        return None

    def get_content(self, xml_url):
        """Download the raw danmaku XML.

        :param xml_url: the comment.bilibili.com XML endpoint for this cid
        :return: response body as bytes, or None on any failure.
        """
        try:
            response = requests.get(xml_url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.content
            print(f"获取弹幕失败,状态码: {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"弹幕请求出错: {e}")
        return None

    def extract_danmu(self, content_str):
        """Parse the danmaku XML and return the comment texts.

        :param content_str: XML document as bytes (or str)
        :return: list of danmaku strings; empty list on parse errors.
        """
        try:
            html = etree.HTML(content_str)
            # Each <d> element's text is one danmaku line.
            return html.xpath("//d/text()")
        except Exception as e:
            print(f"解析弹幕出错: {e}")
            return []

    def save(self, save_items):
        """Write danmaku lines to self.filename, one per line, UTF-8 encoded."""
        # Bug fix: os.path.dirname returns '' for a bare filename and
        # os.makedirs('') raises FileNotFoundError — only create the parent
        # directory when there actually is one.
        parent = os.path.dirname(self.filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines([item + '\n' for item in save_items])
        print(f"弹幕已保存: {self.filename} ({len(save_items)} 条)")

    def crawl(self):
        """Run the full pipeline: cid lookup -> XML download -> parse -> save.

        :return: True when danmaku were saved, False otherwise.
        """
        cid = self.get_video_cid()
        if not cid:
            print(f"无效 BV: {self.video_url}")
            return False

        xml_url = f"http://comment.bilibili.com/{cid}.xml"
        content = self.get_content(xml_url)
        if not content:
            print(f"BV号 {self.video_url.split('/')[-1]} 弹幕请求失败")
            return False

        danmu_list = self.extract_danmu(content)
        if not danmu_list:
            print(f"BV号 {self.video_url.split('/')[-1]} 无弹幕,跳过保存")
            return False

        self.save(danmu_list)
        return True
|
|
|
|
|
|
def search_videos(query, max_results=300):
    """Search Bilibili for videos matching *query*.

    Pages through the web search API (50 results per page, ranked by
    relevance) until *max_results* ids are collected, the API reports an
    error, results run out, or three consecutive requests fail.

    :param query: search keyword
    :param max_results: cap on the number of BV ids returned
    :return: list of unique BV ids in first-seen order, length <= max_results
    """
    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # NOTE(review): placeholder cookie — the search API needs a real
        # SESSDATA to return results; confirm before running.
        "cookie": "SESSDATA=your_cookie_here"
    }

    bv_list = []
    page = 1
    consecutive_errors = 0
    while len(bv_list) < max_results:
        params = {'keyword': query, 'search_type': 'video', 'order': 'totalrank',
                  'page': page, 'pagesize': 50}
        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"搜索请求失败: {response.status_code}")
                break
            results = response.json()
            if results['code'] != 0:
                print(f"搜索失败: {results.get('message', '未知错误')}")
                break
            # Bug fix: guard against a missing 'data'/'result' key instead of
            # crashing with KeyError on malformed API responses.
            videos = results.get('data', {}).get('result') or []
            if not videos:
                break
            bv_list += [v['bvid'] for v in videos]
            print(f"已抓取 {len(bv_list)} 个视频")
            consecutive_errors = 0
        except Exception as e:
            # Bug fix: the original looped forever on persistent network
            # errors (nothing ever broke out of the while); give up after
            # three failures in a row.
            consecutive_errors += 1
            print(f"请求错误: {e}")
            if consecutive_errors >= 3:
                break
            time.sleep(random.uniform(1, 3))
        page += 1
        # Polite jitter between pages to avoid rate limiting.
        time.sleep(random.uniform(1, 2))

    # De-duplicate while preserving first-seen order, then cap the length.
    return list(OrderedDict.fromkeys(bv_list))[:max_results]
|
|
|
|
|
|
def download_danmu(index, bv, filename):
    """Crawl one video's danmaku into *filename*.

    *index* is accepted for signature symmetry with the thread-pool caller
    but is not used here.

    :return: True when danmaku were downloaded and saved, False otherwise.
    """
    return BiliBiliDanMu(bv, filename).crawl()
|
|
|
|
|
|
def getthread(bvs=None, out_dir=None):
    """Download danmaku for every BV id concurrently (10 worker threads).

    Backward-compatible generalization: the original read the module globals
    directly; callers may now pass the work list and output directory
    explicitly instead.

    :param bvs: iterable of BV ids; defaults to the module-level ``bv_list``
    :param out_dir: output directory prefix (ends with a path separator);
        defaults to the module-level ``output_dir``
    Prints a success/failure summary when all downloads complete.
    """
    if bvs is None:
        bvs = bv_list
    if out_dir is None:
        out_dir = output_dir

    success, fail = 0, 0
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its BV id for error reporting.
        futures = {
            executor.submit(download_danmu, i, bv, f"{out_dir}第{i + 1}个视频_{bv}.txt"): bv
            for i, bv in enumerate(bvs)
        }
        for future in as_completed(futures):
            bv = futures[future]
            try:
                if future.result():
                    success += 1
                else:
                    fail += 1
            except Exception as e:
                print(f"BV号 {bv} 抓取出错: {e}")
                fail += 1

    print(f"\n抓取完成:成功 {success} 个,失败或无弹幕 {fail} 个。")
|
|
|
|
|
|
if __name__ == '__main__':
    # Search keywords; each contributes its own batch of BV ids.
    keywords = ["大语言模型",]
    output_dir = 'E:/Codes/Software_Engineering/bilibili_danmu_crawl/output/danmu_LLM/'
    os.makedirs(output_dir, exist_ok=True)

    collected = []
    for keyword in keywords:
        print(f"\n正在搜索关键词: {keyword}")
        found = search_videos(keyword)
        collected.extend(found)
        print(f"关键词 '{keyword}' 共获取 {len(found)} 个视频")

    # Drop duplicates across keywords while keeping first-seen order;
    # getthread() reads this module-level list.
    bv_list = list(OrderedDict.fromkeys(collected))
    print(f"\n总计准备抓取 {len(bv_list)} 个视频弹幕...\n")

    getthread()
|