|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
from lxml import etree
|
|
|
|
|
import os
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
from collections import OrderedDict
|
|
|
|
|
|
|
|
|
|
class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of a single Bilibili video.

    The workflow implemented by :meth:`crawl` is: request the video page,
    extract the numeric ``cid`` from the page source, download
    ``comment.bilibili.com/<cid>.xml`` and write the text of every ``<d>``
    element to ``filename``, one comment per line.
    """

    def __init__(self, bv, filename):
        """Store the video URL, the output path and the request headers.

        Args:
            bv: Video id, with or without the leading ``"BV"`` prefix.
            filename: Path of the text file the comments are written to.
        """
        # Strip an existing prefix so the URL below carries "BV" exactly once.
        if bv.startswith("BV"):
            bv = bv[2:]
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
            "Referer": "https://www.bilibili.com/",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

    def get_video_cid(self):
        """Return the video's ``cid`` as a string of digits, or ``None``.

        The video page is requested up to three times. Every kind of failure
        (network error, non-200 status, page without a ``"cid":<digits>``
        fragment) is reported on stdout and followed by a 2 second pause
        before the next attempt; no pause is made after the final attempt.
        """
        retry_count = 3
        for attempt in range(retry_count):
            try:
                response = requests.get(self.video_url, headers=self.headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"获取 cid 时出错: {e}")
            else:
                if response.status_code != 200:
                    print(f"请求失败,状态码: {response.status_code}")
                else:
                    # errors="replace" keeps one undecodable byte in the page
                    # from aborting the whole lookup with UnicodeDecodeError.
                    html = response.content.decode("utf-8", errors="replace")
                    match = re.search(r'"cid":([0-9]+)', html)
                    if match:
                        return match.group(1)
                    print("未找到 cid")

            # Back off before retrying, whatever the reason for the failure.
            if attempt + 1 < retry_count:
                print(f"第 {attempt + 1} 次重试获取 cid...")
                time.sleep(2)
        return None

    def get_content(self, xml_url):
        """Download the danmaku XML document.

        Args:
            xml_url: URL of the XML file to fetch.

        Returns:
            The raw response body (bytes) on HTTP 200, otherwise ``None``.
            Request errors are printed and also yield ``None``.
        """
        try:
            response = requests.get(xml_url, headers=self.headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"获取弹幕时出错: {e}")
            return None

        if response.status_code == 200:
            return response.content
        print(f"获取弹幕内容失败,状态码: {response.status_code}")
        return None

    def extract_danmu(self, content_str):
        """Return the text of every ``<d>`` element found in ``content_str``.

        The document is parsed with ``etree.HTML`` and queried with the XPath
        ``//d/text()``. Any exception raised while parsing or querying is
        printed and an empty list is returned instead.
        """
        try:
            html = etree.HTML(content_str)
            danmu_list = html.xpath("//d/text()")
            return danmu_list
        except Exception as e:
            print(f"解析弹幕时出错: {e}")
            return []

    def save(self, save_items):
        """Write ``save_items`` to ``self.filename``, one item per line.

        The parent directory is created when missing. The file is opened in
        ``'w'`` mode with UTF-8 encoding, so existing content is replaced.
        """
        output_dir = os.path.dirname(self.filename)
        # dirname() is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines(f"{item}\n" for item in save_items)
        print(f"弹幕已保存至 {self.filename}")

    def crawl(self):
        """Run the full pipeline: cid lookup, XML download, parse, save.

        Nothing is written when the cid cannot be determined or when the XML
        download fails or returns an empty body.
        """
        cid = self.get_video_cid()
        if cid is None:
            print("视频没有有效的 cid,跳过此视频")
            return

        xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
        content_str = self.get_content(xml_url)
        if content_str:
            danmu_lst = self.extract_danmu(content_str)
            self.save(danmu_lst)
|
|
|
|
|
|
|
|
|
|
def search_videos(query, max_results=350, max_retries=5):
    """Search Bilibili for videos and return their BV ids.

    Pages of 50 results are requested from the web search API, ordered by
    ``totalrank``, until ``max_results`` distinct ids have been collected,
    the API returns an empty page, or an unrecoverable error occurs.

    Args:
        query: Search keyword.
        max_results: Upper bound on the number of ids returned.
        max_retries: How many consecutive recoverable failures (network
            errors, undecodable JSON, rate-limit responses) are tolerated
            before giving up, so the loop cannot spin forever.

    Returns:
        A list of unique BV id strings in the order the API returned them,
        at most ``max_results`` long. Errors are printed, never raised.
    """
    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # Original author's note: a Cookie header just has to be present,
        # its content is arbitrary.
        "cookie": "your cookie"
    }

    bv_list = []
    seen = set()  # ids already collected, so duplicates never count
    page = 1
    failures = 0  # consecutive recoverable failures

    while len(bv_list) < max_results:
        params = {
            'keyword': query,
            'search_type': 'video',
            'order': 'totalrank',
            'page': page,
            'pagesize': 50
        }

        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"搜索请求失败,状态码: {response.status_code}")
                break
            results = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"请求失败,错误: {e}")
            failures += 1
            if failures > max_retries:
                break
            time.sleep(random.uniform(2, 5))
            continue

        code = results.get('code')
        if code != 0:
            print(f"搜索失败,错误代码: {code},错误信息: {results.get('message', '无详细信息')}")
            # A message containing '频繁' is treated as rate limiting: wait
            # and retry the same page, but only a bounded number of times.
            if '频繁' in str(results.get('message') or ''):
                failures += 1
                if failures <= max_retries:
                    print("限流,等待后重试")
                    time.sleep(random.uniform(5, 10))
                    continue
            break

        failures = 0
        # 'data' or 'result' may be absent; treat that as an empty page.
        videos = (results.get('data') or {}).get('result') or []
        if not videos:
            break

        added = 0
        for video in videos:
            bvid = video.get('bvid')
            if bvid and bvid not in seen:
                seen.add(bvid)
                bv_list.append(bvid)
                added += 1
        print(f"已抓取 {len(bv_list)} 个视频")

        # A page that contributes nothing new means further paging is futile.
        if added == 0:
            break

        page += 1
        time.sleep(random.uniform(1, 3))  # throttle to avoid being blocked

    return bv_list[:max_results]
|
|
|
|
|
|
|
|
|
|
def download_danmu(index, bv, filename):
    """Fetch the danmaku of one video and store it in ``filename``.

    Args:
        index: Position of the video in the work list; accepted for the
            caller's convenience but not used here.
        bv: BV id of the video.
        filename: Destination text file.
    """
    BiliBiliDanMu(bv, filename).crawl()
|
|
|
|
|
|
|
|
|
|
def getthread():
    """Download the danmaku of every video concurrently.

    Reads the module-level names ``bv_list`` (ids to process) and
    ``output_dir`` (path prefix of the output files), submits one
    ``download_danmu`` task per id to a pool of 10 worker threads and prints
    a line per task as it finishes, reporting success or the raised error.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Map each submitted task back to its BV id for reporting.
        pending = {}
        for position, video_id in enumerate(bv_list):
            target = f'{output_dir}第{position + 1}个视频_{video_id}.txt'
            task = pool.submit(download_danmu, position, video_id, target)
            pending[task] = video_id

        for task in as_completed(pending):
            bv = pending[task]
            try:
                # result() re-raises whatever the worker raised.
                task.result()
            except Exception as exc:
                print(f"BV号 {bv} 的弹幕抓取时出错: {exc}")
            else:
                print(f"BV号 {bv} 的弹幕抓取完成")
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ask for a keyword and keep at most the first 300 matching videos.
    query = input("请输入搜索关键词: ")
    bv_list = search_videos(query)[:300]

    # getthread() reads the module-level names bv_list and output_dir,
    # so both must be bound before it is called.
    output_dir = 'E:/前端/软件工程/弹幕收集bark/'
    os.makedirs(output_dir, exist_ok=True)

    getthread()
|