# -*- coding: utf-8 -*-
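"""Crawl Bilibili danmaku (bullet comments) for the top search results of a keyword.

Workflow (summarized from the code below): search for videos by keyword, collect
their BV IDs, resolve each video's CID, download the danmaku XML, filter out
noise, and save the results to text files.
"""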
import requests
from bs4 import BeautifulSoup
import time
import os
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class BilibiliDanmakuCrawler:
    def __init__(self, keyword, video_count, save_dir):
        self.keyword = keyword
        self.video_count = video_count
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)
        # Session with retry logic (to ride out transient network errors)
        self.session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[412, 429, 500, 502, 503, 504]
        )
        self.session.mount('http://', HTTPAdapter(max_retries=retry))
        self.session.mount('https://', HTTPAdapter(max_retries=retry))
        # Request headers; replace the Cookie value with your own
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
            "Referer": "https://search.bilibili.com/",
            "Cookie": "buvid3=E8B6A22C-2C45-4243-1A50-0B7887C84A4500588infoc; rpdid=|(u))kkYu|um0J'u~k)~mYm|u; b_nut=100; b_lsid=BD6D42CE_19A76AED18B; bsource=search_sougo; _uuid=BD2459B10-A5107-5424-57D5-D3E3C2510222859761infoc; buvid_fp=01d6b98373d0ee5ae25897960af410a8; home_feed_column=5; browser_resolution=1699-941; buvid4=FD45C162-3221-ACF9-4FB1-19AE076778C860860-025111214-xHcqiQBrPSLcJVMwvAmRJw%3D%3D; CURRENT_QUALITY=0; csrf_state=e8a98437773776f3420329dc3758a34c; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjMxOTAwMzEsImlhdCI6MTc2MjkzMDc3MSwicGx0IjotMX0.wLjhNhZSAz-VJZCGAEltHftitM-90-C1nSRr_JtoVII; bili_ticket_expires=1763189971; SESSDATA=36fd0664%2C1778482862%2Ce1d71%2Ab1CjBUqeAMzYv1RDPYIrJSzYNj9v_TLDaM3RXdELRfLIrNqKJHA7i5yvvzwA3AiDKvw4ISVkN3Q0lpNFcwU2pLNk9ZakczZklvOWsxTWZvcm9KMzNBR1R1d05kOTcwU1BSSWY1RVdBWmtmUHY3VURDektoVHNJVTVFSF9vQlFmSlY4ZDRERkFfaF9nIIEC; bili_jct=3380c36097411cca708626cb4e7a81d6; DedeUserID=409262887; DedeUserID__ckMd5=234150aa4a4b4661; bp_t_offset_409262887=1134268504589991936; theme-tip-show=SHOWED; CURRENT_FNVAL=4048; sid=5uxkxypo"
        }
    def get_top_bv_ids(self):
        """Fetch the list of video BV IDs from the search API."""
        bv_list = []
        page = 1
        max_pages = 20  # maximum number of search pages to scan
        print(f"Fetching BV IDs for videos matching '{self.keyword}'...")
        while len(bv_list) < self.video_count and page <= max_pages:
            api_url = (
                f"https://api.bilibili.com/x/web-interface/search/type"
                f"?keyword={self.keyword}&search_type=video&order=totalrank&page={page}"
            )
            try:
                resp = self.session.get(api_url, headers=self.headers, timeout=15)
                resp.raise_for_status()
                data = resp.json()
                if data.get("code") != 0:
                    print(f"Page {page} returned an error: {data.get('message')}")
                    page += 1
                    time.sleep(2)
                    continue
                # Parse BV IDs and deduplicate
                new_count = 0
                for item in data["data"]["result"]:
                    bv = item.get("bvid")
                    if bv and bv not in bv_list:
                        bv_list.append(bv)
                        new_count += 1
                        if len(bv_list) >= self.video_count:
                            break
                print(f"Page {page} done: {new_count} new BV IDs, {len(bv_list)} total")
                page += 1
                time.sleep(1.5)
            except Exception as e:
                print(f"Page {page} request failed: {e}, retrying...")
                time.sleep(3)
        # Save the BV IDs
        with open(f"{self.save_dir}/bv_list.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(bv_list))
        print(f"Finished fetching BV IDs: {len(bv_list)} collected")
        return bv_list
    def get_cid_by_bv(self, bv):
        """Resolve a video's CID from its BV ID."""
        try:
            url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv}"
            resp = self.session.get(url, headers=self.headers, timeout=15)
            return resp.json()["data"]["cid"]
        except Exception as e:
            print(f"Failed to get CID for BV ID '{bv}': {e}")
            return None
    def crawl_danmaku(self, cid):
        """Fetch the danmaku XML for a CID and filter out noise."""
        if not cid:
            return []
        try:
            url = f"https://comment.bilibili.com/{cid}.xml"
            resp = self.session.get(url, headers=self.headers, timeout=15)
            resp.encoding = "utf-8"
            # The "xml" parser requires lxml to be installed
            soup = BeautifulSoup(resp.text, "xml")
            danmu_tags = soup.find_all("d")
            # Filtering rules: drop one-character comments and common spam phrases
            noise = {"666", "哈哈哈", "点赞", "投币", "收藏", "打卡", "来了"}
            valid = [
                tag.text.strip() for tag in danmu_tags
                if tag.text.strip() and len(tag.text.strip()) > 1
                and not any(n in tag.text.strip() for n in noise)
            ]
            return valid
        except Exception as e:
            print(f"Failed to crawl danmaku for CID '{cid}': {e}")
            return []
    def run(self):
        """Run the full crawl: collect BV IDs, resolve CIDs, fetch danmaku."""
        bv_list = self.get_top_bv_ids()
        all_danmu = []
        for i, bv in enumerate(bv_list, 1):
            print(f"Processing video {i}/{len(bv_list)} (BV ID: {bv})")
            cid = self.get_cid_by_bv(bv)
            all_danmu.extend(self.crawl_danmaku(cid))
            time.sleep(1)
        # Save all collected danmaku
        with open(f"{self.save_dir}/all_danmu.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(all_danmu))
        print(f"Crawl finished: {len(all_danmu)} valid danmaku collected")
        return all_danmu
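

# Example usage: a minimal sketch, not part of the original script. The keyword,
# video count, and output directory below are illustrative placeholder values.
if __name__ == "__main__":
    crawler = BilibiliDanmakuCrawler(
        keyword="example keyword",   # assumed placeholder search term
        video_count=10,              # assumed number of videos to crawl
        save_dir="danmu_output",     # assumed output directory
    )
    crawler.run()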