From 0a4e2850756642b5008057842fae0c2b4eb8cd14 Mon Sep 17 00:00:00 2001
From: pmgp6jfbh <1072906427@qq.com>
Date: Thu, 13 Nov 2025 15:44:45 +0800
Subject: [PATCH] ADD file via upload

---
 crawler.py | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..53a9ae3
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+import time
+import os
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+class BilibiliDanmakuCrawler:
+    def __init__(self, keyword, video_count, save_dir):
+        self.keyword = keyword
+        self.video_count = video_count
+        self.save_dir = save_dir
+        os.makedirs(save_dir, exist_ok=True)
+
+        # session with a retry policy (smooths over transient network errors)
+        self.session = requests.Session()
+        retry = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[412, 429, 500, 502, 503, 504]
+        )
+        self.session.mount('http://', HTTPAdapter(max_retries=retry))
+        self.session.mount('https://', HTTPAdapter(max_retries=retry))
+
+        # request headers (paste the Cookie of your own logged-in session)
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+            "Referer": "https://search.bilibili.com/",
+            "Cookie": "YOUR_COOKIE_HERE"
+        }
+
+    def get_top_bv_ids(self):
+        """Collect BV IDs of videos matching the keyword."""
+        bv_list = []
+        page = 1
+        max_pages = 20  # hard upper bound on search pages
+        print(f"Fetching BV IDs for videos matching '{self.keyword}'...")
+
+        while len(bv_list) < self.video_count and page <= max_pages:
+            api_url = "https://api.bilibili.com/x/web-interface/search/type"
+            # pass the keyword via params so requests URL-encodes it properly
+            params = {
+                "keyword": self.keyword,
+                "search_type": "video",
+                "order": "totalrank",
+                "page": page
+            }
+            try:
+                resp = self.session.get(api_url, params=params,
+                                        headers=self.headers, timeout=15)
+                resp.raise_for_status()
+                data = resp.json()
+
+                if data.get("code") != 0:
+                    print(f"Page {page} error: {data.get('message')}")
+                    page += 1
+                    time.sleep(2)
+                    continue
+
+                # parse BV IDs, skipping duplicates
+                new_count = 0
+                for item in data["data"]["result"]:
+                    bv = item.get("bvid")
+                    if bv and bv not in bv_list:
+                        bv_list.append(bv)
+                        new_count += 1
+                    if len(bv_list) >= self.video_count:
+                        break
+
+                print(f"Page {page} done: {new_count} new, {len(bv_list)} total")
+                page += 1
+                time.sleep(1.5)
+
+            except Exception as e:
+                # the session already retried; skip this page instead of looping on it
+                print(f"Page {page} request failed: {e}, skipping...")
+                page += 1
+                time.sleep(3)
+
+        # persist the BV IDs for reuse
+        with open(f"{self.save_dir}/bv_list.txt", "w", encoding="utf-8") as f:
+            f.write("\n".join(bv_list))
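+        # optional safeguard (assumed useful, not part of the original flow):
+        # warn if the search ran out of pages before reaching the target count
+        if len(bv_list) < self.video_count:
+            print(f"Warning: only {len(bv_list)} of {self.video_count} requested videos found")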
print(f"BV号获取完成,共{len(bv_list)}个") + return bv_list + + def get_cid_by_bv(self, bv): + """通过BV号获取CID""" + try: + url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv}" + resp = self.session.get(url, headers=self.headers, timeout=15) + return resp.json()["data"]["cid"] + except Exception as e: + print(f"BV号「{bv}」获取CID失败:{str(e)}") + return None + + def crawl_danmaku(self, cid): + """爬取弹幕并过滤噪声""" + if not cid: + return [] + try: + url = f"https://comment.bilibili.com/{cid}.xml" + resp = self.session.get(url, headers=self.headers, timeout=15) + resp.encoding = "utf-8" + soup = BeautifulSoup(resp.text, "xml") + danmu_tags = soup.find_all("d") + + # 过滤规则 + noise = {"666", "哈哈哈", "点赞", "投币", "收藏", "打卡", "来了"} + valid = [ + tag.text.strip() for tag in danmu_tags + if tag.text.strip() and len(tag.text.strip()) > 1 + and not any(n in tag.text.strip() for n in noise) + ] + return valid + except Exception as e: + print(f"CID「{cid}」爬取失败:{str(e)}") + return [] + + def run(self): + """执行爬取流程""" + bv_list = self.get_top_bv_ids() + all_danmu = [] + + for i, bv in enumerate(bv_list, 1): + print(f"处理第{i}/{len(bv_list)}个视频(BV:{bv})") + cid = self.get_cid_by_bv(bv) + all_danmu.extend(self.crawl_danmaku(cid)) + time.sleep(1) + + # 保存弹幕 + with open(f"{self.save_dir}/all_danmu.txt", "w", encoding="utf-8") as f: + f.write("\n".join(all_danmu)) + print(f"爬取完成,共{len(all_danmu)}条有效弹幕") + return all_danmu \ No newline at end of file