From 6cd9c0cfaad659d5584d5604c18fe8bdf9e299e5 Mon Sep 17 00:00:00 2001 From: fzu102301136 <3225314707@qq.com> Date: Sun, 16 Nov 2025 21:43:00 +0800 Subject: [PATCH] Delete 'bilibili_spider.py' --- bilibili_spider.py | 87 ---------------------------------------------- 1 file changed, 87 deletions(-) delete mode 100644 bilibili_spider.py diff --git a/bilibili_spider.py b/bilibili_spider.py deleted file mode 100644 index 7a2ebf6..0000000 --- a/bilibili_spider.py +++ /dev/null @@ -1,87 +0,0 @@ -import requests -import re -import json -import time -import random - -class BilibiliDanmakuSpider: - def __init__(self): - self.headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Referer": "https://www.bilibili.com/" - } - self.danmaku_list = [] - - def get_video_ids(self, keyword, page_count=36): - """获取搜索结果的视频ID,每页10个视频,36页共360个""" - video_ids = [] - for page in range(1, page_count + 1): - try: - url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}" - response = requests.get(url, headers=self.headers) - data = json.loads(response.text) - - #if data["code"] == 0 and data["data"]["result"]: - # for item in data["data"]["result"]: - # video_ids.append(item["aid"]) - # 检查接口响应是否正常且有视频数据 - if data["code"] == 0 and data["data"]["result"]: - # 提取当前页的视频ID并显示 - page_video_ids = [item["aid"] for item in data["data"]["result"]] - video_ids.extend(page_video_ids) # 批量添加到总列表 - - # 显示当前页获取到的视频号 - print(f"第{page}页获取到视频ID:{page_video_ids}共{len(page_video_ids)}个") - #else: - # 无数据时提示(非错误,可能是真的没结果) - #print(f"第{page}页未获取到视频数据") - - # 随机延迟,避免被反爬 - time.sleep(random.uniform(1, 3)) - except Exception as e: - print(f"获取第{page}页视频ID失败: {e}") - - return list(set(video_ids))[:360] # 去重并确保最多360个 - - def get_danmakus(self, aid): - """获取单个视频的弹幕""" - try: - # 获取cid - url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}" - response = requests.get(url, headers=self.headers) - cid = json.loads(response.text)["data"]["cid"] - - # 获取弹幕 - danmaku_url = f"https://comment.bilibili.com/{cid}.xml" - response = requests.get(danmaku_url, headers=self.headers) - response.encoding = "utf-8" - - # 提取弹幕内容 - danmakus = re.findall(r'(.*?)', response.text) - self.danmaku_list.extend(danmakus) - - print(f"成功获取视频{aid}的{len(danmakus)}条弹幕") - time.sleep(random.uniform(0.5, 1.5)) - return True - except Exception as e: - print(f"获取视频{aid}弹幕失败: {e}") - return False - - def run(self, keywords=["大语言模型", "大模型", "LLM"]): - """运行爬虫主程序""" - all_video_ids = [] - for keyword in keywords: - print(f"搜索关键词: {keyword}") - video_ids = self.get_video_ids(keyword) - all_video_ids.extend(video_ids) - - # 去重并确保总数不超过360 - unique_video_ids = list(set(all_video_ids))[:360] - print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...") - - for idx, aid in enumerate(unique_video_ids, 1): - print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频") - self.get_danmakus(aid) - - print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕") - return self.danmaku_list \ No newline at end of file