diff --git a/bilibili_spider.py b/bilibili_spider.py new file mode 100644 index 0000000..7a2ebf6 --- /dev/null +++ b/bilibili_spider.py @@ -0,0 +1,87 @@ +import requests +import re +import json +import time +import random + +class BilibiliDanmakuSpider: + def __init__(self): + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Referer": "https://www.bilibili.com/" + } + self.danmaku_list = [] + + def get_video_ids(self, keyword, page_count=36): + """获取搜索结果的视频ID,每页10个视频,36页共360个""" + video_ids = [] + for page in range(1, page_count + 1): + try: + url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}" + response = requests.get(url, headers=self.headers) + data = json.loads(response.text) + + #if data["code"] == 0 and data["data"]["result"]: + # for item in data["data"]["result"]: + # video_ids.append(item["aid"]) + # 检查接口响应是否正常且有视频数据 + if data["code"] == 0 and data["data"]["result"]: + # 提取当前页的视频ID并显示 + page_video_ids = [item["aid"] for item in data["data"]["result"]] + video_ids.extend(page_video_ids) # 批量添加到总列表 + + # 显示当前页获取到的视频号 + print(f"第{page}页获取到视频ID:{page_video_ids}共{len(page_video_ids)}个") + #else: + # 无数据时提示(非错误,可能是真的没结果) + #print(f"第{page}页未获取到视频数据") + + # 随机延迟,避免被反爬 + time.sleep(random.uniform(1, 3)) + except Exception as e: + print(f"获取第{page}页视频ID失败: {e}") + + return list(set(video_ids))[:360] # 去重并确保最多360个 + + def get_danmakus(self, aid): + """获取单个视频的弹幕""" + try: + # 获取cid + url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}" + response = requests.get(url, headers=self.headers) + cid = json.loads(response.text)["data"]["cid"] + + # 获取弹幕 + danmaku_url = f"https://comment.bilibili.com/{cid}.xml" + response = requests.get(danmaku_url, headers=self.headers) + response.encoding = "utf-8" + + # 提取弹幕内容 + danmakus = re.findall(r'(.*?)', response.text) + self.danmaku_list.extend(danmakus) + + print(f"成功获取视频{aid}的{len(danmakus)}条弹幕") + time.sleep(random.uniform(0.5, 1.5)) + return True + except Exception as e: + print(f"获取视频{aid}弹幕失败: {e}") + return False + + def run(self, keywords=["大语言模型", "大模型", "LLM"]): + """运行爬虫主程序""" + all_video_ids = [] + for keyword in keywords: + print(f"搜索关键词: {keyword}") + video_ids = self.get_video_ids(keyword) + all_video_ids.extend(video_ids) + + # 去重并确保总数不超过360 + unique_video_ids = list(set(all_video_ids))[:360] + print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...") + + for idx, aid in enumerate(unique_video_ids, 1): + print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频") + self.get_danmakus(aid) + + print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕") + return self.danmaku_list \ No newline at end of file