ADD file via upload

5 months ago · 6403703cf4
parent c6c4b8456e
commit 6403703cf4
1 changed files with 87 additions and 0 deletions
--- a/bilibili_spider.py
+++ b/bilibili_spider.py
@@ -0,0 +1,87 @@
+import requests
+import re
+import json
+import time
+import random
+
+class BilibiliDanmakuSpider:
+    """Crawl danmaku (bullet comments) of Bilibili videos found by keyword search.
+
+    Typical use is ``BilibiliDanmakuSpider().run()``: search videos for each
+    keyword, resolve every video's ``cid`` and download the XML comment feed
+    of that ``cid``. Comment texts accumulate in ``self.danmaku_list`` across
+    calls. The crawl is best effort: a failed request is reported with
+    ``print`` and skipped instead of aborting the run.
+    """
+
+    SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/type"
+    VIEW_URL = "https://api.bilibili.com/x/web-interface/view"
+    DANMAKU_URL = "https://comment.bilibili.com/{cid}.xml"
+    # Tuple, not list: a mutable default would be shared between calls.
+    DEFAULT_KEYWORDS = ("大语言模型", "大模型", "LLM")
+    # Seconds before a request is abandoned; without an explicit timeout
+    # requests waits indefinitely on a stalled connection.
+    REQUEST_TIMEOUT = 10
+    # One <d ...>text</d> element. DOTALL keeps comments containing newlines.
+    _DANMAKU_RE = re.compile(r"<d\b[^>]*>(.*?)</d>", re.DOTALL)
+
+    def __init__(self):
+        # Browser-like headers sent with every request.
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Referer": "https://www.bilibili.com/"
+        }
+        # Comment texts of every successfully crawled video.
+        self.danmaku_list = []
+
+    @staticmethod
+    def _unique_in_order(items, limit=None):
+        """Drop duplicates keeping first occurrences in order; keep at most ``limit``."""
+        return list(dict.fromkeys(items))[:limit]
+
+    @staticmethod
+    def _interleave(id_lists):
+        """Merge lists rank by rank: all first items, then all second items, ..."""
+        merged = []
+        longest = max((len(ids) for ids in id_lists), default=0)
+        for rank in range(longest):
+            merged.extend(ids[rank] for ids in id_lists if rank < len(ids))
+        return merged
+
+    @classmethod
+    def _extract_danmakus(cls, xml_text):
+        """Return the text of every ``<d>`` element, with XML entities decoded."""
+        # Local import so the class does not depend on a new file-level import.
+        from html import unescape
+        return [unescape(text) for text in cls._DANMAKU_RE.findall(xml_text)]
+
+    def _get(self, url, params=None):
+        """GET ``url`` with the spider's headers and timeout; raise on HTTP errors."""
+        response = requests.get(
+            url,
+            headers=self.headers,
+            params=params,  # requests URL-encodes the values
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response
+
+    def _get_api_data(self, url, params):
+        """Return the ``data`` member of a JSON API reply; raise if ``code`` != 0."""
+        payload = self._get(url, params=params).json()
+        if payload.get("code") != 0:
+            raise RuntimeError(
+                f"code={payload.get('code')}, message={payload.get('message')}"
+            )
+        return payload.get("data") or {}
+
+    def get_video_ids(self, keyword, page_count=36, max_videos=360):
+        """Collect video ids (``aid``) from the search results for ``keyword``.
+
+        The defaults come from the original assumption of 10 videos per page
+        (36 pages -> 360 videos); the real page size is decided by the API.
+
+        Args:
+            keyword: Search term.
+            page_count: Number of result pages to request, starting at page 1.
+            max_videos: Upper bound on returned ids; ``None`` means no bound.
+
+        Returns:
+            Unique ids in the order the search returned them, at most
+            ``max_videos`` long. Pages that fail are reported and skipped.
+        """
+        collected = {}  # dict used as an insertion-ordered set
+        for page in range(1, page_count + 1):
+            try:
+                data = self._get_api_data(
+                    self.SEARCH_URL,
+                    {"keyword": keyword, "search_type": "video", "page": page},
+                )
+                page_video_ids = [item["aid"] for item in data.get("result") or []]
+            except Exception as e:  # best effort: report, continue with next page
+                print(f"获取第{page}页视频ID失败: {e}")
+            else:
+                if page_video_ids:
+                    collected.update(dict.fromkeys(page_video_ids))
+                    print(f"第{page}页获取到视频ID：{page_video_ids}共{len(page_video_ids)}个")
+
+            # Random delay against anti-crawling measures; also applied after
+            # a failure so a struggling server is not hit again immediately.
+            time.sleep(random.uniform(1, 3))
+
+            # Further pages cannot change the result once the cap is reached.
+            if max_videos is not None and len(collected) >= max_videos:
+                break
+
+        return list(collected)[:max_videos]
+
+    def get_danmakus(self, aid):
+        """Download the danmaku of one video and append them to ``danmaku_list``.
+
+        Args:
+            aid: Video id as returned by ``get_video_ids``.
+
+        Returns:
+            ``True`` on success, ``False`` if any step failed (the error is
+            printed and ``danmaku_list`` is left unchanged).
+        """
+        try:
+            # The comment feed is addressed by cid, which the view API provides.
+            cid = self._get_api_data(self.VIEW_URL, {"aid": aid})["cid"]
+
+            response = self._get(self.DANMAKU_URL.format(cid=cid))
+            response.encoding = "utf-8"
+            danmakus = self._extract_danmakus(response.text)
+        except Exception as e:  # best effort: report and let the caller go on
+            print(f"获取视频{aid}弹幕失败: {e}")
+            return False
+        else:
+            self.danmaku_list.extend(danmakus)
+            print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
+            return True
+        finally:
+            # Pace requests after success and failure alike.
+            time.sleep(random.uniform(0.5, 1.5))
+
+    def run(self, keywords=None, max_videos=360):
+        """Search every keyword, then crawl the danmaku of the videos found.
+
+        Args:
+            keywords: Iterable of search terms or a single string; defaults
+                to ``DEFAULT_KEYWORDS``.
+            max_videos: Upper bound on the number of videos crawled in total;
+                ``None`` means no bound.
+
+        Returns:
+            ``self.danmaku_list``, including entries from earlier calls.
+        """
+        if keywords is None:
+            keywords = self.DEFAULT_KEYWORDS
+        elif isinstance(keywords, str):
+            keywords = [keywords]  # a bare string would be iterated per character
+
+        per_keyword_ids = []
+        for keyword in keywords:
+            print(f"搜索关键词: {keyword}")
+            per_keyword_ids.append(self.get_video_ids(keyword, max_videos=max_videos))
+
+        # Interleave before capping so every keyword contributes its top-ranked
+        # videos; a set would pick the survivors in arbitrary hash order.
+        unique_video_ids = self._unique_in_order(
+            self._interleave(per_keyword_ids), max_videos
+        )
+        print(f"共获取{len(unique_video_ids)}个视频ID，开始爬取弹幕...")
+
+        for idx, aid in enumerate(unique_video_ids, 1):
+            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
+            self.get_danmakus(aid)
+
+        print(f"爬取完成，共获取{len(self.danmaku_list)}条弹幕")
+        return self.danmaku_list