Update 弹幕.py

main
fzu102301128 5 months ago
parent 5b1c36ac00
commit b193430ef3

@@ -1,121 +1,156 @@
import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Global configuration for the danmaku crawl/analysis pipeline.
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"          # Excel report output path
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"    # word-cloud image output path
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"      # CJK-capable font (Windows-only path)
KEYWORDS = ["大语言模型", "大模型", "LLM"] # search keywords ("large language model" variants)
MAX_VIDEOS_PER_KEYWORD = 120 # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360 # overall cap: 3 keywords × 120 = 360 videos
# SECURITY NOTE(review): hardcoded session credentials committed to source
# control — these should be rotated and loaded from the environment instead.
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
} # cookies so requests are not rejected as anonymous crawler traffic
global_session = requests.Session() # shared Session for connection reuse
# 模块1AID获取+弹幕爬取
def fetch_danmakus(aid):
    """Fetch all danmakus (bullet comments) of a single video by its AID.

    Parameters
    ----------
    aid : int | str
        Bilibili video AID.

    Returns
    -------
    list[str]
        Stripped, non-empty danmaku strings. Best-effort: any failure is
        printed and an empty list is returned, so one bad video does not
        abort a batch crawl.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    try:
        # Resolve the CID (required parameter of the danmaku endpoint).
        cid_resp = global_session.get(
            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        cid_resp.raise_for_status()
        cid = cid_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []
        # Download the danmaku document (XML format).
        danmaku_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        danmaku_resp.encoding = "utf-8"
        danmaku_resp.raise_for_status()
        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")
        # Strip each <d> entry once (the original stripped every entry
        # twice: once in the filter and once in the result expression).
        raw_danmus = [t for t in (d.text.strip() for d in soup.find_all("d")) if t]
        print(f"aid={aid} 爬取完成,原始弹幕{len(raw_danmus)}")
        return raw_danmus
    except Exception as e:
        # Deliberate broad catch: log and continue the batch.
        print(f"aid={aid} 爬取失败:{str(e)[:50]}")
        return []
def get_top_videos_aids(keyword, max_videos=120):
    """Return AIDs of the top-ranked search results for *keyword*.

    Pages through Bilibili's search API (comprehensive ranking), stopping
    after 5 pages, an API error, an empty page, or once *max_videos* AIDs
    have been collected.

    Parameters
    ----------
    keyword : str
        Search keyword.
    max_videos : int, optional
        Upper bound on the number of AIDs returned (default 120).

    Returns
    -------
    list[str]
        Deduplicated AIDs in ranking order.
    """
    aids = []
    seen = set()  # O(1) membership; `aids` keeps ranking order
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # comprehensive ranking
        }
        try:
            time.sleep(1.5 + random.random())  # random delay to dodge anti-crawler limits
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break
            # Bug fix: dedupe while PRESERVING ranking order. The original
            # `aids = list(set(aids))` scrambled the order, so the final
            # `aids[:max_videos]` cut kept an arbitrary subset instead of
            # the top-ranked videos.
            for v in video_list:
                aid = v.get("aid")
                if aid and str(aid) not in seen:
                    seen.add(str(aid))
                    aids.append(str(aid))
            print(f"关键词[{keyword}]页{page}累计AID{len(aids)}/{max_videos}")
            page += 1
        except Exception as e:
            # Best-effort: log and advance to the next page rather than abort.
            print(f"关键词[{keyword}]页{page} 获取AID失败{str(e)[:50]}")
            page += 1
            continue
    return aids[:max_videos]  # hard cap on the result size
import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Global configuration for the danmaku crawl/analysis pipeline.
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"          # Excel report output path
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"    # word-cloud image output path
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"      # CJK-capable font (Windows-only path)
KEYWORDS = ["大语言模型", "大模型", "LLM"] # search keywords ("large language model" variants)
MAX_VIDEOS_PER_KEYWORD = 120 # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360 # overall cap: 3 keywords × 120 = 360 videos
# SECURITY NOTE(review): hardcoded session credentials committed to source
# control — these should be rotated and loaded from the environment instead.
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
} # cookies so requests are not rejected as anonymous crawler traffic
global_session = requests.Session() # shared Session for connection reuse
# 模块1AID获取+弹幕爬取
def fetch_danmakus(aid):
    """Download every danmaku (bullet comment) of one video, given its AID.

    Returns a list of stripped, non-empty danmaku strings. On any error the
    failure is printed and an empty list comes back, keeping batch crawls alive.
    """
    req_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }

    def _get(url):
        # Shared GET through the module session with auth cookies.
        return global_session.get(
            url, headers=req_headers, cookies=LATEST_COOKIES, timeout=8
        )

    try:
        # Step 1: look up the CID — the danmaku endpoint is keyed by CID.
        view_resp = _get(f"https://api.bilibili.com/x/web-interface/view?aid={aid}")
        view_resp.raise_for_status()
        cid = view_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []
        # Step 2: pull the XML danmaku document for that CID.
        xml_resp = _get(f"https://comment.bilibili.com/{cid}.xml")
        xml_resp.encoding = "utf-8"
        xml_resp.raise_for_status()
        document = BeautifulSoup(xml_resp.text, "lxml-xml")
        # Step 3: keep only non-blank comment bodies.
        danmus = []
        for node in document.find_all("d"):
            if node.text.strip():
                danmus.append(node.text.strip())
        print(f"aid={aid} 爬取完成,原始弹幕{len(danmus)}")
        return danmus
    except Exception as err:
        print(f"aid={aid} 爬取失败:{str(err)[:50]}")
        return []
def get_top_videos_aids(keyword, max_videos=120):
    """Return AIDs of the top-ranked search results for *keyword*.

    Pages through Bilibili's search API (comprehensive ranking), stopping
    after 5 pages, an API error, an empty page, or once *max_videos* AIDs
    have been collected.

    Parameters
    ----------
    keyword : str
        Search keyword.
    max_videos : int, optional
        Upper bound on the number of AIDs returned (default 120).

    Returns
    -------
    list[str]
        Deduplicated AIDs in ranking order.
    """
    aids = []
    seen = set()  # O(1) membership; `aids` keeps ranking order
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # comprehensive ranking
        }
        try:
            time.sleep(1.5 + random.random())  # random delay to dodge anti-crawler limits
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break
            # Bug fix: dedupe while PRESERVING ranking order. The original
            # `aids = list(set(aids))` scrambled the order, so the final
            # `aids[:max_videos]` cut kept an arbitrary subset instead of
            # the top-ranked videos.
            for v in video_list:
                aid = v.get("aid")
                if aid and str(aid) not in seen:
                    seen.add(str(aid))
                    aids.append(str(aid))
            print(f"关键词[{keyword}]页{page}累计AID{len(aids)}/{max_videos}")
            page += 1
        except Exception as e:
            # Best-effort: log and advance to the next page rather than abort.
            print(f"关键词[{keyword}]页{page} 获取AID失败{str(e)[:50]}")
            page += 1
            continue
    return aids[:max_videos]  # hard cap on the result size
if __name__ == "__main__":
    print("="*50)
    print("开始B站LLM相关弹幕分析任务AID爬取版")
    print("="*50)
    # Step 1/6: collect video AIDs for every keyword.
    print("\n【步骤1/6】获取360个视频AID...")
    all_aids = []
    for keyword in KEYWORDS:
        print(f"\n正在获取关键词[{keyword}]的视频AID...")
        aids = get_top_videos_aids(keyword, MAX_VIDEOS_PER_KEYWORD)
        all_aids.extend(aids)
        # Bug fix: dict.fromkeys dedupes while KEEPING ranking order;
        # the original `list(set(...))` scrambled the order before the
        # truncation, keeping an arbitrary subset of videos.
        all_aids = list(dict.fromkeys(all_aids))[:TOTAL_MAX_VIDEOS]
        print(f"关键词[{keyword}]完成当前累计AID{len(all_aids)}/{TOTAL_MAX_VIDEOS}")
        if len(all_aids) >= TOTAL_MAX_VIDEOS:
            break
    print(f"\nAID获取完成{len(all_aids)}个有效AID")
    if not all_aids:
        print("错误未获取到任何AID无法继续爬取")
        # Bug fix: bare exit() returned status 0 on failure; signal an
        # error to the shell instead.
        raise SystemExit(1)
    # Step 2/6: crawl danmakus for every collected AID.
    # Bug fix: this print was missing its f-prefix, so the literal text
    # "{len(all_aids)}" was printed instead of the actual count.
    print(f"\n【步骤2/6】批量爬取弹幕{len(all_aids)}个视频)...")
    all_raw_danmus = []
    for idx, aid in enumerate(all_aids, 1):
        print(f"\n正在爬取第{idx}/{len(all_aids)}个视频aid={aid}...")
        all_raw_danmus.extend(fetch_danmakus(aid))
    print(f"\n弹幕爬取完成,累计原始弹幕:{len(all_raw_danmus)}")
    if not all_raw_danmus:
        print("警告:未爬取到任何原始弹幕,任务终止")
        raise SystemExit(1)
Loading…
Cancel
Save