ADD file via upload

4 months ago · f277ef80f3
parent 42524c0319
commit f277ef80f3
1 changed files with 121 additions and 0 deletions
--- a/弹幕.py
+++ b/弹幕.py
@@ -0,0 +1,121 @@
+import requests
+import time
+import random
+import re
+import os
+from bs4 import BeautifulSoup
+import pandas as pd
+from collections import Counter
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment, PatternFill
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+import numpy as np
+from PIL import Image
+# 全局配置
+OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"
+WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"
+FONT_PATH = "C:/Windows/Fonts/simhei.ttf"
+KEYWORDS = ["大语言模型", "大模型", "LLM"]  # 搜索关键词
+MAX_VIDEOS_PER_KEYWORD = 120  # 每个关键词爬取最多120个视频
+TOTAL_MAX_VIDEOS = 360  # 总视频数上限（3个关键词×120=360）
+LATEST_COOKIES = {
+    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
+    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
+} # 提供cookies防止被认定为爬虫而无法获取
+global_session = requests.Session()  # 全局Session，保持连接
+
+# 模块1：AID获取+弹幕爬取
+def fetch_danmakus(aid):
+    """根据AID爬取单条视频的所有弹幕"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+        "Referer": "https://www.bilibili.com/"
+    }
+    
+    try:
+        # 获取CID（弹幕接口必需参数）
+        cid_resp = global_session.get(
+            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
+            headers=headers,
+            cookies=LATEST_COOKIES,
+            timeout=8
+        )
+        cid_resp.raise_for_status()
+        cid = cid_resp.json().get("data", {}).get("cid")
+        if not cid:
+            print(f"aid={aid} 未获取到CID")
+            return []
+        
+        # 爬取弹幕（XML格式）
+        danmaku_resp = global_session.get(
+            f"https://comment.bilibili.com/{cid}.xml",
+            headers=headers,
+            cookies=LATEST_COOKIES,
+            timeout=8
+        )
+        danmaku_resp.encoding = "utf-8"
+        danmaku_resp.raise_for_status()
+        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")
+        
+        # 提取弹幕并初步过滤空值
+        raw_danmus = [d.text.strip() for d in soup.find_all("d") if d.text.strip()]
+        print(f"aid={aid} 爬取完成，原始弹幕{len(raw_danmus)}条")
+        return raw_danmus
+    
+    except Exception as e:
+        print(f"aid={aid} 爬取失败：{str(e)[:50]}")
+        return []
+
+def get_top_videos_aids(keyword, max_videos=120):
+    """根据关键词获取综合排序前N条视频的AID"""
+    aids = []
+    page = 1
+    page_size = 30
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+        "Referer": "https://www.bilibili.com/"
+    }
+    
+    while len(aids) < max_videos and page <= 5: 
+        params = {
+            "keyword": keyword,
+            "page": page,
+            "page_size": page_size,
+            "search_type": "video",
+            "order": "totalrank"  # 综合排序
+        }
+        try:
+            time.sleep(1.5 + random.random())  # 随机延时防反爬
+            resp = global_session.get(
+                "https://api.bilibili.com/x/web-interface/search/type",
+                params=params,
+                headers=headers,
+                cookies=LATEST_COOKIES,
+                timeout=10
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            
+            if data.get("code") != 0:
+                print(f"关键词[{keyword}]页{page} 接口返回错误：{data.get('message', '未知错误')}")
+                break
+            
+            video_list = data.get("data", {}).get("result", [])
+            if not video_list:
+                print(f"关键词[{keyword}]页{page} 无视频结果")
+                break
+            
+            # 提取AID并去重
+            new_aids = [str(v.get("aid")) for v in video_list if v.get("aid")]
+            aids.extend(new_aids)
+            aids = list(set(aids))  # 去重
+            print(f"关键词[{keyword}]页{page}，累计AID：{len(aids)}/{max_videos}")
+            page += 1
+        
+        except Exception as e:
+            print(f"关键词[{keyword}]页{page} 获取AID失败：{str(e)[:50]}")
+            page += 1
+            continue
+    
+    return aids[:max_videos]  # 确保不超过最大数量