import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image

# --- Global configuration ---
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"        # Excel report output path
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"  # word-cloud image output path
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"    # CJK-capable font (Windows SimHei)
KEYWORDS = ["大语言模型", "大模型", "LLM"]   # search keywords
MAX_VIDEOS_PER_KEYWORD = 120                 # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360                       # overall cap (3 keywords x 120 = 360)
# NOTE(review): hard-coded session credentials. SESSDATA/bili_jct are
# account secrets and expire periodically — load them from an environment
# variable or an untracked config file instead of committing them to source.
# Supplied as cookies so requests are not rejected as anonymous crawling.
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
}
# One shared Session so TCP connections (and cookies) are reused across calls.
global_session = requests.Session()


# --- Module 1: AID discovery + danmaku crawling ---
def fetch_danmakus(aid):
    """Fetch all danmaku (bullet comments) for a single video by its AID.

    Two-step flow: resolve the video's CID via the web-interface view API
    (the danmaku endpoint is keyed by CID, not AID), then download the XML
    danmaku document and extract every non-empty <d> element's text.

    Args:
        aid: Bilibili video AID (int or str).

    Returns:
        list[str]: raw danmaku strings; empty list on any failure
        (missing CID, HTTP error, timeout) — errors are logged, not raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }

    try:
        # Step 1: resolve CID (required parameter of the danmaku endpoint).
        cid_resp = global_session.get(
            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        cid_resp.raise_for_status()
        cid = cid_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []

        # Step 2: fetch the danmaku document (XML format).
        danmaku_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        danmaku_resp.encoding = "utf-8"
        danmaku_resp.raise_for_status()
        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")

        # Extract danmaku text, dropping blank entries; strip once per item.
        raw_danmus = [text for d in soup.find_all("d") if (text := d.text.strip())]
        print(f"aid={aid} 爬取完成,原始弹幕{len(raw_danmus)}条")
        return raw_danmus

    except Exception as e:
        # Best-effort crawl: log a truncated error and keep going.
        print(f"aid={aid} 爬取失败:{str(e)[:50]}")
        return []


def get_top_videos_aids(keyword, max_videos=120):
    """Collect AIDs of the top-ranked videos for a search keyword.

    Pages through Bilibili's search API (composite "totalrank" order),
    up to 5 pages of 30 results, stopping early once *max_videos* unique
    AIDs are gathered or the API stops returning results.

    Args:
        keyword: search keyword string.
        max_videos: maximum number of AIDs to return (default 120).

    Returns:
        list[str]: unique AIDs in rank order, at most *max_videos* long.
    """
    aids = []
    seen = set()  # de-dup while PRESERVING rank order (set() alone would scramble it)
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }

    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # composite ranking
        }
        try:
            time.sleep(1.5 + random.random())  # randomized delay to avoid anti-crawler blocks
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()

            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break

            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break

            # Ordered de-duplication: the previous list(set(aids)) discarded the
            # totalrank ordering, defeating the point of taking the "top" N.
            for v in video_list:
                raw_aid = v.get("aid")
                if raw_aid and str(raw_aid) not in seen:
                    seen.add(str(raw_aid))
                    aids.append(str(raw_aid))
            print(f"关键词[{keyword}]页{page},累计AID:{len(aids)}/{max_videos}")
            page += 1

        except Exception as e:
            # Log and move to the next page rather than aborting the keyword.
            print(f"关键词[{keyword}]页{page} 获取AID失败:{str(e)[:50]}")
            page += 1
            continue

    return aids[:max_videos]  # enforce the cap