ADD file via upload

main
fzu102301128 1 month ago
parent 42524c0319
commit f277ef80f3

@@ -0,0 +1,121 @@
import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Global configuration
OUTPUT_EXCEL = "LLM_danmaku_analysis_results.xlsx"
WORDCLOUD_OUTPUT = "LLM_danmaku_wordcloud_optimized.png"
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"
KEYWORDS = ["大语言模型", "大模型", "LLM"]  # search keywords ("large language model", "large model", "LLM")
MAX_VIDEOS_PER_KEYWORD = 120  # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360  # overall cap: 3 keywords × 120 = 360 videos
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
}  # cookies so requests are not flagged as a crawler and rejected
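# NOTE: SESSDATA is URL-encoded and expires periodically. A minimal sketch of
# loading the cookies from environment variables instead of hardcoding them
# (the BILI_* variable names are illustrative, not part of this project):
# LATEST_COOKIES = {
#     "SESSDATA": os.environ.get("BILI_SESSDATA", ""),
#     "bili_jct": os.environ.get("BILI_JCT", ""),
# }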
global_session = requests.Session()  # global Session to reuse connections
# Module 1: AID retrieval + danmaku crawling
def fetch_danmakus(aid):
    """Fetch all danmakus (bullet comments) of a single video by its AID."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    try:
        # Get the CID (required parameter of the danmaku endpoint)
        cid_resp = global_session.get(
            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        cid_resp.raise_for_status()
        cid = cid_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid}: no CID found")
            return []
        # Fetch the danmaku file (XML; each comment is a <d> element)
        danmaku_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        danmaku_resp.encoding = "utf-8"
        danmaku_resp.raise_for_status()
        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")
        # Extract the danmaku text and drop empty entries
        raw_danmus = [d.text.strip() for d in soup.find_all("d") if d.text.strip()]
        print(f"aid={aid}: crawl finished, {len(raw_danmus)} raw danmakus")
        return raw_danmus
    except Exception as e:
        print(f"aid={aid}: crawl failed: {str(e)[:50]}")
        return []
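# Minimal usage sketch for fetch_danmakus (the AID below is a placeholder,
# not taken from this project; any valid Bilibili AID works):
# danmus = fetch_danmakus("170001")
# print(f"{len(danmus)} danmakus, sample: {danmus[:3]}")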
def get_top_videos_aids(keyword, max_videos=120):
    """Get the AIDs of the top-N videos for a keyword, in comprehensive ranking order."""
    aids = []
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # comprehensive ranking
        }
        try:
            time.sleep(1.5 + random.random())  # random delay to avoid anti-crawler blocks
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("code") != 0:
                print(f"keyword [{keyword}] page {page}: API error: {data.get('message', 'unknown error')}")
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"keyword [{keyword}] page {page}: no video results")
                break
            # Collect AIDs and deduplicate; dict.fromkeys preserves ranking order,
            # whereas list(set(...)) would scramble it and break the "top N" cutoff
            new_aids = [str(v.get("aid")) for v in video_list if v.get("aid")]
            aids.extend(new_aids)
            aids = list(dict.fromkeys(aids))
            print(f"keyword [{keyword}] page {page}: {len(aids)}/{max_videos} AIDs collected")
            page += 1
        except Exception as e:
            print(f"keyword [{keyword}] page {page}: failed to get AIDs: {str(e)[:50]}")
            page += 1
            continue
    return aids[:max_videos]  # enforce the cap
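# A minimal sketch of how the two helpers compose, under the assumption that
# the script aggregates danmakus across all keywords (the Counter summary
# shown here is illustrative, not the project's own analysis step):
# all_danmus = []
# for kw in KEYWORDS:
#     for aid in get_top_videos_aids(kw, MAX_VIDEOS_PER_KEYWORD)[:TOTAL_MAX_VIDEOS]:
#         all_danmus.extend(fetch_danmakus(aid))
#         time.sleep(0.5 + random.random())  # be polite between requests
# print(Counter(all_danmus).most_common(20))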