|
|
|
|
@ -0,0 +1,121 @@
|
|
|
|
|
import requests
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
import re
|
|
|
|
|
import os
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from openpyxl import Workbook
|
|
|
|
|
from openpyxl.styles import Font, Alignment, PatternFill
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
import numpy as np
|
|
|
|
|
from PIL import Image
|
|
|
|
|
# Global configuration
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"
KEYWORDS = ["大语言模型", "大模型", "LLM"]  # search keywords
MAX_VIDEOS_PER_KEYWORD = 120  # crawl at most this many videos per keyword
# Overall video cap. Derived from the two settings above (3 keywords x 120
# = 360) so that it stays correct when either of them is edited.
TOTAL_MAX_VIDEOS = len(KEYWORDS) * MAX_VIDEOS_PER_KEYWORD
|
|
|
|
|
# Cookies sent with every request; they are supplied so that the requests are
# not classified as coming from a crawler and refused.
#
# NOTE(security): the literal values below are account credentials committed
# to source control. They are kept only as a backward-compatible fallback;
# prefer setting the BILI_SESSDATA / BILI_JCT environment variables, then
# rotate these values and remove the literals.
LATEST_COOKIES = {
    "SESSDATA": os.environ.get(
        "BILI_SESSDATA",
        "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    ),
    "bili_jct": os.environ.get(
        "BILI_JCT",
        "6c4eb6cbc0cef8ee55c3d61cdc6946d0",
    ),
}
# Single module-wide Session so the underlying connections are reused.
global_session = requests.Session()
|
|
|
|
|
|
|
|
|
|
# 模块1:AID获取+弹幕爬取
|
|
|
|
|
def fetch_danmakus(aid):
    """Fetch all danmaku (bullet comments) of a single video by its AID.

    The video's CID is first looked up through the
    ``x/web-interface/view`` endpoint, then the danmaku XML published for
    that CID on ``comment.bilibili.com`` is downloaded and parsed.

    Args:
        aid: AID of the video; interpolated into the lookup URL.

    Returns:
        A list of stripped, non-empty danmaku strings. An empty list is
        returned when no CID could be obtained or when any step raises
        (the error is printed, truncated to 50 characters, not re-raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
    }

    try:
        # The danmaku URL is keyed by CID, so resolve it from the AID first.
        cid_resp = global_session.get(
            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8,
        )
        cid_resp.raise_for_status()
        # "data" can be absent *or* explicitly null in the JSON payload;
        # `or {}` handles both so such responses reach the "no CID" branch
        # instead of raising AttributeError on None.
        video_info = cid_resp.json().get("data") or {}
        cid = video_info.get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []

        # Download the danmaku list (XML document).
        danmaku_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8,
        )
        danmaku_resp.encoding = "utf-8"
        danmaku_resp.raise_for_status()
        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")

        # Each <d> element holds one danmaku; strip once and drop empties.
        stripped = (node.text.strip() for node in soup.find_all("d"))
        raw_danmus = [text for text in stripped if text]
        print(f"aid={aid} 爬取完成,原始弹幕{len(raw_danmus)}条")
        return raw_danmus

    except Exception as e:
        # Deliberate best-effort boundary: one failing video must not abort
        # the whole crawl, so report it and return no danmaku.
        print(f"aid={aid} 爬取失败:{str(e)[:50]}")
        return []
|
|
|
|
|
|
|
|
|
|
def get_top_videos_aids(keyword, max_videos=120):
    """Collect the AIDs of the top search results for a keyword.

    Pages of the video search endpoint are requested with
    ``order=totalrank`` until ``max_videos`` distinct AIDs have been
    gathered, the page limit is reached, the API reports an error code, or
    a page comes back without results.

    Args:
        keyword: Search keyword sent to the search endpoint.
        max_videos: Maximum number of AIDs to return.

    Returns:
        A list of at most ``max_videos`` distinct AIDs (as strings), in the
        order in which the search returned them.
    """
    aids = []
    seen = set()  # AIDs already collected, for O(1) duplicate checks
    page = 1
    page_size = 30
    # Never fewer than the historical limit of 5 pages; for larger requests
    # allow enough pages to reach max_videos plus one spare page to make up
    # for duplicates or a failed page.
    max_pages = max(5, -(-max_videos // page_size) + 1)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
    }

    while len(aids) < max_videos and page <= max_pages:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank",  # overall ranking
        }
        try:
            # Randomised delay between requests to avoid anti-crawler limits.
            time.sleep(1.5 + random.random())
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json()

            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break

            # Tolerate "data" / "result" being null as well as missing.
            video_list = (data.get("data") or {}).get("result") or []
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break

            # De-duplicate while keeping the ranking order, so that the
            # final truncation really keeps the top-ranked videos.
            for video in video_list:
                raw_aid = video.get("aid")
                if not raw_aid:
                    continue
                aid = str(raw_aid)
                if aid not in seen:
                    seen.add(aid)
                    aids.append(aid)
            print(f"关键词[{keyword}]页{page},累计AID:{len(aids)}/{max_videos}")
            page += 1

        except Exception as e:
            # Best effort: report the failed page and move on to the next.
            print(f"关键词[{keyword}]页{page} 获取AID失败:{str(e)[:50]}")
            page += 1
            continue

    return aids[:max_videos]  # never return more than requested
|