Update 弹幕.py

main
fzu102301128 5 months ago
parent 5b1c36ac00
commit b193430ef3

@@ -1,121 +1,156 @@
import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Global configuration for the danmaku crawl/analysis pipeline.
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"          # Excel report output path
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"    # word-cloud image output path
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"      # CJK-capable font (Windows-only path)
KEYWORDS = ["大语言模型", "大模型", "LLM"] # search keywords ("large language model" variants)
MAX_VIDEOS_PER_KEYWORD = 120 # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360 # overall cap: 3 keywords × 120 = 360 videos
# SECURITY NOTE(review): hardcoded session credentials committed to source
# control — these should be rotated and loaded from the environment instead.
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
} # cookies so requests are not rejected as anonymous crawler traffic
global_session = requests.Session() # shared Session for connection reuse
# 模块1AID获取+弹幕爬取
def fetch_danmakus(aid):
    """Fetch all danmakus (bullet comments) of a single video by its AID.

    Parameters
    ----------
    aid : int | str
        Bilibili video AID.

    Returns
    -------
    list[str]
        Stripped, non-empty danmaku strings. Best-effort: any failure is
        printed and an empty list is returned, so one bad video does not
        abort a batch crawl.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    try:
        # Resolve the CID (required parameter of the danmaku endpoint).
        cid_resp = global_session.get(
            f"https://api.bilibili.com/x/web-interface/view?aid={aid}",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        cid_resp.raise_for_status()
        cid = cid_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []
        # Download the danmaku document (XML format).
        danmaku_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        danmaku_resp.encoding = "utf-8"
        danmaku_resp.raise_for_status()
        soup = BeautifulSoup(danmaku_resp.text, "lxml-xml")
        # Strip each <d> entry once (the original stripped every entry
        # twice: once in the filter and once in the result expression).
        raw_danmus = [t for t in (d.text.strip() for d in soup.find_all("d")) if t]
        print(f"aid={aid} 爬取完成,原始弹幕{len(raw_danmus)}")
        return raw_danmus
    except Exception as e:
        # Deliberate broad catch: log and continue the batch.
        print(f"aid={aid} 爬取失败:{str(e)[:50]}")
        return []
def get_top_videos_aids(keyword, max_videos=120):
    """Return AIDs of the top-ranked search results for *keyword*.

    Pages through Bilibili's search API (comprehensive ranking), stopping
    after 5 pages, an API error, an empty page, or once *max_videos* AIDs
    have been collected.

    Parameters
    ----------
    keyword : str
        Search keyword.
    max_videos : int, optional
        Upper bound on the number of AIDs returned (default 120).

    Returns
    -------
    list[str]
        Deduplicated AIDs in ranking order.
    """
    aids = []
    seen = set()  # O(1) membership; `aids` keeps ranking order
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # comprehensive ranking
        }
        try:
            time.sleep(1.5 + random.random())  # random delay to dodge anti-crawler limits
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break
            # Bug fix: dedupe while PRESERVING ranking order. The original
            # `aids = list(set(aids))` scrambled the order, so the final
            # `aids[:max_videos]` cut kept an arbitrary subset instead of
            # the top-ranked videos.
            for v in video_list:
                aid = v.get("aid")
                if aid and str(aid) not in seen:
                    seen.add(str(aid))
                    aids.append(str(aid))
            print(f"关键词[{keyword}]页{page}累计AID{len(aids)}/{max_videos}")
            page += 1
        except Exception as e:
            # Best-effort: log and advance to the next page rather than abort.
            print(f"关键词[{keyword}]页{page} 获取AID失败{str(e)[:50]}")
            page += 1
            continue
    return aids[:max_videos]  # hard cap on the result size
import requests
import time
import random
import re
import os
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Global configuration for the danmaku crawl/analysis pipeline.
OUTPUT_EXCEL = "LLM弹幕分析结果.xlsx"          # Excel report output path
WORDCLOUD_OUTPUT = "LLM弹幕词云_优化版.png"    # word-cloud image output path
FONT_PATH = "C:/Windows/Fonts/simhei.ttf"      # CJK-capable font (Windows-only path)
KEYWORDS = ["大语言模型", "大模型", "LLM"] # search keywords ("large language model" variants)
MAX_VIDEOS_PER_KEYWORD = 120 # crawl at most 120 videos per keyword
TOTAL_MAX_VIDEOS = 360 # overall cap: 3 keywords × 120 = 360 videos
# SECURITY NOTE(review): hardcoded session credentials committed to source
# control — these should be rotated and loaded from the environment instead.
LATEST_COOKIES = {
    "SESSDATA": "50e72c3f%2C1778736212%2C61aec%2Ab2CjBv-7jJBDjqTPsa8i7BlXK5UrYwalTGQwI_FjpRv8R7xKE13zppXoX5qubeVknEyVgSVmlvRHZzRExDdHBodkZTVlFnUUpiOUNwWGxmOWRvZWJQYXZLcHM4dGJhcTF5NXFDcUxIYWNtSGxseW4zZzEzWExiZW1nQV9pTEtZcDk0OXdDbjB0VlpnIIEC",
    "bili_jct": "6c4eb6cbc0cef8ee55c3d61cdc6946d0"
} # cookies so requests are not rejected as anonymous crawler traffic
global_session = requests.Session() # shared Session for connection reuse
# 模块1AID获取+弹幕爬取
def fetch_danmakus(aid):
    """Download every danmaku (bullet comment) of one video, given its AID.

    Returns a list of stripped, non-empty danmaku strings. On any error the
    failure is printed and an empty list comes back, keeping batch crawls alive.
    """
    req_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }

    def _get(url):
        # Shared GET through the module session with auth cookies.
        return global_session.get(
            url, headers=req_headers, cookies=LATEST_COOKIES, timeout=8
        )

    try:
        # Step 1: look up the CID — the danmaku endpoint is keyed by CID.
        view_resp = _get(f"https://api.bilibili.com/x/web-interface/view?aid={aid}")
        view_resp.raise_for_status()
        cid = view_resp.json().get("data", {}).get("cid")
        if not cid:
            print(f"aid={aid} 未获取到CID")
            return []
        # Step 2: pull the XML danmaku document for that CID.
        xml_resp = _get(f"https://comment.bilibili.com/{cid}.xml")
        xml_resp.encoding = "utf-8"
        xml_resp.raise_for_status()
        document = BeautifulSoup(xml_resp.text, "lxml-xml")
        # Step 3: keep only non-blank comment bodies.
        danmus = []
        for node in document.find_all("d"):
            if node.text.strip():
                danmus.append(node.text.strip())
        print(f"aid={aid} 爬取完成,原始弹幕{len(danmus)}")
        return danmus
    except Exception as err:
        print(f"aid={aid} 爬取失败:{str(err)[:50]}")
        return []
def get_top_videos_aids(keyword, max_videos=120):
    """Return AIDs of the top-ranked search results for *keyword*.

    Pages through Bilibili's search API (comprehensive ranking), stopping
    after 5 pages, an API error, an empty page, or once *max_videos* AIDs
    have been collected.

    Parameters
    ----------
    keyword : str
        Search keyword.
    max_videos : int, optional
        Upper bound on the number of AIDs returned (default 120).

    Returns
    -------
    list[str]
        Deduplicated AIDs in ranking order.
    """
    aids = []
    seen = set()  # O(1) membership; `aids` keeps ranking order
    page = 1
    page_size = 30
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"  # comprehensive ranking
        }
        try:
            time.sleep(1.5 + random.random())  # random delay to dodge anti-crawler limits
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("code") != 0:
                print(f"关键词[{keyword}]页{page} 接口返回错误:{data.get('message', '未知错误')}")
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                print(f"关键词[{keyword}]页{page} 无视频结果")
                break
            # Bug fix: dedupe while PRESERVING ranking order. The original
            # `aids = list(set(aids))` scrambled the order, so the final
            # `aids[:max_videos]` cut kept an arbitrary subset instead of
            # the top-ranked videos.
            for v in video_list:
                aid = v.get("aid")
                if aid and str(aid) not in seen:
                    seen.add(str(aid))
                    aids.append(str(aid))
            print(f"关键词[{keyword}]页{page}累计AID{len(aids)}/{max_videos}")
            page += 1
        except Exception as e:
            # Best-effort: log and advance to the next page rather than abort.
            print(f"关键词[{keyword}]页{page} 获取AID失败{str(e)[:50]}")
            page += 1
            continue
    return aids[:max_videos]  # hard cap on the result size
if __name__ == "__main__":
    print("="*50)
    print("开始B站LLM相关弹幕分析任务AID爬取版")
    print("="*50)
    # Step 1/6: collect video AIDs for every keyword.
    print("\n【步骤1/6】获取360个视频AID...")
    all_aids = []
    for keyword in KEYWORDS:
        print(f"\n正在获取关键词[{keyword}]的视频AID...")
        aids = get_top_videos_aids(keyword, MAX_VIDEOS_PER_KEYWORD)
        all_aids.extend(aids)
        # Bug fix: dict.fromkeys dedupes while KEEPING ranking order;
        # the original `list(set(...))` scrambled the order before the
        # truncation, keeping an arbitrary subset of videos.
        all_aids = list(dict.fromkeys(all_aids))[:TOTAL_MAX_VIDEOS]
        print(f"关键词[{keyword}]完成当前累计AID{len(all_aids)}/{TOTAL_MAX_VIDEOS}")
        if len(all_aids) >= TOTAL_MAX_VIDEOS:
            break
    print(f"\nAID获取完成{len(all_aids)}个有效AID")
    if not all_aids:
        print("错误未获取到任何AID无法继续爬取")
        # Bug fix: bare exit() returned status 0 on failure; signal an
        # error to the shell instead.
        raise SystemExit(1)
    # Step 2/6: crawl danmakus for every collected AID.
    # Bug fix: this print was missing its f-prefix, so the literal text
    # "{len(all_aids)}" was printed instead of the actual count.
    print(f"\n【步骤2/6】批量爬取弹幕{len(all_aids)}个视频)...")
    all_raw_danmus = []
    for idx, aid in enumerate(all_aids, 1):
        print(f"\n正在爬取第{idx}/{len(all_aids)}个视频aid={aid}...")
        all_raw_danmus.extend(fetch_danmakus(aid))
    print(f"\n弹幕爬取完成,累计原始弹幕:{len(all_raw_danmus)}")
    if not all_raw_danmus:
        print("警告:未爬取到任何原始弹幕,任务终止")
        raise SystemExit(1)
Loading…
Cancel
Save