ADD file via upload

main
fzu102301136 4 months ago
parent c6c4b8456e
commit 6403703cf4

@ -0,0 +1,87 @@
import html
import json
import random
import re
import time

import requests
class BilibiliDanmakuSpider:
    """Collect danmaku (bullet comments) for Bilibili videos found via keyword search.

    Typical use is ``BilibiliDanmakuSpider().run()``, which searches each
    keyword, de-duplicates the resulting video IDs and downloads the danmaku
    XML of every video. All collected danmaku texts accumulate in
    ``self.danmaku_list`` across calls.
    """

    # Upper bound on the number of video IDs kept, both per keyword and overall.
    MAX_VIDEOS = 360

    # Seconds to wait for a single HTTP response; without a timeout one
    # stalled connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    # One <d ...>text</d> element of the danmaku XML. The tag name must be
    # exactly "d" (so other tags that merely start with "d" are ignored) and
    # DOTALL lets the text span line breaks.
    _DANMAKU_RE = re.compile(r"<d(?:\s[^>]*)?>(.*?)</d>", re.DOTALL)

    def __init__(self):
        # Browser-like headers sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # Danmaku texts collected so far, in download order.
        self.danmaku_list = []

    @staticmethod
    def _dedupe(items, limit):
        """Return at most *limit* unique items, keeping first-seen order."""
        return list(dict.fromkeys(items))[:max(limit, 0)]

    @classmethod
    def _parse_danmakus(cls, xml_text):
        """Extract the danmaku texts from a danmaku XML document, decoding entities."""
        return [html.unescape(text) for text in cls._DANMAKU_RE.findall(xml_text)]

    def _get(self, url, params=None):
        """GET *url* with the spider's headers and timeout; raise on HTTP error status."""
        response = requests.get(
            url,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response

    def get_video_ids(self, keyword, page_count=36):
        """Return the unique video IDs (aid) found by searching for *keyword*.

        Pages 1..page_count of the search API are requested one after another.
        A page that fails to download or parse is reported and skipped, so the
        result may be shorter than expected.

        Args:
            keyword: Search term; it is URL-encoded by the HTTP layer.
            page_count: Number of result pages to request.

        Returns:
            Up to ``MAX_VIDEOS`` distinct aids, in the order the search
            returned them.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                response = self._get(
                    "https://api.bilibili.com/x/web-interface/search/type",
                    params={"keyword": keyword, "search_type": "video", "page": page},
                )
                data = response.json()
                # Only use the page when the API reports success and has results.
                if data["code"] == 0 and data["data"]["result"]:
                    page_video_ids = [item["aid"] for item in data["data"]["result"]]
                    video_ids.extend(page_video_ids)
                    print(f"第{page}页获取到视频ID: {page_video_ids} 共{len(page_video_ids)}个")
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # Network failure, non-JSON body, or unexpected payload shape.
                print(f"获取第{page}页视频ID失败: {e}")
            else:
                # Random pause between successful requests to avoid anti-crawler blocking.
                time.sleep(random.uniform(1, 3))
        return self._dedupe(video_ids, self.MAX_VIDEOS)

    def get_danmakus(self, aid):
        """Download the danmaku of one video and append them to ``self.danmaku_list``.

        The video's ``cid`` is looked up first, then the danmaku XML for that
        cid is fetched and parsed.

        Args:
            aid: Video ID as returned by ``get_video_ids``.

        Returns:
            True when the danmaku were fetched (possibly zero of them),
            False when any request or parsing step failed.
        """
        try:
            view = self._get(
                "https://api.bilibili.com/x/web-interface/view",
                params={"aid": aid},
            ).json()
            cid = view["data"]["cid"]
            response = self._get(f"https://comment.bilibili.com/{cid}.xml")
            # Force UTF-8 instead of relying on encoding detection.
            response.encoding = "utf-8"
            danmakus = self._parse_danmakus(response.text)
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False
        self.danmaku_list.extend(danmakus)
        print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
        time.sleep(random.uniform(0.5, 1.5))
        return True

    def run(self, keywords=("大语言模型", "大模型", "LLM")):
        """Search every keyword, then download danmaku for the videos found.

        Args:
            keywords: Iterable of search terms. The default is an immutable
                tuple so it cannot be altered between calls.

        Returns:
            ``self.danmaku_list``, i.e. every danmaku text collected by this
            instance so far.
        """
        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))
        # The same video can match several keywords; keep each once and cap the total.
        unique_video_ids = self._dedupe(all_video_ids, self.MAX_VIDEOS)
        print(f"共获取{len(unique_video_ids)}个视频ID开始爬取弹幕...")
        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)
        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list
Loading…
Cancel
Save