|
|
|
|
# NOTE(review): stray diff hunk header "@ -1,87 +0,0 @@" from a patch paste — neutralized as a comment
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
class BilibiliDanmakuSpider:
    """Crawl Bilibili danmaku (bullet comments) for keyword search results.

    Workflow: query the web search API for video ids (aid), resolve each
    video's cid via the view API, then download and parse that cid's XML
    danmaku feed. Collected danmaku texts accumulate in ``self.danmaku_list``.
    """

    def __init__(self):
        # Browser-like headers so the Bilibili API does not reject the requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # All danmaku texts collected across every crawled video.
        self.danmaku_list = []

    def get_video_ids(self, keyword, page_count=36):
        """Return up to 360 deduplicated video ids for one search keyword.

        Each search page yields roughly 10 videos, so the default 36 pages
        target about 360 ids.

        Args:
            keyword: Search term passed to the Bilibili search API.
            page_count: Number of result pages to fetch (default 36).

        Returns:
            list: deduplicated aids, capped at 360. Order is arbitrary
            because of the ``set()`` round-trip.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                url = (
                    "https://api.bilibili.com/x/web-interface/search/type"
                    f"?keyword={keyword}&search_type=video&page={page}"
                )
                # timeout keeps a stalled connection from hanging the crawl
                response = requests.get(url, headers=self.headers, timeout=10)
                data = response.json()

                # code == 0 means the API call succeeded and there is result data.
                if data["code"] == 0 and data["data"]["result"]:
                    page_video_ids = [item["aid"] for item in data["data"]["result"]]
                    video_ids.extend(page_video_ids)
                    print(f"第{page}页获取到视频ID:{page_video_ids}共{len(page_video_ids)}个")

                # Random delay between pages to avoid anti-crawler throttling.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best effort: log the failed page and continue with the next one.
                print(f"获取第{page}页视频ID失败: {e}")

        return list(set(video_ids))[:360]  # dedupe and cap at 360

    def get_danmakus(self, aid):
        """Fetch one video's danmaku and append them to ``self.danmaku_list``.

        Args:
            aid: The video's numeric id.

        Returns:
            bool: True on success, False on any failure (logged, not raised).
        """
        try:
            # Step 1: resolve the video's cid through the view API.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
            response = requests.get(url, headers=self.headers, timeout=10)
            cid = response.json()["data"]["cid"]

            # Step 2: download the XML danmaku feed for that cid.
            danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(danmaku_url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"

            # Each danmaku is the text content of a <d ...>...</d> element.
            danmakus = re.findall(r'<d.*?>(.*?)</d>', response.text)
            self.danmaku_list.extend(danmakus)

            print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
            # Short random pause between videos to stay under rate limits.
            time.sleep(random.uniform(0.5, 1.5))
            return True
        except Exception as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False

    def run(self, keywords=None):
        """Run the crawler: search every keyword, then fetch all danmaku.

        Args:
            keywords: Search terms. Defaults to LLM-related Chinese keywords.
                (Previously a mutable default list argument — now built fresh
                per call to avoid shared-state surprises.)

        Returns:
            list: every danmaku text collected, i.e. ``self.danmaku_list``.
        """
        if keywords is None:
            keywords = ["大语言模型", "大模型", "LLM"]

        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))

        # Dedupe across keywords and keep the total at 360 videos max.
        unique_video_ids = list(set(all_video_ids))[:360]
        print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...")

        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)

        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list
|