|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import re
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
# Crawler class for collecting danmaku (bullet comments) from Bilibili.
class BilibiliCrawler:
    """Search Bilibili videos by keyword and download their danmaku.

    Typical use: ``BilibiliCrawler().crawl(['keyword'], 'out.txt')``.
    All network access goes through a single ``requests.Session`` with
    browser-like headers.
    """

    def __init__(self):
        # Browser-like headers so requests are less likely to be blocked.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def search_videos(self, keyword, pages=3, max_videos=20):
        """Search videos and return a list of unique BV ids.

        Args:
            keyword: search term passed to Bilibili's search page.
            pages: number of result pages to scrape.
            max_videos: cap on how many unique ids to return (was a
                hard-coded 20; kept as the default for compatibility).

        Returns:
            List of BV-id strings, deduplicated, search order preserved,
            truncated to ``max_videos``. Errors on a page are logged and
            that page is skipped.
        """
        print("正在搜索视频...")
        video_ids = []

        for page in range(1, pages + 1):
            try:
                # Build the search request (no f-string needed: the URL
                # has no placeholders).
                url = 'https://search.bilibili.com/all'
                params = {
                    'keyword': keyword,
                    'page': page,
                    'order': 'totalrank'  # "comprehensive" ranking
                }

                response = self.session.get(url, params=params, timeout=10)
                response.encoding = 'utf-8'

                # Parse the HTML with BeautifulSoup.
                soup = BeautifulSoup(response.text, 'html.parser')

                # Video links; dots are escaped so '.' cannot match
                # arbitrary characters in the hostname.
                video_links = soup.find_all('a', href=re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)'))

                for link in video_links:
                    # get('href') may return None for malformed tags;
                    # fall back to '' so the membership test is safe.
                    href = link.get('href') or ''
                    if 'BV' in href:
                        # Extract the BV id from the link.
                        bv_match = re.search(r'BV[0-9A-Za-z]+', href)
                        if bv_match:
                            video_ids.append(bv_match.group())

                print(f'第{page}页搜索完成,找到{len(video_links)}个视频')

                # Random delay to avoid hammering the server.
                time.sleep(random.uniform(1, 3))

            except Exception as e:
                print(f'搜索第{page}页时出错: {e}')
                continue

        # Deduplicate while keeping first-seen (search-rank) order;
        # list(set(...)) would shuffle the ranking nondeterministically.
        video_ids = list(dict.fromkeys(video_ids))
        print(f'共找到{len(video_ids)}个唯一视频')
        return video_ids[:max_videos]  # cap to limit request volume

    def get_danmaku(self, bvid):
        """Fetch all danmaku texts for one video.

        Args:
            bvid: the video's BV id.

        Returns:
            List of non-empty danmaku strings; an empty list on any
            error (errors are logged, never raised to the caller).
        """
        try:
            # First resolve the video's cid (the danmaku stream id).
            video_info_url = 'https://api.bilibili.com/x/web-interface/view'
            params = {'bvid': bvid}

            response = self.session.get(video_info_url, params=params, timeout=10)
            video_info = response.json()

            # code != 0 means the API rejected the request.
            if video_info['code'] != 0:
                print(f"无法获取视频{bvid}信息")
                return []

            cid = video_info['data']['cid']

            # Fetch the raw danmaku XML.
            danmaku_url = 'https://api.bilibili.com/x/v1/dm/list.so'
            params = {'oid': cid}

            response = self.session.get(danmaku_url, params=params, timeout=10)
            response.encoding = 'utf-8'

            # Parse the XML danmaku list; each <d> element is one
            # comment. NOTE(review): the 'xml' feature requires lxml to
            # be installed — confirm it is a project dependency.
            soup = BeautifulSoup(response.text, 'xml')
            danmakus = soup.find_all('d')

            danmaku_list = []
            for danmaku in danmakus:
                text = danmaku.get_text()
                if text and len(text.strip()) > 0:
                    danmaku_list.append(text.strip())

            print(f'视频{bvid}获取到{len(danmaku_list)}条弹幕')
            return danmaku_list

        except Exception as e:
            print(f'获取视频{bvid}弹幕时出错: {e}')
            return []

    def crawl(self, keywords, output_file='danmaku.txt'):
        """Crawl danmaku for every keyword and save them to a file.

        Args:
            keywords: iterable of search terms.
            output_file: UTF-8 text file, one danmaku per line.

        Returns:
            List of all collected danmaku strings.
        """
        all_danmakus = []

        for keyword in keywords:
            print(f'\n开始爬取关键词: {keyword}')
            video_ids = self.search_videos(keyword)

            for i, bvid in enumerate(video_ids):
                print(f'正在处理第{i+1}/{len(video_ids)}个视频: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)

                # Rewrite the output after every video so progress
                # survives a crash mid-run (deliberate best-effort
                # checkpointing, not an accident).
                with open(output_file, 'w', encoding='utf-8') as f:
                    for danmaku in all_danmakus:
                        f.write(danmaku + '\n')

                # Short delay between videos.
                time.sleep(random.uniform(0.5, 1.5))

        print(f'\n爬取完成!共获取{len(all_danmakus)}条弹幕,已保存到{output_file}')
        return all_danmakus
|
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    bot = BilibiliCrawler()

    # Search terms to crawl.
    search_terms = ['大语言模型', 'LLM', '大模型']

    # Run the crawl and collect every danmaku found.
    collected = bot.crawl(search_terms, 'bilibili_danmaku.txt')

    # Summary statistics for the run.
    print(f"\n最终统计:")
    print(f"总弹幕数: {len(collected)}")
    if collected:
        print("前10条弹幕示例:")
        for rank, sample in enumerate(collected[:10], start=1):
            print(f"{rank}. {sample}")
|