You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

144 lines
5.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import re
import time
import random
# Crawler that collects danmaku (bullet comments) for Bilibili search results.
class BilibiliCrawler:
    """Search Bilibili for videos and download their danmaku (bullet comments).

    All HTTP requests go through one ``requests.Session`` that carries
    browser-like headers.
    """

    # Matches a BV video id anywhere inside a string.
    _BVID_RE = re.compile(r'BV[0-9A-Za-z]+')
    # Matches hrefs that point at a video page. Dots are escaped so they only
    # match a literal '.'.
    _VIDEO_HREF_RE = re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)')

    def __init__(self):
        """Create the HTTP session used by every request of this crawler."""
        # Request headers that imitate a regular browser visit.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    @staticmethod
    def _bvids_from_hrefs(hrefs):
        """Return the BV ids found in ``hrefs``, unique and in first-seen order."""
        found = {}  # dict used as an insertion-ordered set
        for href in hrefs:
            match = BilibiliCrawler._BVID_RE.search(href) if href else None
            if match:
                found[match.group()] = None
        return list(found)

    def search_videos(self, keyword, pages=3, max_videos=20):
        """Search for videos and return a list of their BV ids.

        Args:
            keyword: Search term sent to the search page.
            pages: Number of result pages to fetch, starting at page 1.
            max_videos: Maximum number of ids to return, so the number of
                follow-up requests stays bounded.

        Returns:
            Up to ``max_videos`` unique BV ids in the order they first
            appeared in the results. A page that fails is reported on stdout
            and skipped.
        """
        print("正在搜索视频...")
        video_ids = []
        for page in range(1, pages + 1):
            try:
                # Build the search request.
                url = 'https://search.bilibili.com/all'
                params = {
                    'keyword': keyword,
                    'page': page,
                    'order': 'totalrank'  # overall ranking
                }
                response = self.session.get(url, params=params, timeout=10)
                # Report HTTP errors instead of parsing an error page as if
                # it were a result page without any videos.
                response.raise_for_status()
                response.encoding = 'utf-8'
                # Parse the HTML and pick every anchor that links to a video.
                soup = BeautifulSoup(response.text, 'html.parser')
                video_links = soup.find_all('a', href=self._VIDEO_HREF_RE)
                video_ids.extend(
                    self._bvids_from_hrefs(link.get('href') for link in video_links)
                )
                print(f'第{page}页搜索完成,找到{len(video_links)}个视频')
                # Random delay so requests are not sent too quickly.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best effort: report the failed page and move on to the next.
                print(f'搜索第{page}页时出错: {e}')
        # Deduplicate while keeping result order, so the truncation below
        # keeps the first-seen videos instead of an arbitrary subset.
        video_ids = list(dict.fromkeys(video_ids))
        print(f'共找到{len(video_ids)}个唯一视频')
        return video_ids[:max_videos]

    def get_danmaku(self, bvid):
        """Fetch the danmaku texts of one video.

        Args:
            bvid: BV id of the video.

        Returns:
            A list of non-empty, whitespace-stripped danmaku strings. An empty
            list is returned when the video info reports a non-zero code or
            when any request or parsing step raises.
        """
        try:
            # First look up the video's cid, which the danmaku request needs.
            video_info_url = 'https://api.bilibili.com/x/web-interface/view'
            response = self.session.get(video_info_url, params={'bvid': bvid}, timeout=10)
            response.raise_for_status()
            video_info = response.json()
            if video_info['code'] != 0:
                print(f"无法获取视频{bvid}信息")
                return []
            cid = video_info['data']['cid']
            # Fetch the danmaku document for that cid.
            danmaku_url = 'https://api.bilibili.com/x/v1/dm/list.so'
            response = self.session.get(danmaku_url, params={'oid': cid}, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            # The danmaku come back as XML, one <d> element per comment.
            # NOTE: BeautifulSoup's 'xml' feature needs lxml to be installed.
            soup = BeautifulSoup(response.text, 'xml')
            stripped = (node.get_text().strip() for node in soup.find_all('d'))
            danmaku_list = [text for text in stripped if text]
            print(f'视频{bvid}获取到{len(danmaku_list)}条弹幕')
            return danmaku_list
        except Exception as e:
            # Best effort: one failing video must not abort the whole crawl.
            print(f'获取视频{bvid}弹幕时出错: {e}')
            return []

    def crawl(self, keywords, output_file='danmaku.txt', delay_range=(0.5, 1.5)):
        """Crawl the danmaku of the videos found for every keyword.

        Args:
            keywords: Iterable of search terms.
            output_file: Path of the UTF-8 text file that receives one danmaku
                per line. It is created or truncated when the first video has
                been processed and extended after every further video.
            delay_range: ``(low, high)`` bounds in seconds of the random pause
                after each video.

        Returns:
            A list with all collected danmaku strings, in crawl order.
        """
        all_danmakus = []
        # The first save truncates the file; later saves append only the new
        # lines. The resulting file is the same as rewriting everything each
        # time, without the quadratic amount of I/O.
        file_mode = 'w'
        for keyword in keywords:
            print(f'\n开始爬取关键词: {keyword}')
            video_ids = self.search_videos(keyword)
            for i, bvid in enumerate(video_ids):
                print(f'正在处理第{i+1}/{len(video_ids)}个视频: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)
                # Save progress after every video.
                with open(output_file, file_mode, encoding='utf-8') as f:
                    f.writelines(danmaku + '\n' for danmaku in danmakus)
                file_mode = 'a'
                # Pause between videos.
                time.sleep(random.uniform(*delay_range))
        print(f'\n爬取完成!共获取{len(all_danmakus)}条弹幕,已保存到{output_file}')
        return all_danmakus
# Script entry point: crawl a fixed keyword list and print a short summary.
if __name__ == '__main__':
    # Search keywords.
    keywords = ['大语言模型', 'LLM', '大模型']
    # Run the crawl; the danmaku are also saved to bilibili_danmaku.txt.
    crawler = BilibiliCrawler()
    danmakus = crawler.crawl(keywords, 'bilibili_danmaku.txt')
    # Print some statistics.
    print("\n最终统计:")
    print(f"总弹幕数: {len(danmakus)}")
    if danmakus:
        print("前10条弹幕示例:")
        # Show at most the first ten danmaku, numbered from 1.
        preview = danmakus[:10]
        for rank, text in enumerate(preview, start=1):
            print(f"{rank}. {text}")