import requests
from bs4 import BeautifulSoup
import re
import time
import random
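# Note: get_danmaku() below parses XML with BeautifulSoup's 'xml' parser,
# which requires the lxml package to be installed (pip install lxml).
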
class BilibiliCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
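        # The shared Session reuses these browser-like headers (and any
        # cookies Bilibili sets) across every request below.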
    def search_videos(self, keyword, pages=2):
        """Search for videos and collect their BV ids."""
        print("Searching for videos...")
        video_ids = []
        for page in range(1, pages + 1):
            try:
                url = 'https://search.bilibili.com/all'
                params = {'keyword': keyword, 'page': page, 'order': 'totalrank'}
                response = self.session.get(url, params=params, timeout=10)
                response.encoding = 'utf-8'
                soup = BeautifulSoup(response.text, 'html.parser')
                video_links = soup.find_all('a', href=re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)'))
                for link in video_links:
                    href = link.get('href')
                    if 'BV' in href:
                        bv_match = re.search(r'BV[0-9A-Za-z]+', href)
                        if bv_match:
                            video_ids.append(bv_match.group())
                print(f'Page {page} done, found {len(video_links)} video links')
                time.sleep(random.uniform(1, 2))
            except Exception as e:
                print(f'Error while searching page {page}: {e}')
                continue
        video_ids = list(set(video_ids))
        print(f'Found {len(video_ids)} unique videos in total')
        return video_ids[:10]  # Keep only the first 10 videos
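    # Hedged alternative: modern Bilibili renders search results with
    # JavaScript, so the HTML scraping above can come back empty. The JSON
    # search endpoint below is taken from community API documentation
    # (bilibili-API-collect); treat it as an unverified assumption, and note
    # it may reject cookie-less requests with code -412.
    def search_videos_api(self, keyword, pages=2):
        """Sketch: search via the JSON API instead of scraping HTML."""
        video_ids = []
        for page in range(1, pages + 1):
            try:
                params = {'search_type': 'video', 'keyword': keyword, 'page': page}
                response = self.session.get(
                    'https://api.bilibili.com/x/web-interface/search/type',
                    params=params, timeout=10)
                data = response.json()
                if data.get('code') == 0:
                    for item in data.get('data', {}).get('result', []):
                        if item.get('bvid'):
                            video_ids.append(item['bvid'])
                time.sleep(random.uniform(1, 2))
            except Exception as e:
                print(f'API search failed on page {page}: {e}')
        return list(set(video_ids))[:10]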
    def get_danmaku(self, bvid):
        """Fetch the danmaku (bullet comments) for one video."""
        try:
            # First resolve the video's cid from its BV id
            video_info_url = 'https://api.bilibili.com/x/web-interface/view'
            params = {'bvid': bvid}
            response = self.session.get(video_info_url, params=params, timeout=10)
            video_info = response.json()
            if video_info['code'] != 0:
                print(f'Could not fetch info for video {bvid}')
                return []
            cid = video_info['data']['cid']
            # Fetch the danmaku XML for that cid
            danmaku_url = 'https://api.bilibili.com/x/v1/dm/list.so'
            params = {'oid': cid}
            response = self.session.get(danmaku_url, params=params, timeout=10)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'xml')  # 'xml' parser needs lxml
            danmakus = soup.find_all('d')
            danmaku_list = []
            for danmaku in danmakus:
                text = danmaku.get_text()
                if text and len(text.strip()) > 0:
                    danmaku_list.append(text.strip())
            print(f'Video {bvid}: got {len(danmaku_list)} danmaku')
            return danmaku_list
        except Exception as e:
            print(f'Error while fetching danmaku for video {bvid}: {e}')
            return []
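    # The danmaku XML also carries metadata: each <d> element has a 'p'
    # attribute packing comma-separated fields. Per community documentation
    # (an assumption, not verified here), the first field is the appear time
    # in seconds and the fifth is the send timestamp.
    @staticmethod
    def parse_danmaku_meta(danmaku_tag):
        """Sketch: extract appear time and send timestamp from a <d> tag."""
        fields = (danmaku_tag.get('p') or '').split(',')
        if len(fields) < 5:
            return {}
        return {
            'appear_sec': float(fields[0]),  # offset into the video
            'sent_at': int(fields[4]),       # unix timestamp (assumed)
        }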
    def crawl(self, keywords, output_file='danmaku.txt'):
        """Main crawl loop: search each keyword, then fetch danmaku per video."""
        all_danmakus = []
        for keyword in keywords:
            print(f'\nStarting crawl for keyword: {keyword}')
            video_ids = self.search_videos(keyword)
            for i, bvid in enumerate(video_ids):
                print(f'Processing video {i+1}/{len(video_ids)}: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)
                # Save progress after every video
                with open(output_file, 'w', encoding='utf-8') as f:
                    for danmaku in all_danmakus:
                        f.write(danmaku + '\n')
                time.sleep(random.uniform(0.5, 1.0))
        print(f'\nCrawl finished! Collected {len(all_danmakus)} danmaku in total')
        print(f'Data saved to: {output_file}')
        # Show the first few danmaku as a sample
        if all_danmakus:
            print('\nFirst 5 danmaku:')
            for i, dm in enumerate(all_danmakus[:5]):
                print(f'{i+1}. {dm}')
        return all_danmakus
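
# Optional post-processing sketch: a frequency count over the collected
# danmaku using only the standard library. For real Chinese text analysis a
# word segmenter such as jieba would be more appropriate; none is assumed here.
def top_danmaku(danmakus, n=10):
    """Return the n most frequent exact danmaku strings."""
    from collections import Counter
    return Counter(danmakus).most_common(n)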
# Main program
if __name__ == '__main__':
    print('=== Bilibili danmaku crawler starting ===')
    crawler = BilibiliCrawler()
    # Search keywords ('大语言模型' is "large language model" in Chinese)
    keywords = ['大语言模型', 'LLM']
    # Start crawling
    danmakus = crawler.crawl(keywords, 'bilibili_danmaku.txt')
    print('\n=== Program finished ===')