"""Crawl Bilibili search results and collect the danmaku (bullet comments)."""

import random
import re
import time

import requests
from bs4 import BeautifulSoup

_SEARCH_URL = 'https://search.bilibili.com/all'
_VIEW_API_URL = 'https://api.bilibili.com/x/web-interface/view'
_DANMAKU_URL = 'https://api.bilibili.com/x/v1/dm/list.so'

# Matches a video link and captures its BV id in group 1.
_VIDEO_LINK_RE = re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)')


class BilibiliCrawler:
    """Search Bilibili for videos and download their danmaku."""

    def __init__(self):
        # Request headers that imitate a regular browser visit.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def search_videos(self, keyword, pages=3, max_videos=20):
        """Search for videos and return their BV ids.

        Args:
            keyword: Search term sent to the Bilibili search page.
            pages: Number of result pages to fetch, starting at page 1.
            max_videos: Upper bound on the number of ids returned, to keep
                the number of follow-up requests small. ``None`` means no
                limit.

        Returns:
            A list of unique BV ids in the order they were first seen, so
            truncation keeps the highest-ranked results. Pages that fail to
            load are reported on stdout and skipped.
        """
        print("正在搜索视频...")
        video_ids = []

        for page in range(1, pages + 1):
            try:
                params = {
                    'keyword': keyword,
                    'page': page,
                    'order': 'totalrank'  # overall ranking
                }
                response = self.session.get(_SEARCH_URL, params=params, timeout=10)
                # Surface HTTP errors instead of parsing an error page as
                # "0 videos found".
                response.raise_for_status()
                response.encoding = 'utf-8'

                soup = BeautifulSoup(response.text, 'html.parser')
                video_links = soup.find_all('a', href=_VIDEO_LINK_RE)
                for link in video_links:
                    match = _VIDEO_LINK_RE.search(link.get('href'))
                    if match:
                        video_ids.append(match.group(1))

                print(f'第{page}页搜索完成,找到{len(video_links)}个视频')
            except Exception as e:
                # Best effort: one broken page must not abort the search.
                print(f'搜索第{page}页时出错: {e}')

            # Random delay after every page (including failed ones) so that
            # errors do not turn into back-to-back requests.
            time.sleep(random.uniform(1, 3))

        # De-duplicate while preserving the search ranking order; a set
        # would make the truncation below pick an arbitrary subset.
        unique_ids = list(dict.fromkeys(video_ids))
        print(f'共找到{len(unique_ids)}个唯一视频')
        return unique_ids[:max_videos]

    def get_danmaku(self, bvid):
        """Fetch the danmaku of one video.

        Args:
            bvid: BV id of the video.

        Returns:
            A list of non-empty, whitespace-stripped danmaku strings. An
            empty list is returned (and a message printed) when the video
            info cannot be retrieved or any request/parsing step fails.
        """
        try:
            # The danmaku endpoint is keyed by cid, so look that up first.
            response = self.session.get(
                _VIEW_API_URL, params={'bvid': bvid}, timeout=10
            )
            response.raise_for_status()
            video_info = response.json()

            if video_info['code'] != 0:
                print(f"无法获取视频{bvid}信息")
                return []

            cid = video_info['data']['cid']

            response = self.session.get(
                _DANMAKU_URL, params={'oid': cid}, timeout=10
            )
            response.raise_for_status()
            response.encoding = 'utf-8'

            # The danmaku are delivered as XML; each <d> element is one entry.
            soup = BeautifulSoup(response.text, 'xml')
            stripped = (node.get_text().strip() for node in soup.find_all('d'))
            danmaku_list = [text for text in stripped if text]

            print(f'视频{bvid}获取到{len(danmaku_list)}条弹幕')
            return danmaku_list

        except Exception as e:
            # Best effort: a single failing video must not stop the crawl.
            print(f'获取视频{bvid}弹幕时出错: {e}')
            return []

    def crawl(self, keywords, output_file='danmaku.txt'):
        """Crawl the danmaku for every keyword and save them to a file.

        Progress is persisted after each video: the file is truncated on the
        first write and appended to afterwards, one danmaku per line. A video
        that shows up for several keywords is processed only once.

        Args:
            keywords: Iterable of search terms.
            output_file: Path of the UTF-8 text file to write.

        Returns:
            The list of all danmaku collected, in crawl order.
        """
        all_danmakus = []
        seen_bvids = set()
        write_mode = 'w'  # truncate once, then append

        for keyword in keywords:
            print(f'\n开始爬取关键词: {keyword}')
            video_ids = [
                bvid for bvid in self.search_videos(keyword)
                if bvid not in seen_bvids
            ]
            seen_bvids.update(video_ids)

            for index, bvid in enumerate(video_ids, start=1):
                print(f'正在处理第{index}/{len(video_ids)}个视频: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)

                # Save progress: only the new entries need to be written.
                with open(output_file, write_mode, encoding='utf-8') as f:
                    f.writelines(f'{danmaku}\n' for danmaku in danmakus)
                write_mode = 'a'

                # Throttle between videos.
                time.sleep(random.uniform(0.5, 1.5))

        print(f'\n爬取完成!共获取{len(all_danmakus)}条弹幕,已保存到{output_file}')
        return all_danmakus


def main():
    """Run the crawler for a fixed keyword list and print a short summary."""
    crawler = BilibiliCrawler()

    # Search keywords.
    keywords = ['大语言模型', 'LLM', '大模型']

    danmakus = crawler.crawl(keywords, 'bilibili_danmaku.txt')

    # Print a few statistics.
    print("\n最终统计:")
    print(f"总弹幕数: {len(danmakus)}")
    if danmakus:
        print("前10条弹幕示例:")
        for i, dm in enumerate(danmakus[:10]):
            print(f"{i+1}. {dm}")


if __name__ == '__main__':
    main()