|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
import re
|
|
|
import time
|
|
|
import random
|
|
|
|
|
|
class BilibiliCrawler:
    """Crawl danmaku (bullet comments) from Bilibili videos found via keyword search.

    Workflow: search result pages are scraped for BV-style video ids, each id is
    resolved to a ``cid`` via the web-interface API, and the danmaku XML feed for
    that ``cid`` is parsed into plain-text comment strings.
    """

    # BV-style video id, e.g. "BV1xx411c7mD".
    _BV_RE = re.compile(r'BV[0-9A-Za-z]+')
    # Search-result link to a video page. NOTE: the dots are escaped; the original
    # pattern used bare '.' which matched any character.
    _LINK_RE = re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)')

    def __init__(self):
        # Browser-like UA and referer: Bilibili rejects obviously scripted requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        # One session so headers and cookies persist across all requests.
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def search_videos(self, keyword, pages=2, max_videos=10):
        """搜索视频,获取视频ID列表

        Search Bilibili for *keyword* over *pages* result pages.

        Args:
            keyword: search term.
            pages: number of result pages to scan (default 2).
            max_videos: cap on how many unique BV ids to return (default 10,
                matching the original hard-coded limit).

        Returns:
            list[str]: up to *max_videos* unique BV ids, in discovery order.
        """
        print("正在搜索视频...")
        video_ids = []

        for page in range(1, pages + 1):
            try:
                url = 'https://search.bilibili.com/all'
                params = {'keyword': keyword, 'page': page, 'order': 'totalrank'}

                response = self.session.get(url, params=params, timeout=10)
                response.encoding = 'utf-8'

                # NOTE(review): this assumes the search page is server-rendered;
                # if Bilibili serves it via JS, the anchor scan finds nothing.
                soup = BeautifulSoup(response.text, 'html.parser')
                video_links = soup.find_all('a', href=self._LINK_RE)

                for link in video_links:
                    href = link.get('href')
                    # Guard against attribute-less anchors (href could be None).
                    if href and 'BV' in href:
                        bv_match = self._BV_RE.search(href)
                        if bv_match:
                            video_ids.append(bv_match.group())

                print(f'第{page}页搜索完成,找到{len(video_links)}个视频')
                # Polite randomized delay between search pages.
                time.sleep(random.uniform(1, 2))

            except Exception as e:
                # Best-effort: report and continue with the remaining pages.
                print(f'搜索第{page}页时出错: {e}')
                continue

        # Deduplicate while preserving discovery order; the original
        # list(set(...)) made the [:max_videos] slice nondeterministic.
        video_ids = list(dict.fromkeys(video_ids))
        print(f'共找到{len(video_ids)}个唯一视频')
        return video_ids[:max_videos]  # 只取前10个视频

    def get_danmaku(self, bvid):
        """获取视频的弹幕

        Fetch all danmaku text for the video identified by *bvid*.

        Args:
            bvid: BV-style video id.

        Returns:
            list[str]: stripped, non-empty danmaku strings; empty list on any error.
        """
        try:
            # 先获取视频的cid — the danmaku feed is keyed by cid, not bvid.
            video_info_url = 'https://api.bilibili.com/x/web-interface/view'
            params = {'bvid': bvid}

            response = self.session.get(video_info_url, params=params, timeout=10)
            video_info = response.json()

            # code != 0 signals an API-level error (video gone, rate-limited, ...).
            if video_info['code'] != 0:
                print(f"无法获取视频{bvid}信息")
                return []

            cid = video_info['data']['cid']

            # 获取弹幕 — the legacy XML danmaku endpoint; each <d> element is
            # one comment.
            danmaku_url = 'https://api.bilibili.com/x/v1/dm/list.so'
            params = {'oid': cid}

            response = self.session.get(danmaku_url, params=params, timeout=10)
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'xml')

            danmaku_list = []
            for danmaku in soup.find_all('d'):
                text = danmaku.get_text().strip()
                if text:  # drop empty/whitespace-only comments
                    danmaku_list.append(text)

            print(f'视频{bvid} 获取到 {len(danmaku_list)} 条弹幕')
            return danmaku_list

        except Exception as e:
            # Best-effort: one failed video must not abort the whole crawl.
            print(f'获取视频{bvid}弹幕时出错: {e}')
            return []

    def crawl(self, keywords, output_file='danmaku.txt'):
        """主爬取函数

        Search each keyword, fetch danmaku for the videos found, and persist
        everything to *output_file* (UTF-8, one danmaku per line).

        Args:
            keywords: iterable of search terms.
            output_file: destination path (default 'danmaku.txt').

        Returns:
            list[str]: all danmaku collected across every keyword/video.
        """
        all_danmakus = []

        for keyword in keywords:
            print(f'\n开始爬取关键词: {keyword}')
            video_ids = self.search_videos(keyword)

            for i, bvid in enumerate(video_ids):
                print(f'正在处理第 {i+1}/{len(video_ids)} 个视频: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)

                # 保存进度 — rewrite the full file after each video so a crash
                # loses at most one video's worth of data.
                with open(output_file, 'w', encoding='utf-8') as f:
                    for danmaku in all_danmakus:
                        f.write(danmaku + '\n')

                # Short randomized delay between videos.
                time.sleep(random.uniform(0.5, 1.0))

        print(f'\n爬取完成!共获取 {len(all_danmakus)} 条弹幕')
        print(f'数据已保存到: {output_file}')

        # 显示前几条弹幕作为示例
        if all_danmakus:
            print("\n前5条弹幕示例:")
            for i, dm in enumerate(all_danmakus[:5]):
                print(f"{i+1}. {dm}")

        return all_danmakus
|
|
|
|
|
|
# Script entry point: crawl danmaku for a fixed set of search keywords.
if __name__ == '__main__':
    print("=== B站弹幕爬虫启动 ===")

    # Keywords to search for, then hand everything to the crawler.
    search_terms = ['大语言模型', 'LLM']

    bot = BilibiliCrawler()
    collected = bot.crawl(search_terms, 'bilibili_danmaku.txt')

    print("\n=== 程序执行完毕 ===")