102201539/main.py

"""
main.py - 从 Bilibili 获取视频弹幕并保存到 Excel 文件中
"""

import re
import time
import xml.etree.ElementTree as ET
import requests
import pandas as pd
from bs4 import BeautifulSoup

def search_videos(keyword, max_results=300):
    """
    Search Bilibili for ``keyword`` and collect video IDs from the result pages.

    Result pages are requested one after another, with a one-second pause
    between them, until ``max_results`` IDs have been collected, a page
    contributes no new video ID, the server answers with a non-200 status, or
    the request itself fails.

    :param keyword: search keyword; URL-encoding is delegated to ``requests``
    :param max_results: maximum number of video IDs to return
    :return: list of unique video IDs, in the order they were found
    """
    search_url = "https://search.bilibili.com/all"
    # Built once: the headers do not change between pages. Adjacent literals
    # are used instead of a backslash continuation inside the string, which
    # would embed the next line's indentation in the header value.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    # The ID is the path segment right after "/video/"; this works with or
    # without a trailing slash or query string and skips non-video links.
    id_pattern = re.compile(r"/video/([^/?#]+)")

    video_list = []  # collected IDs, in discovery order
    seen = set()  # IDs already collected, to drop duplicates across cards/pages
    page = 1
    while len(video_list) < max_results:
        try:
            response = requests.get(
                search_url,
                params={"keyword": keyword, "page": page},
                headers=headers,
                timeout=10,
            )
        except requests.exceptions.RequestException as req_error:
            print(f"搜索请求失败: {req_error}")
            break
        if response.status_code != 200:
            print("搜索页面获取失败，状态码:", response.status_code)
            break

        print(f"获取第 {page} 页搜索结果...")
        soup = BeautifulSoup(response.text, 'html.parser')
        found_new = False
        for item in soup.find_all('div', class_='bili-video-card'):
            title_tag = item.find('a', href=True)  # first link inside the card
            if not title_tag:
                continue
            link = title_tag['href']
            if link.startswith('//'):
                link = 'https:' + link  # protocol-relative link -> absolute
            id_match = id_pattern.search(link)
            if not id_match:
                continue  # not a video link
            video_id = id_match.group(1)
            if video_id in seen:
                continue
            seen.add(video_id)
            video_list.append(video_id)
            found_new = True
            print(f"视频 ID: {video_id}, 链接: {link}")
            if len(video_list) >= max_results:
                break

        if not found_new:
            # Without this check an empty (or fully duplicated) page would
            # keep the loop requesting further pages indefinitely.
            print(f"第 {page} 页没有新的视频，停止搜索。")
            break
        page += 1
        time.sleep(1)  # throttle between page requests
    print(f"找到 {len(video_list)} 个视频.")
    return video_list

def get_cid(video_id):
    """
    Fetch a video's page and extract the first ``"cid":<digits>`` value in it.

    :param video_id: video ID, inserted into the video page URL
    :return: the cid as a string of digits, or ``None`` when the request
        fails, the status is not 200, or no cid is present in the page
    """
    # HTTPS is requested directly instead of going through plain HTTP.
    video_url = f"https://www.bilibili.com/video/{video_id}"
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which would embed the next line's indentation in the header.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    try:
        response = requests.get(video_url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as req_error:
        print(f"获取视频页面失败: {req_error}")
        return None

    if response.status_code != 200:
        print("获取视频页面失败，状态码:", response.status_code)
        return None

    print("成功获取视频页面")
    match = re.search(r'"cid":(\d+)', response.text)  # first cid in the page text
    if not match:
        print("未找到 cid")
        return None
    cid = match.group(1)
    print(f"找到 cid: {cid}")
    return cid

def get_danmaku(video_id, cid):
    """
    Download the danmaku list for ``cid`` from the Bilibili danmaku endpoint.

    :param video_id: video ID, used only to build the ``Referer`` header
    :param cid: danmaku cid, sent as the ``oid`` query parameter
    :return: raw response body (``response.content``) on HTTP 200, otherwise
        ``None`` (non-200 status or request failure)
    """
    danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which would embed the next line's indentation in the header.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        ),
        "Referer": f"https://www.bilibili.com/video/{video_id}",
    }
    try:
        response = requests.get(danmaku_url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as req_error:
        print(f"获取弹幕数据失败: {req_error}")
        return None

    if response.status_code != 200:
        print("获取弹幕数据失败，状态码:", response.status_code)
        return None

    print("成功获取弹幕数据")
    return response.content

def parse_danmaku(xml_content):
    """
    Extract the text of every ``<d>`` element found below the XML root.

    :param xml_content: danmaku XML document (as accepted by ``ET.fromstring``)
    :return: list of non-empty ``<d>`` texts in document order; an empty list
        when the document cannot be parsed
    """
    try:
        tree_root = ET.fromstring(xml_content)
    except ET.ParseError as parse_error:
        print(f"解析弹幕数据时出错: {parse_error}")
        return []
    # Elements with no text (None or "") are skipped.
    return [node.text for node in tree_root.findall('.//d') if node.text]

def save_danmakus_to_excel(all_danmakus, filename):
    """
    Write the danmaku texts to an Excel file with a single "弹幕内容" column.

    Failures are reported on stdout instead of being raised, so a problem
    while saving does not abort the caller with a traceback.

    :param all_danmakus: iterable of danmaku texts, one row per item
    :param filename: path of the Excel file to write
    :return: ``None``
    """
    try:
        data_frame = pd.DataFrame(all_danmakus, columns=["弹幕内容"])
        data_frame.to_excel(filename, index=False)
    except FileNotFoundError:
        print(f"输出文件路径 {filename} 未找到，请检查路径是否正确。")
    except PermissionError:
        print(f"没有权限写入文件 {filename}。请检查文件权限。")
    except ValueError as value_error:
        print(f"数据转换错误: {value_error}. 请检查数据格式。")
    except ImportError as import_error:
        # Raised when the Excel writer engine (e.g. openpyxl) is not installed.
        print(f"缺少写入 Excel 所需的依赖: {import_error}")
    except OSError as os_error:
        # Other I/O failures, e.g. pandas reports a non-existent parent
        # directory as a plain OSError rather than FileNotFoundError.
        print(f"写入文件 {filename} 失败: {os_error}")
    else:
        print(f"所有弹幕数据已保存到 {filename}")
def main():
    """
    Run the whole pipeline: search videos, download and parse each video's
    danmaku, then write everything to ``all_danmakus.xlsx``.

    The search keyword is fixed in the body. A two-second pause follows every
    video, whether or not its danmaku could be fetched.
    """
    keyword = "2024巴黎奥运会"
    collected = []  # danmaku texts gathered from all videos
    for video_id in search_videos(keyword):
        cid = get_cid(video_id)
        print(f"正在获取视频 {video_id} 的弹幕...")
        if cid:
            payload = get_danmaku(video_id, cid)
            if not payload:
                print(f"获取弹幕失败，视频 ID: {video_id}")
            else:
                parsed = parse_danmaku(payload)
                print(f"获取到 {len(parsed)} 条弹幕")
                collected += parsed
        time.sleep(2)  # throttle between videos
    save_danmakus_to_excel(collected, "all_danmakus.xlsx")

if __name__ == "__main__":
    main()  # Run the pipeline only when this file is executed as a script