# BliBili_danmu_crawl/release/crawl_1.py

import requests
import re
from lxml import etree
import os

class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of one Bilibili video.

    The crawl fetches the video page, extracts the numeric ``cid`` from the
    page source, downloads the comment XML for that ``cid`` and writes the
    text of every ``<d>`` element to ``filename``, one comment per line.
    """

    def __init__(self, bv: str, filename: str) -> None:
        """Store the target video URL, output path and request headers.

        Args:
            bv: The video's BV id, with or without the leading ``"BV"``.
                Surrounding whitespace is ignored.
            filename: Path of the text file the comments are written to.
        """
        # Input typed at a prompt often carries stray whitespace; it must not
        # leak into the URL.
        bv = bv.strip()
        # Drop an existing "BV" prefix so it is never duplicated below.
        if bv.startswith("BV"):
            bv = bv[2:]
        # Build the URL of the video page to crawl from the BV id.
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        # Adjacent literals are concatenated at compile time. A backslash
        # continuation *inside* a string literal would embed the next line's
        # indentation in the header value.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44"
            ),
        }

    def get_video_cid(self):
        """Fetch the video page and return the video's ``cid``.

        Returns:
            The ``cid`` as a string of digits, or ``None`` when the request
            does not return HTTP 200 or the page contains no ``"cid":<digits>``
            field.

        Raises:
            requests.RequestException: Propagated from ``requests.get``
                (connection errors, timeout after 10 seconds).
            UnicodeDecodeError: If the page body is not valid UTF-8.
        """
        response = requests.get(self.video_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败，状态码: {response.status_code}")
            return None

        html = response.content.decode()
        # Print the start of the page source as a debugging aid.
        print(f"HTML 内容(前500字符): {html[:500]}")
        # Only the first occurrence is needed, so search instead of findall.
        match = re.search(r'"cid":([0-9]+)', html)
        # Some videos do not have this field; they are skipped.
        if match is None:
            print("未找到 cid")
            return None
        return match.group(1)

    def get_content(self, xml_url):
        """Download the danmaku XML document.

        Args:
            xml_url: URL of the comment XML file.

        Returns:
            The raw response body as ``bytes``, or ``None`` when the server
            does not answer with HTTP 200 (an error page must not be parsed
            and saved as if it were comments).

        Raises:
            requests.RequestException: Propagated from ``requests.get``.
        """
        response = requests.get(xml_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败，状态码: {response.status_code}")
            return None
        return response.content

    def extract_danmu(self, content_str):
        """Parse the downloaded document and return all comment texts.

        Args:
            content_str: The document as returned by ``get_content``. ``None``
                or empty content yields an empty list.

        Returns:
            A list with the text of every ``<d>`` element in the document.
        """
        # Nothing to parse; avoid handing an empty document to the parser.
        if not content_str:
            return []
        root = etree.HTML(content_str)
        # Guard against a parser result without a root element.
        if root is None:
            return []
        return root.xpath("//d/text()")

    def save(self, save_items) -> None:
        """Write the comments to ``self.filename``, one per line (UTF-8).

        The parent directory is created when it does not exist. An existing
        file is overwritten.

        Args:
            save_items: Iterable of comment strings.
        """
        output_dir = os.path.dirname(self.filename)
        # dirname is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines(f"{item}\n" for item in save_items)
        print(f"弹幕已保存至 {self.filename}")

    def crawl(self) -> None:
        """Run the whole pipeline: resolve cid, download, parse and save.

        Prints a message and returns without writing a file when the ``cid``
        cannot be determined or the comment XML cannot be downloaded.
        """
        cid = self.get_video_cid()
        # Skip videos without a cid field.
        if cid is None:
            print("视频没有有效的 cid，跳过此视频")
            return

        xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
        content = self.get_content(xml_url)
        if content is None:
            print("获取弹幕内容失败，跳过此视频")
            return

        self.save(self.extract_danmu(content))

if __name__ == '__main__':
    # Ask for the video id, then crawl it into a text file named after the
    # id exactly as it was typed.
    video_id = input("请输入视频的 bv 号: ")
    # The output location is a fixed directory; only the file name varies.
    output_path = 'E:/前端/软件工程/{}.txt'.format(video_id)
    BiliBiliDanMu(video_id, output_path).crawl()