You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

77 lines
2.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
from lxml import etree
import os
class BiliBiliDanMu:
    """Download the danmaku (on-screen comments) of one Bilibili video.

    The workflow is: fetch the video page, pull the ``cid`` out of the page
    source, request the comment XML for that ``cid``, collect the text of
    every ``<d>`` element and write one comment per line to a text file.
    """

    def __init__(self, bv, filename):
        """Store the video URL, the output path and the request headers.

        Args:
            bv: The video's BV identifier, with or without the leading "BV".
            filename: Path of the text file the comments are written to.
        """
        # Tolerate stray whitespace (e.g. pasted input) and drop an existing
        # "BV" prefix so the URL below never ends up as ".../BVBV...".
        bv = bv.strip()
        if bv.startswith("BV"):
            bv = bv[2:]
        # Build the URL of the video page to scrape from the BV identifier.
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        self.headers = {
            # Adjacent literals with explicit trailing spaces: a backslash
            # continuation inside a single literal would fuse "x64)" and
            # "AppleWebKit" (or embed the source indentation) in the header.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44"
            )
        }

    def get_video_cid(self):
        """Fetch the video page and return its cid.

        Returns:
            The first ``"cid":<digits>`` value found in the page source, as a
            string of digits, or ``None`` when the request does not return
            HTTP 200 or the page contains no such field.
        """
        response = requests.get(self.video_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None
        # Only the digits after "cid": matter, so an undecodable byte elsewhere
        # in the page should not abort the whole crawl.
        html = response.content.decode(errors="replace")
        # Print the start of the page source as a debugging aid.
        print(f"HTML 内容(前500字符): {html[:500]}")
        match = re.search(r'"cid":([0-9]+)', html)
        # Some videos do not expose this field; the caller skips them.
        if match is None:
            print("未找到 cid")
            return None
        return match.group(1)

    def get_content(self, xml_url):
        """Request the danmaku XML document and return the raw response body.

        Args:
            xml_url: URL of the comment XML file.

        Returns:
            The response body as bytes. The HTTP status is not inspected here.
        """
        response = requests.get(xml_url, headers=self.headers, timeout=10)
        return response.content

    def extract_danmu(self, content_str):
        """Parse the downloaded document and return all danmaku texts.

        Args:
            content_str: The document returned by ``get_content``.

        Returns:
            A list with the text of every ``<d>`` element; an empty list when
            the document is empty or yields no root element.
        """
        if not content_str:
            return []
        root = etree.HTML(content_str)
        # A blank document can parse to no root element at all; treat that
        # as "no comments" instead of failing on ``None.xpath``.
        if root is None:
            return []
        return root.xpath("//d/text()")

    def save(self, save_items):
        """Write the danmaku to ``self.filename``, one item per line (UTF-8).

        Args:
            save_items: Iterable of comment strings.
        """
        # Make sure the output directory exists. ``dirname`` is "" for a bare
        # file name, and ``os.makedirs("")`` raises FileNotFoundError, so only
        # create a directory when the path actually names one.
        output_dir = os.path.dirname(self.filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines(item + '\n' for item in save_items)
        print(f"弹幕已保存至 {self.filename}")

    def crawl(self):
        """Run the whole pipeline: look up the cid, download, parse and save."""
        cid = self.get_video_cid()
        # Skip videos for which no cid could be determined.
        if cid is None:
            print("视频没有有效的 cid跳过此视频")
            return
        # Use HTTPS, like the video page request, so the comment file is not
        # fetched over a plain-text connection.
        xml_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        content_str = self.get_content(xml_url)
        danmu_lst = self.extract_danmu(content_str)
        self.save(danmu_lst)
if __name__ == '__main__':
    # Script entry point: ask for a BV id, then dump that video's danmaku
    # into a text file named after the id.
    video_id = input("请输入视频的 bv 号: ")
    # The output location is a fixed local directory; only the file name
    # varies with the entered id.
    output_path = 'E:/前端/软件工程/{}.txt'.format(str(video_id))
    BiliBiliDanMu(video_id, output_path).crawl()