You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

77 lines
2.8 KiB

import requests
import re
from lxml import etree
import os
class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of one Bilibili video to a text file.

    The workflow is: fetch the video page, extract the numeric ``cid`` from the
    page source, download the comment XML for that ``cid``, collect the text of
    every ``<d>`` element in it, and write one comment per line.
    """

    def __init__(self, bv, filename):
        """Store the target video URL, the output path and the request headers.

        Args:
            bv: The video's BV id, with or without the leading ``BV`` prefix.
                Surrounding whitespace is ignored.
            filename: Path of the text file the comments are written to.
        """
        bv = bv.strip()
        # Drop an existing prefix (upper or lower case) so the URL built below
        # never ends up with a doubled prefix such as "BVBV..." or "BVbv...".
        if bv[:2].upper() == "BV":
            bv = bv[2:]
        # Build the URL of the video page to crawl from the BV id.
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        self.headers = {
            # Adjacent literals are concatenated, so the header value does not
            # depend on how the continuation lines are indented.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44"
            ),
        }

    def get_video_cid(self):
        """Fetch the video page and return the first ``cid`` found in it.

        Returns:
            The cid as a string of digits, or ``None`` when the page request
            does not answer with HTTP 200 or the page source contains no
            ``"cid":<digits>`` field.
        """
        response = requests.get(self.video_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None
        html = response.content.decode()
        # Print the beginning of the page source as a debugging aid.
        print(f"HTML 内容(前500字符): {html[:500]}")
        match = re.search(r'"cid":([0-9]+)', html)
        # Some videos have no such field; report it and let the caller skip.
        if match is None:
            print("未找到 cid")
            return None
        return match.group(1)

    def get_content(self, xml_url):
        """Request ``xml_url`` and return the raw response body as bytes.

        The HTTP status code is not checked here; whatever body the server
        sends back is returned.
        """
        response = requests.get(xml_url, headers=self.headers, timeout=10)
        return response.content

    def extract_danmu(self, content_str):
        """Return the text of every ``<d>`` element in the downloaded content.

        Args:
            content_str: The comment document as returned by ``get_content``.

        Returns:
            A list with one string per comment; empty when there is nothing
            to parse.
        """
        if not content_str:
            return []
        # The document is parsed with lxml's HTML parser, as in the original.
        root = etree.HTML(content_str)
        # Guard against a missing root so the xpath call cannot hit ``None``.
        if root is None:
            return []
        return root.xpath("//d/text()")

    def save(self, save_items):
        """Write the comments to ``self.filename``, one per line, as UTF-8.

        The parent directory is created when it does not exist yet. An
        existing file is overwritten.

        Args:
            save_items: Iterable of comment strings.
        """
        output_dir = os.path.dirname(self.filename)
        # dirname is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines(f"{item}\n" for item in save_items)
        print(f"弹幕已保存至 {self.filename}")

    def crawl(self):
        """Run the whole pipeline: resolve the cid, download, parse and save."""
        cid = self.get_video_cid()
        # Skip videos for which no cid could be determined.
        if cid is None:
            print("视频没有有效的 cid跳过此视频")
            return
        xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
        content_str = self.get_content(xml_url)
        danmu_lst = self.extract_danmu(content_str)
        self.save(danmu_lst)
if __name__ == '__main__':
    # Ask the user which video to crawl.
    video_id = input("请输入视频的 bv 号: ")
    # The output file is named after the id exactly as typed and placed in
    # the fixed output folder.
    output_path = 'E:/前端/软件工程/{}.txt'.format(video_id)
    BiliBiliDanMu(video_id, output_path).crawl()