|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
from lxml import etree
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
class BiliBiliDanMu:
    """Download the danmaku (bullet comments) of one Bilibili video to a text file.

    The workflow, driven by :meth:`crawl`, is: fetch the video page, pull the
    numeric ``cid`` out of its HTML, download ``<cid>.xml`` from the comment
    endpoint, collect the text of every ``<d>`` element and write one comment
    per line to ``filename``.
    """

    def __init__(self, bv, filename):
        """Remember which video to crawl and where to store the result.

        Args:
            bv: Video id, with or without the leading ``"BV"``.
            filename: Path of the text file the comments are written to.
        """
        # Surrounding whitespace (e.g. from a pasted id) would corrupt the URL.
        bv = bv.strip()
        # Drop an existing "BV" prefix so the URL below never contains "BVBV".
        if bv.startswith("BV"):
            bv = bv[2:]
        # Page URL of the video to crawl, built from the bare id.
        self.video_url = "https://bilibili.com/video/BV" + bv
        self.filename = filename
        # Adjacent literals instead of a backslash continuation inside the
        # string: the header value no longer depends on source indentation.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44"
            ),
        }

    def get_video_cid(self):
        """Fetch the video page and return the video's cid.

        Returns:
            The digits following the first ``"cid":`` in the page, as a
            string, or ``None`` when the response status is not 200 or the
            page contains no such field.
        """
        response = requests.get(self.video_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None

        html = response.content.decode()
        # Print the beginning of the page to help with debugging.
        print(f"HTML 内容(前500字符): {html[:500]}")

        # Some videos do not expose this field; those are skipped.
        match = re.search(r'"cid":([0-9]+)', html)
        if match is None:
            print("未找到 cid")
            return None
        return match.group(1)

    def get_content(self, xml_url):
        """Request the danmaku XML file and return the raw response body.

        Args:
            xml_url: URL of the XML document to download.

        Returns:
            The response body as bytes.
        """
        response = requests.get(xml_url, headers=self.headers, timeout=10)
        return response.content

    def extract_danmu(self, content_str):
        """Parse the downloaded document into a list of comment texts.

        Args:
            content_str: Document returned by :meth:`get_content`.

        Returns:
            The text of every ``<d>`` element, in document order; an empty
            list when the parser produced no document.
        """
        html = etree.HTML(content_str)
        # NOTE(review): etree.HTML appears to return None for input it cannot
        # build a tree from (e.g. an empty body) - confirm against lxml docs.
        if html is None:
            return []
        return html.xpath("//d/text()")

    def save(self, save_items):
        """Write the comments to ``self.filename``, one per line (UTF-8).

        Args:
            save_items: Iterable of comment strings.
        """
        # Make sure the output directory exists. dirname is "" for a bare
        # file name, and os.makedirs("") raises FileNotFoundError, so the
        # directory is only created when there is one.
        output_dir = os.path.dirname(self.filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(self.filename, 'w', encoding='utf-8') as f:
            f.writelines(item + '\n' for item in save_items)
        print(f"弹幕已保存至 {self.filename}")

    def crawl(self):
        """Run the whole pipeline: cid lookup, download, parse, save."""
        cid = self.get_video_cid()
        # Skip videos for which no cid could be found.
        if cid is None:
            print("视频没有有效的 cid,跳过此视频")
            return

        xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
        content_str = self.get_content(xml_url)
        danmu_lst = self.extract_danmu(content_str)
        self.save(danmu_lst)
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ask for the video id on stdin; input() already returns a string.
    video_id = input("请输入视频的 bv 号: ")
    # Build the output path from the id exactly as it was typed.
    output_path = 'E:/前端/软件工程/{}.txt'.format(video_id)
    # Download the video's danmaku and write it to the output file.
    BiliBiliDanMu(video_id, output_path).crawl()
|