|
|
@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
main.py - 从 Bilibili 获取视频弹幕并保存到 Excel 文件中
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_videos(keyword, max_results=300):
    """
    Search Bilibili for a keyword and collect the IDs of the videos found.

    Result pages are requested one after another until ``max_results`` IDs
    have been collected, a page contributes no new video ID, a non-200
    status is returned, or the request itself fails.

    :param keyword: search keyword (URL-encoded by ``requests`` via ``params``)
    :param max_results: maximum number of video IDs to return
    :return: list of unique video IDs, in the order they were found
    """
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the next line's indentation in
    # the header value.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    video_list = []   # collected video IDs, in discovery order
    seen_ids = set()  # fast duplicate check
    page = 1

    while len(video_list) < max_results:
        try:
            response = requests.get(
                "http://search.bilibili.com/all",
                params={"keyword": keyword, "page": page},
                headers=headers,
                timeout=10,
            )
        except requests.exceptions.RequestException as req_error:
            print(f"搜索请求失败: {req_error}")
            break

        if response.status_code != 200:
            print("搜索页面获取失败,状态码:", response.status_code)
            break

        print(f"获取第 {page} 页搜索结果...")
        soup = BeautifulSoup(response.text, 'html.parser')

        new_on_page = 0
        for item in soup.find_all('div', class_='bili-video-card'):
            title_tag = item.find('a', href=True)
            if not title_tag:
                continue
            link = title_tag['href']
            if link.startswith('//'):
                link = 'https:' + link  # protocol-relative link
            # Take the path segment right after /video/; links that are not
            # video links are skipped instead of producing a bogus ID or an
            # IndexError.
            id_match = re.search(r'/video/([^/?#]+)', link)
            if id_match is None:
                continue
            video_id = id_match.group(1)
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            video_list.append(video_id)
            new_on_page += 1
            print(f"视频 ID: {video_id}, 链接: {link}")
            if len(video_list) >= max_results:
                break

        if new_on_page == 0:
            # Without this check an empty (or fully repeated) page would make
            # the loop request further pages forever.
            print(f"第 {page} 页没有新的视频,停止搜索。")
            break

        if len(video_list) < max_results:
            page += 1
            time.sleep(1)  # throttle before requesting the next page

    print(f"找到 {len(video_list)} 个视频.")
    return video_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cid(video_id):
    """
    Look up the danmaku CID of a video by scraping its page.

    The video page is downloaded and the first ``"cid":<digits>`` occurrence
    in the response text is used.

    :param video_id: video ID, used as the path segment after ``/video/``
    :return: the CID as a string of digits, or None when the ID is empty,
             the request fails, the status is not 200, or no CID is found
    """
    if not video_id:
        # An empty ID cannot identify a video; skip the request.
        print("视频 ID 为空,无法获取 cid")
        return None

    video_url = f"http://www.bilibili.com/video/{video_id}"
    # Adjacent string literals avoid embedding source indentation in the
    # header value, which a backslash continuation inside the literal does.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }

    try:
        response = requests.get(video_url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as req_error:
        print(f"获取视频页面失败: {req_error}")
        return None

    if response.status_code != 200:
        print("获取视频页面失败,状态码:", response.status_code)
        return None

    print("成功获取视频页面")
    match = re.search(r'"cid":(\d+)', response.text)
    if match is None:
        print("未找到 cid")
        return None

    cid = match.group(1)
    print(f"找到 cid: {cid}")
    return cid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_danmaku(video_id, cid):
    """
    Download the danmaku payload for a video.

    :param video_id: video ID, used only to build the ``Referer`` header
    :param cid: danmaku CID, sent as the ``oid`` query parameter
    :return: raw response body (bytes) on HTTP 200; None when ``cid`` is
             empty, the request fails, or the status is not 200
    """
    if not cid:
        # Without a CID the request would be sent with oid=None.
        print("cid 为空,无法获取弹幕数据")
        return None

    # Adjacent string literals avoid embedding source indentation in the
    # header value, which a backslash continuation inside the literal does.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        ),
        "Referer": f"https://www.bilibili.com/video/{video_id}",
    }

    try:
        response = requests.get(
            "https://api.bilibili.com/x/v1/dm/list.so",
            params={"oid": cid},
            headers=headers,
            timeout=10,
        )
    except requests.exceptions.RequestException as req_error:
        print(f"获取弹幕数据失败: {req_error}")
        return None

    if response.status_code != 200:
        print("获取弹幕数据失败,状态码:", response.status_code)
        return None

    print("成功获取弹幕数据")
    return response.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_danmaku(xml_content):
    """
    Extract the danmaku texts from a danmaku XML document.

    Every ``<d>`` element below the root is visited; elements with no text
    (or empty text) are skipped.

    :param xml_content: XML document as ``str`` or ``bytes``; None is
                        treated as "no data"
    :return: list of danmaku texts in document order; an empty list when
             the input is None or is not well-formed XML
    """
    if xml_content is None:
        # ET.fromstring(None) raises TypeError, which is not a ParseError.
        return []

    # Only the parse step can raise ParseError, so only it is guarded.
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as parse_error:
        print(f"解析弹幕数据时出错: {parse_error}")
        return []

    return [item.text for item in root.findall('.//d') if item.text]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_danmakus_to_excel(all_danmakus, filename):
    """
    Write the collected danmaku texts to an Excel file.

    The texts are stored in a single column named ``弹幕内容`` without an
    index column. Failures are reported on stdout instead of being raised,
    so a long scraping run does not end with a traceback.

    :param all_danmakus: iterable of danmaku texts
    :param filename: path of the Excel file to write
    :return: True when the file was written, False when writing failed
    """
    try:
        data_frame = pd.DataFrame(all_danmakus, columns=["弹幕内容"])
        data_frame.to_excel(filename, index=False)
    except FileNotFoundError:
        print(f"输出文件路径 {filename} 未找到,请检查路径是否正确。")
        return False
    except PermissionError:
        print(f"没有权限写入文件 {filename}。请检查文件权限。")
        return False
    except ValueError as value_error:
        print(f"数据转换错误: {value_error}. 请检查数据格式。")
        return False
    except OSError as os_error:
        # Any other OS-level failure (FileNotFoundError and PermissionError
        # are handled above) is reported the same way instead of escaping.
        print(f"写入文件 {filename} 失败: {os_error}")
        return False

    print(f"所有弹幕数据已保存到 {filename}")
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fetch_video_danmakus(video_id):
    """Return the danmaku texts of one video, or an empty list on failure."""
    cid = get_cid(video_id)
    xml_content = get_danmaku(video_id, cid) if cid else None
    if not xml_content:
        # Reported both when the cid lookup and when the download failed.
        print(f"获取弹幕失败,视频 ID: {video_id}")
        return []

    danmaku_items = parse_danmaku(xml_content)
    print(f"获取到 {len(danmaku_items)} 条弹幕")
    return danmaku_items


def main(keyword="2024巴黎奥运会", max_results=300, output_file="all_danmakus.xlsx"):
    """
    Search videos, download their danmaku and save everything to Excel.

    :param keyword: search keyword passed to ``search_videos``
    :param max_results: maximum number of videos to search for
    :param output_file: path of the Excel file to write
    """
    video_ids = search_videos(keyword, max_results)

    all_danmakus = []
    # dict.fromkeys drops repeated IDs while keeping the original order, so
    # no video is downloaded twice.
    for video_id in dict.fromkeys(video_ids):
        print(f"正在获取视频 {video_id} 的弹幕...")
        all_danmakus.extend(_fetch_video_danmakus(video_id))
        time.sleep(2)  # throttle between videos

    save_danmakus_to_excel(all_danmakus, output_file)


if __name__ == "__main__":
    main()
|