import requests
import re
import xml.etree.ElementTree as ET
from DrissionPage import ChromiumPage
from bs4 import BeautifulSoup
from collections import OrderedDict
import pandas as pd

headers = {
    # Cookie carries the logged-in session; Bilibili uses it to check whether the account is signed in
    "cookie": "buvid4=B1349383-F2A6-E4E5-ED2C-A5B428CC0ED955473-022061918-9v4qmUz9VFkZrXyhSPgawQ%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO2216562263398747; is-2022-channel=1; CURRENT_FNVAL=4048; DedeUserID=406267828; DedeUserID__ckMd5=a7899a47ba8a07ab; enable_web_push=DISABLE; rpdid=|(u))kkYu||u0J'u~|JkJR)ul; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_QUALITY=80; buvid3=3D0F285C-7560-9E88-C05A-1AFF4D9CAD7617643infoc; b_nut=1718944717; _uuid=BAAADDA5-4DDD-184B-DB310-EC4105D97D51E17974infoc; header_theme_version=CLOSE; fingerprint=5d019e2bddbce042baa44b97851bb2fd; buvid_fp=5d019e2bddbce042baa44b97851bb2fd; PVID=3; SESSDATA=e97fd178%2C1741840358%2C6c71f%2A91CjB7QCyKSGsn3CjY0C0HRTQF_8a0AYz9r9QLq2FL2YiXLfQFehx5MdXHV3EDcePi4mwSVlM3TVE3LWcyczhLbXoycnhOTWtwa2g5OEl4RVdxOVRQZjljWGJ5d2czbFZpTkVhd3hfRnFmYTE4TVRISTlkdXFpemd1SmhOejNnVm1mYlJ0UDZEcTNBIIEC; bili_jct=4b351ffc7724b0bb10c029efdfad7f75; sid=6jnrbe6l; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1NDc1NzEsImlhdCI6MTcyNjI4ODMxMSwicGx0IjotMX0.l-PQPoCUFvsMgRpDwqUKwUY9jLLjI-p-HZ1Qaq7AIjI; bili_ticket_expires=1726547511; b_lsid=1564BAD10_191F00CC884; bp_t_offset_406267828=976996173829111808; home_feed_column=5; browser_resolution=2048-1030",
    # User-Agent identifies the browser/device making the request
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}
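# Note: the cookie above is account-specific and time-limited (SESSDATA expires);
# if requests start coming back as not-logged-in, replace it with a fresh cookie
# copied from your own logged-in browser session.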

def get_video_details(search_url):
    """Fetch a search-result page and extract the unique BV ids on it."""
    response = requests.get(search_url, headers=headers)
    # Raise if the request failed
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # A BV id is "BV" followed by 10 alphanumeric characters
    bv_pattern = re.compile(r'BV[1-9A-Za-z]{10}')
    bv_l = []
    for tag in soup.find_all('a', href=True):
        match = bv_pattern.search(tag['href'])
        if match:  # skip hrefs that contain no full BV id
            bv_l.append(match.group())
    # Deduplicate while preserving first-seen order
    return list(OrderedDict.fromkeys(bv_l))
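# Usage sketch (the BV id shown is illustrative, not from a real page):
#   get_video_details('https://search.bilibili.com/all?keyword=...')
#   -> ['BV1xx411c7mD', ...]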

def get_video_info(bv_id):
    """Look up a video by its BV id and return its CID (the danmaku pool id)."""
    url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    response.raise_for_status()
    data = response.json()
    # The CID sits under data -> cid in the JSON payload
    cid = data['data']['cid']
    return cid
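# For reference, the view API responds with JSON shaped roughly like this
# (abridged; values are illustrative):
#   {"code": 0, "message": "0", "data": {"bvid": "BV...", "cid": 123456789, "title": "...", ...}}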

def get_danmaku_xml(cid):
    """Download the danmaku XML document for the given CID."""
    url = f'https://comment.bilibili.com/{cid}.xml'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    # Raise if the request failed
    response.raise_for_status()
    return response.text
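# The XML served by comment.bilibili.com is shaped roughly like this (attribute
# values are illustrative; each <d> element holds one danmaku, with its timing
# and style metadata packed into the "p" attribute):
#   <i>
#     <d p="23.8,1,25,16777215,...">danmaku text</d>
#     ...
#   </i>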

def parse_danmaku_xml(xml_str):
    """Parse the danmaku XML and return the text of every <d> element."""
    root = ET.fromstring(xml_str)
    danmaku_list = [d.text for d in root.findall('.//d') if d.text]
    return danmaku_list

def save_danmaku_to_excel(danmaku_content, file_name='danmaku_content.xlsx'):
    """
    Append danmaku content to an Excel file.
    :param danmaku_content: list of danmaku strings
    :param file_name: Excel file name
    """
    try:
        # Load the existing file if there is one
        existing_df = pd.read_excel(file_name, engine='openpyxl')
    except FileNotFoundError:
        # No file yet: start from an empty DataFrame
        existing_df = pd.DataFrame(columns=['content'])
    # Wrap the new danmaku in a DataFrame
    new_df = pd.DataFrame(danmaku_content, columns=['content'])
    # Append the new rows to the existing ones
    updated_df = pd.concat([existing_df, new_df], ignore_index=True)
    # Write everything back to the Excel file
    updated_df.to_excel(file_name, index=False)
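# Note: read_excel above explicitly uses the openpyxl engine, and to_excel
# defaults to openpyxl for .xlsx output, so openpyxl must be installed
# alongside pandas.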

def savef(video_url):
    """Fetch all danmaku for one video URL and append them to the Excel file."""
    # Pull the BV id out of the URL
    bv_id = re.search(r'BV\w+', video_url).group(0)
    # Resolve the BV id to the video's CID
    cid = get_video_info(bv_id)
    # Download and parse the danmaku
    xml_data = get_danmaku_xml(cid)
    danmakus = parse_danmaku_xml(xml_data)
    save_danmaku_to_excel(danmakus, file_name='danmaku_content.xlsx')
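# Usage sketch: savef('https://www.bilibili.com/video/BV1xx411c7mD/') would
# append that video's danmaku to danmaku_content.xlsx (BV id illustrative).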

# Search-result page for the keyword "2024巴黎奥运会" (2024 Paris Olympics)
driver = ChromiumPage()
url = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=36"
driver.get(url)

j = 0
done = False
for page in range(8):
    # Read the browser's current URL so the next-page clicks below take effect
    bv_list = get_video_details(driver.url)
    for bv in bv_list:
        if j == 300:  # stop after 300 videos in total
            done = True
            break
        j += 1
        video_url = f'https://www.bilibili.com/video/{bv}/?spm_id_from=333.337'
        print(f"v{j}")
        # Scrape and save this video's danmaku
        savef(video_url)
    if done:
        break
    # Click "next page". The leading dots make these CSS class selectors; the
    # prev/next side buttons share the classes, and next comes last in the DOM.
    driver.eles('css:.vui_button.vui_pagenation--btn.vui_pagenation--btn-side')[-1].click()