|
|
|
|
import os
import re
import xml.etree.ElementTree as ET
from collections import OrderedDict

import pandas as pd
import requests
from bs4 import BeautifulSoup
from DrissionPage import ChromiumPage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HTTP headers passed to every requests.get() call in this file.
#
# NOTE(review): the fallback cookie below contains what look like real account
# session credentials (SESSDATA, bili_jct, DedeUserID). They should not live in
# source control; set the BILIBILI_COOKIE environment variable to supply a
# cookie without editing this file, and consider rotating the embedded one.
headers = {
    # cookie: user/session information, used to detect whether the user is
    # logged in. Overridable through the BILIBILI_COOKIE environment variable;
    # the literal is kept only as a backward-compatible default.
    "cookie": os.environ.get(
        "BILIBILI_COOKIE",
        "buvid4=B1349383-F2A6-E4E5-ED2C-A5B428CC0ED955473-022061918-9v4qmUz9VFkZrXyhSPgawQ%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO2216562263398747; is-2022-channel=1; CURRENT_FNVAL=4048; DedeUserID=406267828; DedeUserID__ckMd5=a7899a47ba8a07ab; enable_web_push=DISABLE; rpdid=|(u))kkYu||u0J'u~|JkJR)ul; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_QUALITY=80; buvid3=3D0F285C-7560-9E88-C05A-1AFF4D9CAD7617643infoc; b_nut=1718944717; _uuid=BAAADDA5-4DDD-184B-DB310-EC4105D97D51E17974infoc; header_theme_version=CLOSE; fingerprint=5d019e2bddbce042baa44b97851bb2fd; buvid_fp=5d019e2bddbce042baa44b97851bb2fd; PVID=3; SESSDATA=e97fd178%2C1741840358%2C6c71f%2A91CjB7QCyKSGsn3CjY0C0HRTQF_8a0AYz9r9QLq2FL2YiXLfQFehx5MdXHV3EDcePi4mwSVlM3TVE3LWcyczhLbXoycnhOTWtwa2g5OEl4RVdxOVRQZjljWGJ5d2czbFZpTkVhd3hfRnFmYTE4TVRISTlkdXFpemd1SmhOejNnVm1mYlJ0UDZEcTNBIIEC; bili_jct=4b351ffc7724b0bb10c029efdfad7f75; sid=6jnrbe6l; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1NDc1NzEsImlhdCI6MTcyNjI4ODMxMSwicGx0IjotMX0.l-PQPoCUFvsMgRpDwqUKwUY9jLLjI-p-HZ1Qaq7AIjI; bili_ticket_expires=1726547511; b_lsid=1564BAD10_191F00CC884; bp_t_offset_406267828=976996173829111808; home_feed_column=5; browser_resolution=2048-1030",
    ),
    # User-Agent: basic information describing the browser / device.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
}
|
|
|
|
|
|
|
|
|
|
def get_video_details(search_url, timeout=10):
    """
    Fetch a search results page and return the BV ids it links to.

    :param search_url: URL of the page to download
    :param timeout: seconds to wait for the server, passed to ``requests.get``
    :return: list of BV id strings, de-duplicated, in order of first appearance
    :raises requests.RequestException: on connection failure, timeout, or a
        non-success HTTP status
    """
    response = requests.get(search_url, headers=headers, timeout=timeout)
    # Make sure the request succeeded.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Pattern for a BV id.
    bv_pattern = re.compile(r'BV[1-9A-Za-z]{10}')
    matches = (
        bv_pattern.search(tag['href'])
        for tag in soup.find_all('a', href=True)
    )
    # An href may contain "BV" without a full id; such links yield no match
    # and are skipped instead of crashing on ``None.group()``.
    bv_ids = (match.group() for match in matches if match is not None)

    # De-duplicate while keeping the first-seen order.
    return list(OrderedDict.fromkeys(bv_ids))
|
|
|
|
|
|
|
|
|
|
def get_video_info(bv_id, timeout=10):
    """
    Look up a video through the web-interface ``view`` API and return its CID.

    :param bv_id: BV id of the video
    :param timeout: seconds to wait for the server, passed to ``requests.get``;
        without it a stalled connection would block forever
    :return: the value stored under ``data -> cid`` in the JSON response
    :raises requests.RequestException: on connection failure, timeout, or a
        non-success HTTP status
    :raises KeyError: if the JSON response has no ``data`` / ``cid`` entry
    :raises TypeError: if the ``data`` entry is not a mapping (e.g. ``null``)
    """
    url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    response.raise_for_status()
    payload = response.json()
    return payload['data']['cid']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_danmaku_xml(cid, timeout=10):
    """
    Download the danmaku XML document for a CID.

    :param cid: CID used to build the ``comment.bilibili.com/<cid>.xml`` URL
    :param timeout: seconds to wait for the server, passed to ``requests.get``;
        without it a stalled connection would block forever
    :return: the response body decoded as UTF-8 text
    :raises requests.RequestException: on connection failure, timeout, or a
        non-success HTTP status
    """
    url = f'https://comment.bilibili.com/{cid}.xml'
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 so ``response.text`` does not rely on charset guessing.
    response.encoding = 'utf-8'
    # Make sure the request succeeded.
    response.raise_for_status()
    return response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_danmaku_xml(xml_str):
    """
    Extract the danmaku texts from a danmaku XML document.

    :param xml_str: XML document; empty or whitespace-only input is treated
        as "no danmaku"
    :return: list with the text of every ``<d>`` element below the root,
        skipping elements whose text is empty
    :raises xml.etree.ElementTree.ParseError: if non-empty input is not
        well-formed XML
    """
    # An empty response body is not XML at all; report it as zero danmaku
    # rather than letting one empty download abort a whole crawl.
    if not xml_str or not xml_str.strip():
        return []

    # NOTE(review): xml.etree.ElementTree is documented as not secure against
    # maliciously constructed data; the input here comes from a remote server.
    root = ET.fromstring(xml_str)
    return [d.text for d in root.findall('.//d') if d.text]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_danmaku_to_excel(danmaku_content, file_name='danmaku_content.xlsx'):
    """
    Append danmaku texts to an Excel file, creating the file when it is missing.

    The rows already stored in ``file_name`` are read back, the new texts are
    added below them in a ``content`` column, and the file is written again.

    :param danmaku_content: list holding the danmaku texts to append
    :param file_name: name of the Excel file
    """
    def load_previous_rows():
        # A missing file simply means nothing has been saved yet.
        try:
            return pd.read_excel(file_name, engine='openpyxl')
        except FileNotFoundError:
            return pd.DataFrame(columns=['content'])

    # Existing rows first, then the new ones.
    frames = [
        load_previous_rows(),
        pd.DataFrame(danmaku_content, columns=['content']),
    ]

    # Merge and write everything back without the DataFrame index column.
    pd.concat(frames, ignore_index=True).to_excel(file_name, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def savef(video_url, file_name='danmaku_content.xlsx'):
    """
    Download the danmaku of one video and append them to an Excel file.

    :param video_url: video URL containing the BV id
    :param file_name: Excel file the danmaku are appended to
    :raises ValueError: if ``video_url`` contains no BV id
    """
    # Extract the BV id; fail with a clear message instead of an
    # AttributeError on ``None`` when the URL has none.
    match = re.search(r'BV\w+', video_url)
    if match is None:
        raise ValueError(f"No BV id found in video URL: {video_url!r}")
    bv_id = match.group(0)

    # Resolve the video's CID.
    cid = get_video_info(bv_id)

    # Download and parse the danmaku.
    xml_data = get_danmaku_xml(cid)
    danmakus = parse_danmaku_xml(xml_data)

    save_danmaku_to_excel(danmakus, file_name=file_name)
|
|
|
|
|
|
|
|
|
|
# Search results URL; the page number and result offset are filled in per page.
SEARCH_URL_TEMPLATE = (
    "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
    "&from_source=webtop_search&spm_id_from=333.1007&search_source=3"
    "&page={page}&o={offset}"
)
# The original URL started at page=2 with o=36; both are kept.
# NOTE(review): confirm that skipping page 1 is intended, and that the offset
# really grows by 36 per page (inferred from that single page=2 / o=36 sample).
FIRST_PAGE = 2
RESULTS_PER_PAGE = 36
# Number of result pages to visit and upper bound on videos to process.
PAGE_COUNT = 8
MAX_VIDEOS = 300


def main():
    """Walk the search result pages and save the danmaku of each video found."""
    # NOTE(review): the browser is only navigated, never read from; all data is
    # fetched with requests. It is kept to preserve the original behaviour.
    driver = ChromiumPage()

    seen = set()
    saved = 0
    for page in range(FIRST_PAGE, FIRST_PAGE + PAGE_COUNT):
        # Build this page's URL. Previously the same URL was fetched on every
        # iteration, so the same videos were scraped over and over.
        page_url = SEARCH_URL_TEMPLATE.format(
            page=page,
            offset=(page - 1) * RESULTS_PER_PAGE,
        )
        # Navigate directly instead of clicking "next page": the old CSS
        # selector listed class names without dots and matched no element.
        driver.get(page_url)

        for bv in get_video_details(page_url):
            # Skip videos already handled on an earlier page.
            if bv in seen:
                continue
            seen.add(bv)
            saved += 1

            video_url = f'https://www.bilibili.com/video/{bv}/?spm_id_from=333.337'
            print(f"v{saved}")
            savef(video_url)

            # Stop the whole crawl, not just this page, once the cap is hit.
            if saved >= MAX_VIDEOS:
                return


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|