|
|
@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
from typing import List
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
from urllib import parse
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# BilibiliSpider wraps the HTTP calls made to the Bilibili web API.
|
|
|
|
|
|
|
|
class BilibiliSpider:
    """Small client for the Bilibili endpoints needed to collect danmaku.

    The public methods form a pipeline: ``get_search_result`` searches videos
    by keyword and returns their ids, ``get_cid`` resolves one video id (aid)
    to the cid of its first page, and ``get_bullet_screen`` downloads the
    danmaku XML for a cid and returns the comment texts.
    """

    # Seconds to wait on a single HTTP request. Without a timeout, requests
    # blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Matches one `<d p="...">text</d>` element and captures its text.
    _DANMAKU_PATTERN = re.compile(r"<d p=.+?>(.+?)</d>")

    def __init__(self, cookie: str, user_agent):
        """Store the Cookie and User-Agent header values sent with every request.

        Args:
            cookie: Raw ``Cookie`` header value.
            user_agent: ``User-Agent`` header value.
        """
        self.cookie = cookie
        self.user_agent = user_agent

    def get_search_result(self, keyword: str, page: int, page_size: int,
                          max_retries=None, retry_delay: float = 1) -> list:
        """Search Bilibili videos and return the ids found on one result page.

        A failed attempt (network/parse error or a non-zero API ``code``) is
        reported on stdout and retried after ``retry_delay`` seconds.

        Args:
            keyword: Search keyword.
            page: Result page number to request.
            page_size: Number of results requested per page.
            max_retries: Maximum number of attempts, or ``None`` (default) to
                keep retrying until the API answers successfully.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            The ``id`` of every item in the response's ``data.result`` list;
            an empty list when the response carries no results.

        Raises:
            RuntimeError: If ``max_retries`` attempts all failed.
        """
        # Browser-like headers carrying the caller's cookie and user agent.
        headers = {
            "Accept": "application/json, text/plain, */*",
            # Only advertise encodings requests can always decode; "br" and
            # "zstd" need optional packages and otherwise yield unreadable bodies.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Origin": "https://search.bilibili.com",
            "Pragma": "no-cache",
            "Priority": "u=1, i",
            # NOTE(review): the referer hard-codes page=3&o=48 regardless of
            # `page` - confirm whether the API cares.
            "Referer": f"https://search.bilibili.com/all?vt=71519330&keyword={parse.quote(keyword)}&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=3&o=48",
            "Sec-Ch-Ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
            "User-Agent": self.user_agent,
        }

        # Query parameters for the search endpoint.
        # NOTE(review): qv_id / w_rid / wts look like values captured from one
        # browser session; the API may reject them once stale - confirm.
        params = {
            "category_id": "",
            "search_type": "video",
            "ad_resource": 5654,
            "__refresh__": True,
            "_extra": "",
            "context": "",
            "page": page,
            "page_size": page_size,
            "from_source": "",
            "from_spmid": "333.337",
            "platform": "pc",
            "highlight": 1,
            "single_column": 0,
            "keyword": keyword,
            "qv_id": "D9L6NRPnDle6B4EA2dJ4hfRjUOIvKeIM",
            "source_tag": 3,
            "gaia_vtoken": "",
            "dynamic_offset": 48,
            "web_location": 1430654,
            "w_rid": "dc50190c40844231b9ad3622eebcc62b",
            "wts": 1724771963,
        }

        url = "https://api.bilibili.com/x/web-interface/search/type"
        attempt = 0
        while True:
            attempt += 1
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                ).json()
                # code == 0 is treated as success.
                if response['code'] == 0:
                    return self._extract_aids(response)
                print(f"Search API returned code {response['code']} for page {page}; retrying...")
            except Exception as e:
                # Deliberate best-effort: report the failure and retry.
                print(e)

            if max_retries is not None and attempt >= max_retries:
                raise RuntimeError(
                    f"Search for {keyword!r} page {page} failed after {attempt} attempts."
                )
            # Sleep after every failed attempt (error or non-zero code) so a
            # persistent failure cannot turn into a tight request loop.
            time.sleep(retry_delay)

    @staticmethod
    def _extract_aids(payload):
        """Return the ``id`` of each item under ``data.result`` ([] if absent)."""
        data = payload.get("data") or {}
        return [item['id'] for item in data.get("result") or []]

    def get_cid(self, aid: int) -> int:
        """Return the cid of the first page of the video identified by ``aid``.

        Args:
            aid: Video id as returned by ``get_search_result``.

        Returns:
            The ``cid`` of the first entry in the page list.

        Raises:
            ValueError: If the response contains no pages for ``aid``.
            Exception: If the HTTP status code is not 200.
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            "User-Agent": self.user_agent,
            "Cookie": self.cookie,
        }

        response = requests.get(
            f"https://api.bilibili.com/x/player/pagelist?aid={aid}",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve CID for aid {aid}. Status code: {response.status_code}")
        return self._first_cid(response.json(), aid)

    @staticmethod
    def _first_cid(payload, aid):
        """Return the first page's cid from a pagelist payload or raise ValueError."""
        # "data" may be missing, null or empty; all mean "no pages".
        pages = payload.get("data") if isinstance(payload, dict) else None
        if not pages:
            raise ValueError(f"No video found for aid {aid}.")
        return pages[0]['cid']

    def get_bullet_screen(self, aid: int) -> List:
        """Download the danmaku XML for an id and return the danmaku texts.

        Args:
            aid: Id placed in the ``comment.bilibili.com/<id>.xml`` URL. The
                caller in this file passes the cid from ``get_cid`` here; the
                parameter name is kept for backward compatibility.

        Returns:
            The text of every ``<d>`` element matched in the response, with
            XML character entities (e.g. ``&amp;``) decoded.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            # See get_search_result: avoid encodings requests may not decode.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Pragma": "no-cache",
            "Priority": "u=0, i",
            "Sec-CH-UA": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-CH-UA-Mobile": "?0",
            "Sec-CH-UA-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": self.user_agent,
        }

        url = 'https://comment.bilibili.com/' + str(aid) + '.xml'
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Force UTF-8 so response.text is not decoded with a guessed charset.
        response.encoding = 'utf-8'
        return self._parse_danmaku_xml(response.text)

    @classmethod
    def _parse_danmaku_xml(cls, xml_text):
        """Extract and entity-decode the text of each ``<d p=...>`` element."""
        from html import unescape

        return [unescape(text) for text in cls._DANMAKU_PATTERN.findall(xml_text)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Main routine: crawl danmaku (bullet comments) from Bilibili videos.
|
|
|
|
|
|
|
|
def main():
    """Crawl danmaku for the search results of one keyword and save them.

    Searches ``total_pages`` pages of ``page_size`` videos, resolves each
    video's cid, downloads its danmaku and appends every danmaku as one line
    to "弹幕.txt". Failures for an individual video are printed and skipped.
    Whatever has been collected is written out even if the crawl is
    interrupted or fails part-way.
    """
    # NOTE(review): the cookie below embeds live-looking session credentials
    # (SESSDATA, bili_jct). Consider loading it from configuration or the
    # environment instead of committing it to source.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 "
        "Safari/537.36 Core/1.94.265.400 QQBrowser/12.7.5769.400 "
    )
    cookies = "buvid3=0010B368-0E93-5612-1F55-B0AEFA2A788E68736infoc; b_nut=1722828468; _uuid=65A45AC6-10CC5-FD72-3AE5-EDD1D94C6B2A71555infoc; enable_web_push=DISABLE; home_feed_column=5; buvid4=D4A818B4-3DAF-E9E9-CCA6-2292209CA07D70717-024080503-n8yYBXNzLps6TrOphT3zww%3D%3D; header_theme_version=CLOSE; rpdid=|(J|)Rl|kRuk0J'u~kk)k)lJY; CURRENT_QUALITY=80; fingerprint=6d7a6d23f809895ad523f52c214cab31; buvid_fp_plain=undefined; b-user-id=06265419-2000-a180-a632-d8face940e87; CURRENT_BLACKGAP=0; is-2022-channel=1; buvid_fp=6d7a6d23f809895ad523f52c214cab31; bili_ticket_expires=1726804000; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MDQwNjAsImlhdCI6MTcyNjU0NDgwMCwicGx0IjotMX0.GNq3G9bTe-3WQ8VAVc0sI4Qz9p3p7_dhcjMvGWeptHU; browser_resolution=1491-706; bp_t_offset_342699701=978044347712798720; CURRENT_FNVAL=4048; b_lsid=E10FCACF9_191FEFD995E; SESSDATA=a362140e%2C1742111957%2Cf7826%2A91CjBL7AZ1ewk0PKshkKehAyTa-FGUdcmmYfvGIZvLIvE3mrP0Lp1ZFo7Vp-Hg1cnTkFcSVkpPSlUwSkpFWWNodXFmNDdRNnFOdkdfTkpkbmpQNjlDUGkxLXpMRXZIMWpLUkVVSU1sNjM2clZVUmp1dEZDeDFmRTZJS0JObEstb1RVeV94ek91UktnIIEC; bili_jct=2627cc66d6b22d78edd09ea63d44b26e; DedeUserID=3546763816339525; DedeUserID__ckMd5=d856350aecedd530; sid=8ec23v36"

    spider = BilibiliSpider(cookies, user_agent)

    keyword = "2024巴黎奥运会"
    page_size = 30    # results requested per page
    total_pages = 10  # pages to crawl (10 * 30 = up to 300 videos)
    data_list = []    # every danmaku text collected so far

    try:
        for page in range(1, total_pages + 1):
            print(f"Fetching search results for page {page}...")
            aids = spider.get_search_result(keyword, page, page_size)

            for aid in aids:
                try:
                    print(f"Fetching bullet screen for video with aid {aid}...")
                    cid = spider.get_cid(aid)
                    bullet_screens = spider.get_bullet_screen(cid)
                    data_list.extend(bullet_screens)
                    print(f"Fetched {len(bullet_screens)} bullet screens for aid {aid}.")
                except Exception as e:
                    # One bad video must not abort the whole crawl.
                    print(f"An error occurred while fetching data for aid {aid}: {e}")
    finally:
        # Persist in `finally` so an interrupt or unexpected error does not
        # throw away everything collected so far.
        print(f"Total bullet screens fetched: {len(data_list)}")
        with open("弹幕.txt", mode='a', encoding="utf-8") as f:
            f.writelines(data + '\n' for data in data_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point.
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the crawl only when run as a script, not when imported.
    main()
|