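"""Bilibili danmaku (bullet-comment) crawler.

Searches Bilibili for a keyword, resolves each video in the results from its
aid to the cid of its first page, downloads the danmaku XML for that cid, and
writes every comment to a text file. The search API requires a logged-in
session cookie and a browser-like User-Agent.
"""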
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from urllib import parse

import requests

class BilibiliVideoSpider:
    """Fetches Bilibili search results and per-video danmaku."""

    def __init__(self, session_cookie: str, user_agent: str):
        self.session_cookie = session_cookie
        self.user_agent = user_agent

    def search_videos(self, keyword: str, page: int, page_size: int) -> List[int]:
        """Return the aids (video ids) on one page of search results,
        retrying until the API answers with code 0.
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.session_cookie,
            "Origin": "https://search.bilibili.com",
            "Pragma": "no-cache",
            "Referer": f"https://search.bilibili.com/all?keyword={parse.quote(keyword)}",
            "User-Agent": self.user_agent,
        }
        params = {
            "search_type": "video",
            "page": page,
            "page_size": page_size,
            "keyword": keyword,
        }
        while True:
            try:
                response = requests.get(
                    "https://api.bilibili.com/x/web-interface/search/type",
                    headers=headers,
                    params=params,
                    timeout=10,
                ).json()
                if response["code"] == 0:
                    # Pages past the last result may omit "result"; treat that as empty.
                    return [item["id"] for item in response["data"].get("result", [])]
            except Exception as error:
                print(error)
            time.sleep(1)  # back off before retrying after a non-zero code or a request error

    def retrieve_cid(self, aid: int) -> int:
        """Resolve a video's aid to the cid of its first page (part)."""
        headers = {
            "Accept": "application/json, text/plain, */*",
            "User-Agent": self.user_agent,
            "Cookie": self.session_cookie,
        }
        response = requests.get(
            f"https://api.bilibili.com/x/player/pagelist?aid={aid}",
            headers=headers,
            timeout=10,
        )
        if response.status_code == 200:
            data = response.json()
            if data and data.get("data"):
                return data["data"][0]["cid"]
        raise ValueError(f"No video found for aid {aid}.")

    def fetch_danmaku(self, cid: int) -> List[str]:
        """Download the danmaku XML for a cid and extract the comment text
        from each <d p="...">text</d> element.

        Note: this endpoint takes a cid, not an aid, so the parameter is
        named accordingly.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Cookie": self.session_cookie,
            "User-Agent": self.user_agent,
        }
        response = requests.get(f"https://comment.bilibili.com/{cid}.xml", headers=headers, timeout=10)
        response.encoding = "utf-8"
        return re.findall("<d p=.+?>(.+?)</d>", response.text)


def fetch_bullet_screen(spider: BilibiliVideoSpider, aid: int) -> List[str]:
    """Resolve an aid to its cid and fetch that video's danmaku.

    Errors are logged and swallowed so one bad video does not abort the run.
    """
    try:
        print(f"Fetching bullet screen for video with aid {aid}...")
        cid = spider.retrieve_cid(aid)
        return spider.fetch_danmaku(cid)
    except Exception as error:
        print(f"Error fetching data for aid {aid}: {error}")
        return []


def main():
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    # Session cookie captured from a logged-in browser; SESSDATA and bili_ticket
    # expire, so replace this with your own.
    session_cookie = "CURRENT_BLACKGAP=0; buvid_fp_plain=undefined; buvid4=AD99F657-25EE-E722-504A-38818AB2C96431271-022083012-SK3hbof5R8m0%2FMDXrqLXF0ew7%2BFE4Qf8ZlKJTIBem2GNFTrtkIOz1g%3D%3D; hit-dyn-v2=1; DedeUserID=379056927; DedeUserID__ckMd5=f108d9af1bf79bfa; enable_web_push=DISABLE; header_theme_version=CLOSE; rpdid=|(u)luk))YJY0J'u~|JJk)m~|; LIVE_BUVID=AUTO7417034221177410; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; fingerprint=15814142e80dfa9c068eed7a71851bf5; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; buvid3=1ABDB66E-4FED-982B-9E33-0B8252C286A183741infoc; b_nut=1726320983; _uuid=993A102D3-51024-9AFA-A267-911A4D7E1051590626infoc; buvid_fp=15814142e80dfa9c068eed7a71851bf5; SESSDATA=73515621%2C1742135177%2C0e2a5%2A91CjD4Y5RhCOEVNl9wRlHMuu46raFGvX_PIZISKjPfA6kidgWNqhp7ORMi42EVo7IHscoSVmdsTlNiTDltTzFnSEtSam54WnpjVnZvUzdHTGNnd1pUNGZTb1pvYTlmSjdTby0wTVlCMFFvU1lpVGxkd1owTktpY1NDVWlpUTV2b1lmNUJfenpGa0RnIIEC; bili_jct=53fef919101982fb820518642f8ea298; sid=7s0heoa0; b_lsid=5A10234D8_19204359DD8; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MDcxODYsImlhdCI6MTcyNjY0NzkyNiwicGx0IjotMX0.1EN8gxd6-3GfZQ3b4b-Iz-4vmICUJHaoAoKpMbLlRtg; bili_ticket_expires=1726907126; bp_t_offset_379056927=978437006507900928; home_feed_column=4; browser_resolution=1065-941"
    spider = BilibiliVideoSpider(session_cookie, user_agent)
    keyword = "2024巴黎奥运会"  # search keyword: "2024 Paris Olympics"
    results_per_page = 30
    total_pages = 10
    all_danmaku = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for page in range(1, total_pages + 1):
            print(f"Fetching search results for page {page}...")
            aids = spider.search_videos(keyword, page, results_per_page)
            for aid in aids:
                futures.append(executor.submit(fetch_bullet_screen, spider, aid))
        for future in as_completed(futures):
            all_danmaku.extend(future.result())
    print(f"Total bullet screens fetched: {len(all_danmaku)}")
    # Save the danmaku to "弹幕.txt" ("danmaku.txt")
    with open("弹幕.txt", mode="w", encoding="utf-8") as file:  # open the file in write mode
        for danmaku in all_danmaku:  # iterate over all collected danmaku
            file.write(danmaku + "\n")  # write each comment on its own line


if __name__ == "__main__":
    main()