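"""Scrape danmaku (bullet comments) from Bilibili for a given search keyword.

The script searches bilibili.com for videos (here: the 2024 Paris Olympics),
resolves each video's CID via the pagelist API, downloads the danmaku XML for
each CID, and saves all collected comments to a local text file.
"""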
import requests
import time
from bs4 import BeautifulSoup
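
# Third-party dependencies: requests, beautifulsoup4, and lxml (the
# BeautifulSoup parser backend used below).
# Install with: pip install requests beautifulsoup4 lxml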
# Fixed User-Agent
custom_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Fixed Cookie (SESSDATA / bili_jct / DedeUserID from a logged-in session)
custom_cookie = 'SESSDATA=f088e027%2C1742009093%2C2981a%2A91CjAZOWy34Bnk2yutAp0iAkrT9_GnVSnWjaJxjZh5xNycojooqqb3GBAqWEkeXTkUQLQSVk5rcWNaekt5VjNUVkJYV2VtaWo4NHlvc2NPQUFabTRhY2RBUkg5U1B4d3RoMmQ4eXQzN1R3Y1BXc0dBelJEenk4WTgyQjFJRlhmcFZZel94aThkQXVRIIEC; bili_jct=b51d3572b16a74e3ffc7a59f8a252e91; DedeUserID=3494376802093902;'

# 1. Search for videos, sorted by overall relevance
def get_search_results(keyword, page):
    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    params = {
        "keyword": keyword,
        "page": page,
        "search_type": "video",
        "order": "totalrank"  # sort by overall relevance
    }
    headers = {
        'User-Agent': custom_user_agent,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.bilibili.com/',
        'Cookie': custom_cookie
    }
    try:
        response = requests.get(search_url, headers=headers, params=params)
        response.raise_for_status()  # check the HTTP status code
        result = response.json()
        if 'data' in result and 'result' in result['data']:
            return result['data']['result']
        else:
            print(f"No search results found on page {page}")
            return []
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error: {http_err}")
        return []
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return []

# 2. Get a video's CID (using its bvid)
def get_cid(bvid):
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    headers = {
        'User-Agent': custom_user_agent,
        'Referer': f"https://www.bilibili.com/video/{bvid}",
        'Cookie': custom_cookie
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        data = response.json()
        if 'data' in data and len(data['data']) > 0:
            return data['data'][0]['cid']  # CID of the video's first page/part
        else:
            print(f"Failed to get CID for video {bvid}")
            return None
    except Exception as e:
        print(f"Failed to get CID: {e}")
        return None

# 3. Fetch the danmaku
def get_danmaku(cid, bvid):
    danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
    headers = {
        'User-Agent': custom_user_agent,
        'Referer': f"https://www.bilibili.com/video/{bvid}",
        'Cookie': custom_cookie
    }
    try:
        response = requests.get(danmaku_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        danmakus = [d.text for d in soup.find_all('d')]  # each <d> element holds one danmaku
        return danmakus
    except Exception as e:
        print(f"Failed to get danmaku: {e}")
        return []

# 4. Main logic
def main():
    keyword = "2024巴黎奥运会"
    videos = []
    # Collect the first ~300 videos (about 20 results per page, so 15 pages)
    for page in range(1, 16):
        results = get_search_results(keyword, page)
        if results:
            videos.extend(results)
            print(f"Page {page}: got {len(results)} videos")
        else:
            print(f"Page {page}: no results found or request failed")
        if len(videos) >= 300:
            break
        time.sleep(1)  # wait 1 second between page requests
    print(f"Found {len(videos)} videos in total")
    if not videos:
        print("No videos found, stopping the crawl")
        return
    # Fetch the danmaku for each video
    all_danmakus = []
    videos = videos[:300]  # only process the first 300 videos
    for i, video in enumerate(videos):
        bvid = video['bvid']  # use bvid rather than aid
        title = video['title']
        try:
            cid = get_cid(bvid)
            if cid:
                danmakus = get_danmaku(cid, bvid)
                print(f"Got danmaku for video {i + 1}/{len(videos)}: {title}, count: {len(danmakus)}")
                all_danmakus.extend(danmakus)
            else:
                print(f"Failed to get CID for video {title}")
        except Exception as e:
            print(f"Failed to get danmaku for video {title}: {e}")
        time.sleep(2)  # wait 2 seconds between requests
    # Save the danmaku to a local file
    with open("danmakus_2024_olympics.txt", "w", encoding='utf-8') as f:
        for danmaku in all_danmakus:
            f.write(danmaku + "\n")
    print("All danmaku saved to danmakus_2024_olympics.txt")

# Run the main program
if __name__ == "__main__":
    main()
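
# Run this script directly; the collected danmaku are written to
# danmakus_2024_olympics.txt in the working directory. The hard-coded
# SESSDATA / bili_jct cookie is a session credential and will eventually
# expire, so replace it with values from your own logged-in session if
# requests start failing.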