import os
import time

import requests
from bs4 import BeautifulSoup

# Fixed User-Agent sent with every request.
custom_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Fixed Cookie sent with every request.
# NOTE(review): the fallback below is a hard-coded login session credential
# committed to source. It should be rotated and supplied only through the
# BILIBILI_COOKIE environment variable; the literal is kept solely so the
# script behaves as before when the variable is not set.
custom_cookie = os.environ.get(
    'BILIBILI_COOKIE',
    'SESSDATA=f088e027%2C1742009093%2C2981a%2A91CjAZOWy34Bnk2yutAp0iAkrT9_GnVSnWjaJxjZh5xNycojooqqb3GBAqWEkeXTkUQLQSVk5rcWNaekt5VjNUVkJYV2VtaWo4NHlvc2NPQUFabTRhY2RBUkg5U1B4d3RoMmQ4eXQzN1R3Y1BXc0dBelJEenk4WTgyQjFJRlhmcFZZel94aThkQXVRIIEC; bili_jct=b51d3572b16a74e3ffc7a59f8a252e91; DedeUserID=3494376802093902;',
)


# 1. Search videos, sorted by overall ranking.
def get_search_results(keyword, page, timeout=10):
    """Return one page of video search results for *keyword*.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of ``data.result`` from the JSON response, or an empty
        list when the request fails, the body is not valid JSON, or the
        response carries no ``data.result`` entry.
    """
    search_url = "https://api.bilibili.com/x/web-interface/search/type"
    params = {
        "keyword": keyword,
        "page": page,
        "search_type": "video",
        "order": "totalrank",  # overall-ranking sort order
    }
    headers = {
        'User-Agent': custom_user_agent,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.bilibili.com/',
        'Cookie': custom_cookie,
    }

    try:
        response = requests.get(
            search_url, headers=headers, params=params, timeout=timeout
        )
        response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        result = response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP 错误: {http_err}")
        return []
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers invalid JSON on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"请求失败: {e}")
        return []

    # "data" may be missing or null in the response; guard before looking
    # inside it so a null payload does not raise TypeError.
    data = result.get('data') if isinstance(result, dict) else None
    if isinstance(data, dict) and 'result' in data:
        return data['result'] or []

    print(f"第 {page} 页的搜索结果未找到")
    return []


# 2. Get the CID of a video (looked up by bvid).
def get_cid(bvid, timeout=10):
    """Return the CID of the first page of the video identified by *bvid*.

    Args:
        bvid: Video identifier sent as the ``bvid`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``cid`` of the first entry in the response's ``data`` list, or
        ``None`` when the request fails or the list is missing or empty.
    """
    url = "https://api.bilibili.com/x/player/pagelist"
    headers = {
        'User-Agent': custom_user_agent,
        'Referer': f"https://www.bilibili.com/video/{bvid}",
        'Cookie': custom_cookie,
    }

    try:
        # Let requests build (and URL-encode) the query string.
        response = requests.get(
            url, headers=headers, params={"bvid": bvid}, timeout=timeout
        )
        response.raise_for_status()
        payload = response.json()
        pages = payload.get('data') or []
        if pages:
            return pages[0]['cid']  # CID of the first page
        print(f"视频 {bvid} 的 CID 获取失败")
        return None
    except Exception as e:
        # Best-effort lookup: any failure is reported and mapped to None so
        # the caller can skip this video and continue.
        print(f"获取 CID 失败: {e}")
        return None
# 3. Fetch the danmaku (bullet comments) of a video.
def get_danmaku(cid, bvid, timeout=10):
    """Return the text of every danmaku entry for the given CID.

    Args:
        cid: Identifier used to build the danmaku XML URL.
        bvid: Video identifier, used only to build the Referer header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the text of each ``<d>`` element in the XML document,
        or an empty list when the request or the parsing fails.
    """
    danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
    headers = {
        'User-Agent': custom_user_agent,
        'Referer': f"https://www.bilibili.com/video/{bvid}",
        'Cookie': custom_cookie,
    }

    try:
        response = requests.get(danmaku_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # Each <d> element holds the text of one danmaku.
        return [d.text for d in soup.find_all('d')]
    except Exception as e:
        # Best-effort fetch: report the failure and return nothing so the
        # caller can move on to the next video.
        print(f"获取弹幕失败: {e}")
        return []


# 4. Main logic.
def main():
    """Search for videos, download their danmaku and save them to a file.

    Collects up to 300 search results for a fixed keyword, fetches the
    danmaku of each one, and writes one danmaku per line to
    ``danmakus_2024_olympics.txt``. Returns early, without writing the
    file, when the search yields no videos.
    """
    keyword = "2024巴黎奥运会"
    max_videos = 300
    output_path = "danmakus_2024_olympics.txt"
    videos = []

    # Collect search results from pages 1-15; the original author expects
    # about 20 videos per page, i.e. roughly 300 in total.
    for page in range(1, 16):
        results = get_search_results(keyword, page)
        if results:
            videos.extend(results)
            print(f"第 {page} 页获取到 {len(results)} 个视频")
        else:
            print(f"第 {page} 页的搜索结果未找到或请求失败")

        if len(videos) >= max_videos:
            break

        time.sleep(1)  # pause between pages to avoid crawling too fast

    print(f"共找到 {len(videos)} 个视频")

    if not videos:
        print("未找到任何视频,停止爬取")
        return

    # Only the first max_videos entries are processed; progress is reported
    # against that count rather than against everything collected.
    selected = videos[:max_videos]
    all_danmakus = []
    for i, video in enumerate(selected, start=1):
        title = video.get('title', '')
        bvid = video.get('bvid')  # the lookup uses bvid, not aid
        if not bvid:
            # A malformed entry must not abort the whole run.
            print(f"视频 {title} 缺少 bvid,已跳过")
            continue

        try:
            cid = get_cid(bvid)
            if cid:
                danmakus = get_danmaku(cid, bvid)
                print(f"获取视频 {i}/{len(selected)} 的弹幕: {title}, 弹幕数: {len(danmakus)}")
                all_danmakus.extend(danmakus)
            else:
                print(f"视频 {title} 的 CID 获取失败")
        except Exception as e:
            print(f"视频 {title} 的弹幕获取失败: {e}")

        time.sleep(2)  # pause between videos to avoid requesting too fast

    # Save the danmaku to a local file, one per line.
    with open(output_path, "w", encoding='utf-8') as f:
        f.writelines(f"{danmaku}\n" for danmaku in all_danmakus)
    print(f"所有弹幕已保存至 {output_path}")


# Run the main program.
if __name__ == "__main__":
    main()