import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Extracts the bvid that follows an `aid:` entry in the search page source.
# The named group must be spelled (?P<bvs>...) because callers read
# match.group("bvs").
_BVID_PATTERN = re.compile(r'aid:.*?bvid:"(?P<bvs>.*?)",')

# NOTE(security): this header set embeds a real login session (SESSDATA,
# bili_jct). It is kept unchanged so existing behavior is preserved, but the
# credentials should be rotated and loaded from configuration or the
# environment instead of living in source control.
_DEFAULT_HEADERS = {
    'Cookie': (
        'i-wanna-go-back=-1; LIVE_BUVID=AUTO4416565733586161; '
        'buvid_fp_plain=undefined; DedeUserID=438653318; '
        'DedeUserID__ckMd5=7c2d23c88aee6f46; hit-new-style-dyn=1; '
        'theme_style=light; enable_web_push=DISABLE; '
        'header_theme_version=CLOSE; '
        'rpdid=|(u))kkYu|JY0J\'u~|JulYmJY; b_ut=5; '
        'FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=4; '
        'CURRENT_QUALITY=80; '
        '_uuid=EAB710246-F627-98E6-4529-CDFBD3522961088900infoc; '
        'buvid3=2FF036D6-2F9E-BA53-12AC-B8475DFEA7EF68465infoc; '
        'b_nut=1722680568; '
        'buvid4=04DF8495-DC85-5EB7-DB02-0141ADBD9CA652678-022063015-%2B9JvYED62Pzmtk4PYtAPcQ%3D%3D; '
        'hit-dyn-v2=1; CURRENT_FNVAL=4048; is-2022-channel=1; '
        'bsource=search_bing; '
        'bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY2MzI2NzksImlhdCI6MTcyNjM3MzQxOSwicGx0IjotMX0.CU-aVew4u5W9di0OPj_R_gpW_RHKEAUVY21KRU0Eqno; '
        'bili_ticket_expires=1726632619; home_feed_column=5; '
        'browser_resolution=1528-740; '
        'SESSDATA=1374e376%2C1742014673%2Cbbc95%2A91CjBNLZ0d3MNwQsDj3Nw8WKYP82oiAT74WcRbs-4z44DsU1npTEDGVRhrDiwcQ4a7KAYSVkxudHdzWmRENG5XcmtuZWNRSklrbWUyNDV4ajdzT2V5MjBTMDlaTi1YdWdpeGFOY0xNZW55UGF4NGMtNEdDNE4zV3ZSOFlna29JTTlxZEFwOFRZeDlRIIEC; '
        'bili_jct=9b8521d7e4cc60a577b9a29ce8ecc6a0; sid=5acjva0u; '
        'fingerprint=9bb0311b6cea85ad074caa19c63dee04; '
        'buvid_fp=9bb0311b6cea85ad074caa19c63dee04; '
        'b_lsid=B5102210DA_191FEFB31DC; '
        'bp_t_offset_438653318=978063108129947648'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    ),
}


def _get_bvid(url, headers):
    """Return the unique bvids found in the page at *url*, in first-seen order.

    Returns an empty list when the request fails, so callers can treat
    "no results" and "request failed" the same way.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = "utf-8"
    except requests.exceptions.RequestException as e:
        print(f"请求 bvid 失败: {e}")
        return []
    matches = _BVID_PATTERN.finditer(response.text)
    # dict.fromkeys de-duplicates while keeping the order of appearance.
    return list(dict.fromkeys(m.group("bvs") for m in matches))


def _get_cid(url, headers):
    """Return the cid of the first entry in the pagelist response, or None.

    None is returned when the request fails, the body is not JSON, or the
    payload carries no usable ``data`` list.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions where
        # the decode error is not a RequestException subclass.
        print(f"请求 cid 失败: {e}")
        return None

    pages = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(pages, list) or not pages:
        print("Error: 'data' not found in response")
        return None
    first = pages[0]
    return first.get("cid") if isinstance(first, dict) else None


def _get_comment(url, headers):
    """Return the text of every ``<d>`` element in the XML at *url*.

    Returns an empty list when the request fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "utf-8"
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求评论失败: {e}")
        return []
    xml = BeautifulSoup(response.text, "xml")
    return [d.text for d in xml.find_all("d")]


def fetch_comments(search_word, num):
    """Collect danmaku for up to *num* videos matching *search_word*.

    Searches Bilibili for the keyword, resolves each video's cid, downloads
    its danmaku XML, and writes one comment per line to
    ``<search_word>弹幕.txt`` (UTF-8) in the current directory.

    Args:
        search_word: Keyword to search for; also used as the output file
            name prefix.
        num: Maximum number of videos to collect danmaku from.
    """
    print("正在爬取弹幕...")

    headers = _DEFAULT_HEADERS
    # Escape the keyword so characters such as '&' or '#' cannot break the
    # query string.
    keyword = quote(str(search_word), safe="")

    # Gather bvids page by page. A dict is used as an ordered set so the
    # first matches are kept rather than an arbitrary subset.
    bvids = {}
    page = 1
    while len(bvids) < num:
        url = (
            f'https://search.bilibili.com/video?keyword={keyword}'
            f'&from_source=webtop_search&spm_id_from=333.1007'
            f'&search_source=5&page={page}'
        )
        known = len(bvids)
        bvids.update(dict.fromkeys(_get_bvid(url, headers)))
        if len(bvids) == known:
            # A page that adds nothing new means the results are exhausted
            # or the request failed; stop instead of looping forever.
            print(f"仅找到{len(bvids)}个视频，少于请求的{num}个")
            break
        page += 1

    # Resolve each bvid to its cid, skipping the ones that fail.
    cid_list = []
    for bv in list(bvids)[:max(num, 0)]:
        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        cid = _get_cid(url, headers)
        if cid is not None:
            cid_list.append(cid)

    # Download the danmaku for every resolved cid.
    comment_list = []
    for cid in cid_list:
        url = f"https://comment.bilibili.com/{cid}.xml"
        comment_list.extend(_get_comment(url, headers))

    # Save one comment per line.
    with open(f'{search_word}弹幕.txt', mode='w', encoding='utf-8') as f:
        f.writelines(comment + '\n' for comment in comment_list)
    print(f"成功获取{len(comment_list)}条弹幕!")