"""Scrape danmaku (bullet comments) from Bilibili search results.

Pipeline: fetch search-result pages for a fixed keyword, extract the BV ids,
resolve each video page to its cid, download the danmaku XML for that cid and
append every comment to a local text file.
"""
import re

import requests  # “HTTP for humans”

# Bilibili's anti-scraping checks are strict, so every request carries the
# browser-like headers below.
# NOTE(security): the cookie embeds session credentials (SESSDATA, bili_jct).
# They should not live in source control; load them from an environment
# variable or an untracked config file instead.
headers = {
    'authority': 'data.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': 'buvid4=8AF878ED-E44F-755F-11D3-64DA64F69E4116221-022090315-m4T90WVXeahcRPohl2liLg%3D%3D; DedeUserID=1298162992; DedeUserID__ckMd5=7cba268dad61786b; FEED_LIVE_VERSION=V8; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkY|mmJ0Ju~|JY)l|kR; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid_fp=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid3=E4939679-CBBE-5754-364A-86E62CA46B0738242infoc; b_nut=1725540338; _uuid=7DCC10D99-EB78-76101-85FC-FD82836ACF8342265infoc; CURRENT_BLACKGAP=0; CURRENT_FNVAL=4048; bp_t_offset_1298162992=974832433729896448; SESSDATA=d762ed30%2C1741443642%2C9f2d3%2A92CjBn89a4wlq2S-eFP4pMC8rPS00G4v4cFcPyg7qHVMmfzGHjkeYhz10GAnYhyvOxyTASVkI2MlVHdW5hb1d4WUIxZDFCSlJhV0dGM1NzQjVZeFFWallRU0kyYjBnVEQ4TkRKcEs2NmN3a3lvWGZWMHd6Uk1oeE1MZ0tyVlJCbWJDWm5SMV9najFnIIEC; bili_jct=a391cc5100a136dacf6586abfaab24a6; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTEzMzYsImlhdCI6MTcyNTg5MjA3NiwicGx0IjotMX0.r4K4_ABZd5na44SVZJdrQ6f11XK62AQ1wnr2f998it8; bili_ticket_expires=1726151276; bsource=search_bing; b_lsid=E265964D_191D78437CC; sid=5b9r21ku; home_feed_column=5; browser_resolution=1455-790',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://www.bilibili.com/video/BV1G2421f7cE/?spm_id_from=333.1007.tianma.2-2-5.click&vd_source=a21d0efaab8782a1839893735379d59a',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}

# Not referenced anywhere in the visible script; kept so existing users of the
# module-level name keep working.
ye = 1
# Running count of cids resolved so far, shared across Get_cid() calls.
cid_num = 0

# Upper bound on the number of videos processed by the script entry point.
MAX_CID_NUM = 300
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Compiled once at import time instead of on every call.
_BVID_PATTERN = re.compile(r'.*?"bvid":"(.*?)","title":".*?')
_CID_PATTERN = re.compile(r'"bvid":".*?"cids":{"1":(.*?)}},"BV.*?":{"aid":')
# Each danmaku in the XML payload is an element of the form <d p="...">text</d>.
_DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


def Get_html_data(page):
    """Return the raw response text of one search-result page.

    Args:
        page: 1-based page number substituted into the search API URL.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?__refresh__=true&_extra=&context=&page={page}&page_size=34&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=jOfkBnzsmlKLpeoEa9g32wjiyl1J0Up7&ad_resource=5654&source_tag=3&gaia_vtoken=&category_id=&search_type=video&'
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Force UTF-8 so the body is decoded correctly regardless of the
    # charset advertised by the server.
    response.encoding = 'utf-8'
    html_data = response.text
    print('成功获取源码')
    return html_data


def Get_Bv(html_data):
    """Extract every BV id from *html_data* and return the video page URLs.

    Args:
        html_data: response text of a search-result page.

    Returns:
        A list of ``http://www.bilibili.com/video/<bvid>`` URLs, in the order
        the ids appear in the text.
    """
    url_list = [
        "http://www.bilibili.com/video/" + bvid
        for bvid in _BVID_PATTERN.findall(html_data)
    ]
    print("成功获取bv号")
    return url_list


def Get_cid(url_list, max_cid_num=300, headers=headers):
    """Resolve video page URLs to danmaku XML URLs.

    Each page is downloaded, its cid is extracted, and the corresponding
    danmaku endpoint URL is collected. The module-level ``cid_num`` counter is
    incremented for every cid found, and processing stops as soon as it
    reaches *max_cid_num*.

    Args:
        url_list: video page URLs, as produced by Get_Bv().
        max_cid_num: global cap on the number of cids resolved across calls.
        headers: HTTP headers sent with every request.

    Returns:
        A list of danmaku XML URLs (possibly empty).
    """
    global cid_num
    xml_url = []
    for index in url_list:
        # Check the cap *before* issuing a request so that no extra video is
        # fetched once the limit has already been hit by a previous call.
        if cid_num >= max_cid_num:
            break
        try:
            response = requests.get(
                url=index, headers=headers, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue

        html_str = response.content.decode()
        match = _CID_PATTERN.search(html_str)
        if match is None:
            # No cid on this page: skip it rather than build a bogus URL.
            print(f"未找到cid: {index}")
            continue

        cid = match.group(1)
        print(cid)
        # Append: the result must be a list of URLs, one per video.
        xml_url.append(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        cid_num += 1
    print("获取弹幕链接成功")
    return xml_url


def _save_danmaku(xml_urls, out_path='巴黎弹幕.txt'):
    """Download each danmaku XML in *xml_urls* and append its comments to *out_path*."""
    for j in xml_urls:
        try:
            response3 = requests.get(
                url=j, headers=headers, timeout=REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue
        response3.encoding = response3.apparent_encoding
        data_list = _DANMAKU_PATTERN.findall(response3.text)
        # Open the file once per response instead of once per comment.
        with open(out_path, mode='a', encoding='utf-8') as f:
            f.writelines(f"{i}\n" for i in data_list)


def main():
    """Walk the search-result pages and dump the danmaku of the videos found."""
    for page in range(1, 34):
        # Stop paging once enough videos have been processed.
        if cid_num >= MAX_CID_NUM:
            break
        try:
            html_data = Get_html_data(page)
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue
        url_list = Get_Bv(html_data)
        xml_url = Get_cid(url_list, max_cid_num=MAX_CID_NUM)
        _save_danmaku(xml_url)


if __name__ == '__main__':
    main()