"""Collect danmaku (bullet comments) from Bilibili videos about the 2024 Paris Olympics.

Pipeline:
    1. Query the Bilibili search API page by page (``Get_html_data``).
    2. Pull the BV ids out of each result page (``Get_Bv``).
    3. Open every video page, extract its ``cid`` and build the danmaku XML
       URL (``Get_cid``).
    4. Download each danmaku XML and append the comment texts to
       ``巴黎弹幕.txt`` (``main``).

Authentication: set the ``BILIBILI_COOKIE`` environment variable to the
``Cookie`` header of a logged-in browser session.  The cookie is intentionally
NOT stored in this file.
"""
# Provenance: added as 巴黎奥运会前300弹幕.py in commit bc6eb38 ("ADD file via upload").

import os
import re

import requests  # "HTTP for humans"

# Bilibili's anti-scraping checks are strict, so every request carries these
# browser-like headers.
headers = {
    'authority': 'data.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://www.bilibili.com/video/BV1G2421f7cE/?spm_id_from=333.1007.tianma.2-2-5.click&vd_source=a21d0efaab8782a1839893735379d59a',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
}

# SECURITY: the original script hard-coded a full session cookie (SESSDATA,
# bili_jct, ...) in source control.  Session credentials must never be
# committed, so the cookie is now supplied through the environment instead.
_cookie = os.environ.get('BILIBILI_COOKIE', '')
if _cookie:
    headers['cookie'] = _cookie

MAX_VIDEOS = 300          # stop after this many videos have been resolved
SEARCH_PAGES = range(1, 34)  # search result pages to walk through
REQUEST_TIMEOUT = 10      # seconds; requests has no timeout by default
OUTPUT_FILE = '巴黎弹幕.txt'

# Patterns are compiled once because they are applied inside loops.
_BVID_PATTERN = re.compile(r'"bvid":"(.*?)","title":"')
# First-part cid of a video page; ``[^{}]*`` tolerates extra parts after "1".
_CID_PATTERN = re.compile(
    r'"bvid":".*?"cids":\{"1":(\d+)[^{}]*\}\},"BV.*?":\{"aid":'
)
# One danmaku entry in the XML returned by the dm/list.so endpoint.
_DANMAKU_PATTERN = re.compile(r'<d p="[^"]*">(.*?)</d>')

ye = 1        # unused; kept so the module's global names stay unchanged
cid_num = 0   # running count of videos whose cid has been resolved


def Get_html_data(page):
    """Fetch one page of search results and return the response body as text.

    Args:
        page: 1-based index of the search result page.

    Returns:
        The raw response text of the search API.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?__refresh__=true&_extra=&context=&page={page}&page_size=34&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=jOfkBnzsmlKLpeoEa9g32wjiyl1J0Up7&ad_resource=5654&source_tag=3&gaia_vtoken=&category_id=&search_type=video&'
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Force UTF-8 so the body is decoded correctly regardless of what the
    # server advertises.
    response.encoding = 'utf-8'
    html_data = response.text
    print('成功获取源码')
    return html_data


def Get_Bv(html_data):
    """Extract BV ids from a search response and turn them into video URLs.

    Args:
        html_data: response text as returned by ``Get_html_data``.

    Returns:
        A list of video page URLs, one per BV id found, in order of
        appearance.  Empty if no id is found.
    """
    url_list = [
        "http://www.bilibili.com/video/" + bvid
        for bvid in _BVID_PATTERN.findall(html_data)
    ]
    print("成功获取bv号")
    return url_list


def Get_cid(url_list, max_cid_num=300, headers=headers):
    """Resolve each video page to its danmaku XML URL.

    The module-level ``cid_num`` counter is shared across calls, so the
    ``max_cid_num`` limit applies to the whole run, not to a single call.

    Args:
        url_list: video page URLs (see ``Get_Bv``).
        max_cid_num: stop once this many videos have been resolved in total.
        headers: HTTP headers sent with every request.

    Returns:
        A list of danmaku XML URLs.  Videos whose page cannot be fetched or
        whose cid cannot be located are skipped.
    """
    global cid_num
    xml_url = []
    for video_url in url_list:
        # Check the limit before fetching so no request is wasted once the
        # quota has been reached (including on later calls).
        if cid_num >= max_cid_num:
            break
        try:
            response = requests.get(
                url=video_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue

        html_str = response.content.decode('utf-8', errors='replace')
        match = _CID_PATTERN.search(html_str)
        if match is None:
            # Page layout not recognised; there is no cid to build a URL from.
            continue
        cid = match.group(1)
        print(cid)
        xml_url.append(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        cid_num += 1
    print("获取弹幕链接成功")
    return xml_url


def main():
    """Walk the search pages and append every danmaku text to ``OUTPUT_FILE``."""
    # Open the output once instead of once per danmaku line.
    with open(OUTPUT_FILE, mode='a', encoding='utf-8') as f:
        for page in SEARCH_PAGES:
            if cid_num >= MAX_VIDEOS:
                break
            try:
                html_data = Get_html_data(page)
            except requests.RequestException as e:
                print(f"请求失败: {e}")
                continue

            url_list = Get_Bv(html_data)
            for danmaku_url in Get_cid(url_list, max_cid_num=MAX_VIDEOS):
                try:
                    response3 = requests.get(
                        url=danmaku_url,
                        headers=headers,
                        timeout=REQUEST_TIMEOUT,
                    )
                    response3.raise_for_status()
                except requests.RequestException as e:
                    print(f"请求失败: {e}")
                    continue
                # The danmaku endpoint serves UTF-8 XML; set it explicitly
                # rather than relying on charset guessing.
                response3.encoding = 'utf-8'
                f.writelines(
                    f"{text}\n"
                    for text in _DANMAKU_PATTERN.findall(response3.text)
                )


if __name__ == '__main__':
    main()