You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
102201401/巴黎奥运会前300弹幕.py

85 lines
4.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import requests # “HTTP for humans”
# Bilibili's anti-scraping checks are fairly strict, so every request is sent
# with the browser-like headers below.
# NOTE(review): the 'cookie' value contains SESSDATA / bili_jct / bili_ticket
# entries, which look like live account session credentials committed to the
# repository — presumably these should be rotated and loaded from the
# environment instead; confirm with the owner.
headers = {
'authority': 'data.bilibili.com',
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'cookie': 'buvid4=8AF878ED-E44F-755F-11D3-64DA64F69E4116221-022090315-m4T90WVXeahcRPohl2liLg%3D%3D; DedeUserID=1298162992; DedeUserID__ckMd5=7cba268dad61786b; FEED_LIVE_VERSION=V8; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkY|mmJ0Ju~|JY)l|kR; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid_fp=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid3=E4939679-CBBE-5754-364A-86E62CA46B0738242infoc; b_nut=1725540338; _uuid=7DCC10D99-EB78-76101-85FC-FD82836ACF8342265infoc; CURRENT_BLACKGAP=0; CURRENT_FNVAL=4048; bp_t_offset_1298162992=974832433729896448; SESSDATA=d762ed30%2C1741443642%2C9f2d3%2A92CjBn89a4wlq2S-eFP4pMC8rPS00G4v4cFcPyg7qHVMmfzGHjkeYhz10GAnYhyvOxyTASVkI2MlVHdW5hb1d4WUIxZDFCSlJhV0dGM1NzQjVZeFFWallRU0kyYjBnVEQ4TkRKcEs2NmN3a3lvWGZWMHd6Uk1oeE1MZ0tyVlJCbWJDWm5SMV9najFnIIEC; bili_jct=a391cc5100a136dacf6586abfaab24a6; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTEzMzYsImlhdCI6MTcyNTg5MjA3NiwicGx0IjotMX0.r4K4_ABZd5na44SVZJdrQ6f11XK62AQ1wnr2f998it8; bili_ticket_expires=1726151276; bsource=search_bing; b_lsid=E265964D_191D78437CC; sid=5b9r21ku; home_feed_column=5; browser_resolution=1455-790',
'origin': 'https://www.bilibili.com',
'referer': 'https://www.bilibili.com/video/BV1G2421f7cE/?spm_id_from=333.1007.tianma.2-2-5.click&vd_source=a21d0efaab8782a1839893735379d59a',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
# NOTE(review): `ye` is not referenced anywhere else in the visible source —
# presumably a leftover page counter; confirm before removing.
ye = 1
# Running count of danmaku links produced so far; Get_cid increments it through
# `global cid_num` and compares it against its `max_cid_num` cap.
cid_num = 0
# Fetch the raw response text of one search-result page.
def Get_html_data(page, timeout=10):
    """Download one page of Bilibili video-search results.

    The search keyword is fixed in the URL (URL-encoded "2024巴黎奥运会") and
    the request is sent with the module-level ``headers``.

    Args:
        page: 1-based page number interpolated into the ``page`` query field.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?__refresh__=true&_extra=&context=&page={page}&page_size=34&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=jOfkBnzsmlKLpeoEa9g32wjiyl1J0Up7&ad_resource=5654&source_tag=3&gaia_vtoken=&category_id=&search_type=video&'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 so the body is decoded consistently regardless of what the
    # server advertises.
    response.encoding = 'utf-8'
    html_data = response.text
    # Only claim success when the server actually answered with a 2xx/3xx
    # status; the body is still returned so the caller's behaviour on an
    # error page (no BV ids found) is unchanged.
    if response.ok:
        print('成功获取源码')
    else:
        print(f'获取源码失败: HTTP {response.status_code}')
    return html_data
# Extract every BV id from the search-result text and build the video URLs.
def Get_Bv(html_data):
    """Return the video page URLs for all BV ids found in *html_data*.

    A BV id is recognised as the value of a ``"bvid"`` field that is
    immediately followed by a ``"title"`` field. The capture is restricted to
    characters other than ``"`` so a match can never run across several fields
    and swallow an unrelated ``"bvid"`` entry, and the pattern carries no
    leading/trailing wildcards, which keeps the scan linear.

    Args:
        html_data: Response text of a search-result page.

    Returns:
        A list of ``http://www.bilibili.com/video/<bvid>`` strings, in the
        order the ids appear; empty when nothing matches.
    """
    bvids = re.findall(r'"bvid":"([^"]+)","title":"', html_data)
    url_list = ["http://www.bilibili.com/video/" + bvid for bvid in bvids]
    print("成功获取bv号")
    return url_list
# Resolve each video URL to its cid and build the danmaku XML URLs.
def Get_cid(url_list, max_cid_num=300, headers=headers):
    """Build the danmaku (bullet-comment) XML URL for each video page.

    Each page in *url_list* is downloaded and its cid is extracted with a
    regular expression. Every cid found yields one
    ``https://api.bilibili.com/x/v1/dm/list.so?oid=<cid>`` link. The
    module-level ``cid_num`` counter is shared across calls, so the cap
    applies to the whole run rather than to a single page of results.

    Args:
        url_list: Video page URLs, as produced by ``Get_Bv``.
        max_cid_num: Upper bound on the total number of links produced across
            all calls; checked before each download so no request is wasted
            once the cap is reached.
        headers: HTTP headers sent with every request.

    Returns:
        A list with one danmaku URL per video whose cid could be extracted
        (possibly empty). Pages that fail to download or contain no cid are
        skipped.
    """
    global cid_num
    cid_pattern = '"bvid\":\".*?"cids\":{\"1\":(.*?)}},\"BV.*?\":{\"aid\":'
    xml_url = []
    for index in url_list:
        if cid_num >= max_cid_num:
            break
        try:
            response = requests.get(url=index, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue
        html_str = response.content.decode("utf-8", errors="replace")
        matches = re.findall(cid_pattern, html_str)
        if not matches:
            # No cid on this page: skip it instead of emitting a bogus
            # "oid=[]" link and counting it towards the cap.
            print(f"未找到cid: {index}")
            continue
        cid = matches[0]
        print(cid)
        # Accumulate every link; rebinding the name to a string would keep
        # only the last video's URL.
        xml_url.append(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        cid_num += 1
    print("获取弹幕链接成功")
    return xml_url


if __name__ == '__main__':
    # Walk the search-result pages, collect danmaku links for up to
    # `max_videos` videos, and append every danmaku text to the output file.
    max_videos = 300
    for page in range(1, 34):
        if cid_num >= max_videos:
            # Cap reached: no point in fetching further search pages.
            break
        html_data = Get_html_data(page)
        url_list = Get_Bv(html_data)
        xml_url = Get_cid(url_list, max_cid_num=max_videos)
        for danmaku_url in xml_url:
            try:
                # Request the individual link, not the whole list.
                response3 = requests.get(url=danmaku_url, headers=headers, timeout=10)
            except requests.RequestException as e:
                print(f"请求失败: {e}")
                continue
            response3.encoding = response3.apparent_encoding
            data_list = re.findall('<d p=".*?">(.*?)</d>', response3.text)
            if not data_list:
                continue
            # Open the file once per video instead of once per danmaku line.
            with open('巴黎弹幕.txt', mode='a', encoding='utf-8') as f:
                f.writelines(f"{text}\n" for text in data_list)