You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
102201401/巴黎奥运会前300弹幕.py

85 lines
4.6 KiB

import re
import requests # “HTTP for humans”
# Bilibili's anti-scraping checks are strict, so every request made by this
# script carries the browser-like header fields below.
# NOTE(review): the 'cookie' value embeds what look like account session
# credentials (SESSDATA, bili_jct, bili_ticket). Secrets committed to source
# should be rotated and loaded from the environment or a local config file
# instead -- confirm with the account owner.
# NOTE(review): 'authority' names data.bilibili.com while the requests in this
# file target api.bilibili.com and www.bilibili.com -- confirm it is needed.
headers = {
'authority': 'data.bilibili.com',
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'cookie': 'buvid4=8AF878ED-E44F-755F-11D3-64DA64F69E4116221-022090315-m4T90WVXeahcRPohl2liLg%3D%3D; DedeUserID=1298162992; DedeUserID__ckMd5=7cba268dad61786b; FEED_LIVE_VERSION=V8; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkY|mmJ0Ju~|JY)l|kR; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid_fp=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid3=E4939679-CBBE-5754-364A-86E62CA46B0738242infoc; b_nut=1725540338; _uuid=7DCC10D99-EB78-76101-85FC-FD82836ACF8342265infoc; CURRENT_BLACKGAP=0; CURRENT_FNVAL=4048; bp_t_offset_1298162992=974832433729896448; SESSDATA=d762ed30%2C1741443642%2C9f2d3%2A92CjBn89a4wlq2S-eFP4pMC8rPS00G4v4cFcPyg7qHVMmfzGHjkeYhz10GAnYhyvOxyTASVkI2MlVHdW5hb1d4WUIxZDFCSlJhV0dGM1NzQjVZeFFWallRU0kyYjBnVEQ4TkRKcEs2NmN3a3lvWGZWMHd6Uk1oeE1MZ0tyVlJCbWJDWm5SMV9najFnIIEC; bili_jct=a391cc5100a136dacf6586abfaab24a6; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTEzMzYsImlhdCI6MTcyNTg5MjA3NiwicGx0IjotMX0.r4K4_ABZd5na44SVZJdrQ6f11XK62AQ1wnr2f998it8; bili_ticket_expires=1726151276; bsource=search_bing; b_lsid=E265964D_191D78437CC; sid=5b9r21ku; home_feed_column=5; browser_resolution=1455-790',
'origin': 'https://www.bilibili.com',
'referer': 'https://www.bilibili.com/video/BV1G2421f7cE/?spm_id_from=333.1007.tianma.2-2-5.click&vd_source=a21d0efaab8782a1839893735379d59a',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
# Not referenced by any code visible in this file.
# NOTE(review): possibly a leftover page counter -- confirm before removing.
ye = 1
# Module-level counter that Get_cid updates via `global` and compares against
# its max_cid_num cap.
cid_num = 0
# Fetch the raw response body of one search-results page.
def Get_html_data(page, timeout=10):
    """Fetch one page of Bilibili video-search results for "2024巴黎奥运会".

    Despite the name, the endpoint queried is a web API; its response body is
    returned as plain text so callers can run regex extraction on it.

    Args:
        page: Page number substituted into the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block forever on a stalled connection.

    Returns:
        The response body decoded as UTF-8 text. The body is returned even for
        HTTP error statuses so that callers keep their best-effort behaviour.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?__refresh__=true&_extra=&context=&page={page}&page_size=34&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=jOfkBnzsmlKLpeoEa9g32wjiyl1J0Up7&ad_resource=5654&source_tag=3&gaia_vtoken=&category_id=&search_type=video&'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 so the body is decoded consistently regardless of what the
    # server advertises.
    response.encoding = 'utf-8'
    # Only report success when the server actually answered with a non-error
    # status; previously the success message was printed unconditionally.
    if response.ok:
        print('成功获取源码')
    else:
        print(f'获取源码失败: HTTP {response.status_code}')
    return response.text
# Extract BV ids from the search response text and build video page URLs.
def Get_Bv(html_data):
    """Return a video-page URL for every BV id found in *html_data*.

    A BV id is whatever sits between ``"bvid":"`` and ``","title":"`` in the
    text. Every match is returned, in order of appearance; this function
    applies no cap on the number of results and does not de-duplicate.

    Args:
        html_data: Response text to scan.

    Returns:
        A list of ``http://www.bilibili.com/video/<bvid>`` strings (empty when
        nothing matches).
    """
    bvids = re.findall('.*?"bvid":"(.*?)","title":".*?', html_data)
    video_urls = ["http://www.bilibili.com/video/" + bvid for bvid in bvids]
    print("成功获取bv号")
    return video_urls
# Seconds to wait on any single HTTP request before giving up; without a
# timeout, requests may block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

# Captures the cid stored under key "1" of the "cids" object that follows a
# "bvid" entry in the text of a video page.
_CID_PATTERN = re.compile('"bvid\":\".*?"cids\":{\"1\":(.*?)}},\"BV.*?\":{\"aid\":')


# Resolve each video page URL to its cid and build the danmaku XML URL from it.
def Get_cid(url_list, max_cid_num=300, headers=headers):
    """Build danmaku (bullet-comment) XML URLs for the given video pages.

    Each page in *url_list* is downloaded, its cid is extracted with
    ``_CID_PATTERN``, and a ``list.so?oid=<cid>`` URL is produced. The
    module-level ``cid_num`` counter is incremented for every URL produced, so
    the *max_cid_num* cap applies across successive calls.

    Args:
        url_list: Iterable of video page URLs.
        max_cid_num: Stop once the global ``cid_num`` counter reaches this
            value. If the cap is already reached on entry, nothing is fetched.
        headers: HTTP headers sent with every page request.

    Returns:
        A list of danmaku XML URLs, one per video whose cid was found. (The
        previous implementation overwrote its list with a string on every
        iteration and so returned only the last URL.)
    """
    global cid_num
    xml_url = []
    for index in url_list:
        # Check the cap before doing any work so that calls made after the
        # quota is exhausted do not fetch one extra video each.
        if cid_num >= max_cid_num:
            break
        try:
            response = requests.get(url=index, headers=headers, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue
        # A stray undecodable byte should not abort the whole crawl.
        html_str = response.content.decode(errors="replace")
        match = _CID_PATTERN.search(html_str)
        if match is None:
            # Previously a miss produced the bogus URL "...?oid=[]".
            print(f"未找到cid: {index}")
            continue
        cid = match.group(1)
        print(cid)
        xml_url.append(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        cid_num += 1
    print("获取弹幕链接成功")
    return xml_url


if __name__ == '__main__':
    # Total number of videos whose danmaku should be collected.
    max_videos = 300
    for page in range(1, 34):
        # Stop paging through search results once the quota is filled.
        if cid_num >= max_videos:
            break
        try:
            html_data = Get_html_data(page)
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue
        url_list = Get_Bv(html_data)
        # Fetch each danmaku URL exactly once. The old loop iterated over the
        # characters of a single URL string and re-requested that same URL on
        # every iteration.
        for danmaku_url in Get_cid(url_list, max_cid_num=max_videos):
            try:
                response3 = requests.get(url=danmaku_url, headers=headers, timeout=_REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f"请求失败: {e}")
                continue
            response3.encoding = response3.apparent_encoding
            data_list = re.findall('<d p=".*?">(.*?)</d>', response3.text)
            if not data_list:
                continue
            # Append one video's danmaku per open() instead of reopening the
            # file for every single line; progress is still persisted after
            # each video.
            with open('巴黎弹幕.txt', mode='a', encoding='utf-8') as f:
                f.writelines(f"{text}\n" for text in data_list)