import os
import re

import requests  # “HTTP for humans”

# Bilibili's anti-scraping checks are fairly strict, so every request carries
# the browser-like header set below.
#
# SECURITY NOTE(review): the fallback cookie contains what look like real
# session credentials (SESSDATA, bili_jct). Secrets should not live in source
# control; supply your own cookie through the BILIBILI_COOKIE environment
# variable. The literal is kept only as a backward-compatible fallback and
# should be rotated/removed.
_DEFAULT_COOKIE = 'buvid4=8AF878ED-E44F-755F-11D3-64DA64F69E4116221-022090315-m4T90WVXeahcRPohl2liLg%3D%3D; DedeUserID=1298162992; DedeUserID__ckMd5=7cba268dad61786b; FEED_LIVE_VERSION=V8; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkY|mmJ0Ju~|JY)l|kR; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid_fp=9cdc191abf68aa9b9a5bdfd9ea3ccd06; buvid3=E4939679-CBBE-5754-364A-86E62CA46B0738242infoc; b_nut=1725540338; _uuid=7DCC10D99-EB78-76101-85FC-FD82836ACF8342265infoc; CURRENT_BLACKGAP=0; CURRENT_FNVAL=4048; bp_t_offset_1298162992=974832433729896448; SESSDATA=d762ed30%2C1741443642%2C9f2d3%2A92CjBn89a4wlq2S-eFP4pMC8rPS00G4v4cFcPyg7qHVMmfzGHjkeYhz10GAnYhyvOxyTASVkI2MlVHdW5hb1d4WUIxZDFCSlJhV0dGM1NzQjVZeFFWallRU0kyYjBnVEQ4TkRKcEs2NmN3a3lvWGZWMHd6Uk1oeE1MZ0tyVlJCbWJDWm5SMV9najFnIIEC; bili_jct=a391cc5100a136dacf6586abfaab24a6; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTEzMzYsImlhdCI6MTcyNTg5MjA3NiwicGx0IjotMX0.r4K4_ABZd5na44SVZJdrQ6f11XK62AQ1wnr2f998it8; bili_ticket_expires=1726151276; bsource=search_bing; b_lsid=E265964D_191D78437CC; sid=5b9r21ku; home_feed_column=5; browser_resolution=1455-790'

headers = {
    'authority': 'data.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    # An empty or unset BILIBILI_COOKIE falls back to the original literal.
    'cookie': os.environ.get('BILIBILI_COOKIE') or _DEFAULT_COOKIE,
    'origin': 'https://www.bilibili.com',
    'referer': 'https://www.bilibili.com/video/BV1G2421f7cE/?spm_id_from=333.1007.tianma.2-2-5.click&vd_source=a21d0efaab8782a1839893735379d59a',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
}

# NOTE(review): `ye` is not referenced anywhere in the visible code; it is
# kept in case something outside this file reads it.
ye = 1

# Running count of videos handled by Get_cid. It is module-level state, so it
# keeps accumulating across successive Get_cid calls.
cid_num = 0

def Get_html_data(page, timeout=10):
    """Fetch the raw text of one Bilibili video-search result page.

    The search keyword is fixed in the URL (URL-encoded "2024巴黎奥运会");
    only the page number varies. The request is sent with the module-level
    ``headers`` dict.

    Args:
        page: Page number to request (interpolated into the ``page=`` query
            parameter).
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the script forever.

    Returns:
        The response body decoded as UTF-8. The HTTP status is not checked,
        so an error page is returned as-is.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?__refresh__=true&_extra=&context=&page={page}&page_size=34&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=jOfkBnzsmlKLpeoEa9g32wjiyl1J0Up7&ad_resource=5654&source_tag=3&gaia_vtoken=&category_id=&search_type=video&'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 so the body is decoded correctly regardless of what
    # requests guesses from the response headers.
    response.encoding = 'utf-8'
    html_data = response.text
    print('成功获取源码')
    return html_data

def Get_Bv(html_data):
    """Extract video BV ids from search-result text and build video page URLs.

    Every ``"bvid":"<id>"`` that is immediately followed by ``,"title":"`` is
    picked up. This function does not limit how many ids it returns; any cap
    on the number of videos has to be applied by the caller.

    Args:
        html_data: Text of a search response containing
            ``"bvid":"...","title":"..."`` pairs.

    Returns:
        A list of ``http://www.bilibili.com/video/<bvid>`` URLs, in order of
        appearance. Empty if no id is found.
    """
    # `[^"]+` keeps the capture inside one quoted value: an empty bvid or a
    # bvid not directly followed by "title" is skipped instead of producing a
    # bogus URL. No leading/trailing `.*?` is needed because findall already
    # scans the whole string.
    bvids = re.findall(r'"bvid":"([^"]+)","title":"', html_data)
    url_list = ["http://www.bilibili.com/video/" + bvid for bvid in bvids]
    print("成功获取bv号")
    return url_list

def Get_cid(url_list, max_cid_num=300, headers=headers, timeout=10):
    """Resolve video page URLs to danmaku (bullet-comment) XML URLs.

    Each video page is downloaded, its cid is pulled out of the embedded
    page data with a regular expression, and the cid is turned into a
    ``https://api.bilibili.com/x/v1/dm/list.so?oid=<cid>`` URL.

    The module-level counter ``cid_num`` is incremented for every URL that
    is produced. Because it is global, ``max_cid_num`` caps the total across
    all calls, not per call.

    Args:
        url_list: Iterable of video page URLs.
        max_cid_num: Stop once ``cid_num`` has reached this value.
        headers: HTTP headers sent with every request (defaults to the
            module-level ``headers`` as bound at definition time).
        timeout: Per-request timeout in seconds.

    Returns:
        A list of danmaku XML URLs, one per video whose cid was found.
        Videos whose page cannot be fetched or parsed are skipped.
    """
    global cid_num

    # Same pattern as before, written as a raw string with the literal braces
    # escaped so it cannot be misread as a quantifier.
    cid_pattern = re.compile(
        r'"bvid":".*?"cids":\{"1":(.*?)\}\},"BV.*?":\{"aid":'
    )

    xml_url = []
    for index in url_list:
        # Check the cap before fetching so no request is wasted once the
        # global limit has been reached.
        if cid_num >= max_cid_num:
            break

        try:
            response = requests.get(url=index, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue

        # Tolerate stray invalid bytes rather than aborting the whole run.
        html_str = response.content.decode('utf-8', errors='replace')
        match = cid_pattern.search(html_str)
        if match is None:
            # Without a cid there is no valid danmaku URL to build.
            print(f"未找到cid: {index}")
            continue

        cid = match.group(1)
        print(cid)
        # Append (the old code overwrote the list with a single string, so
        # only the last URL survived).
        xml_url.append(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        cid_num += 1

    print("获取弹幕链接成功")
    return xml_url


def main():
    """Crawl search pages 1-33 and append every danmaku line to 巴黎弹幕.txt."""
    max_videos = 300

    for page in range(1, 34):
        try:
            html_data = Get_html_data(page)
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            continue

        url_list = Get_Bv(html_data)
        xml_url = Get_cid(url_list, max_cid_num=max_videos)

        for j in xml_url:
            try:
                # Request the current URL `j`, not the whole `xml_url` list.
                response3 = requests.get(url=j, headers=headers, timeout=10)
                response3.raise_for_status()
            except requests.RequestException as e:
                print(f"请求失败: {e}")
                continue

            response3.encoding = response3.apparent_encoding
            data_list = re.findall('<d p=".*?">(.*?)</d>', response3.text)

            # Open the output once per video instead of once per line.
            with open('巴黎弹幕.txt', mode='a', encoding='utf-8') as f:
                f.writelines(f"{i}\n" for i in data_list)

        # Once the global cap is hit, further search pages would add nothing.
        if cid_num >= max_videos:
            break


if __name__ == '__main__':
    main()