You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123 lines
5.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#软工个人作业——爬虫主程序
#结果为文本txt
import requests # 发送请求
import re # 正则表达式,用于数据清洗
def get_cid(bv_id):
    """Look up the ``cid`` of a Bilibili video from its BV id.

    Queries the ``x/web-interface/view`` endpoint and reads ``data.cid``
    from the JSON reply.

    Args:
        bv_id: BV identifier of the video, sent as the ``bvid`` query
            parameter.

    Returns:
        The ``cid`` value from the response, or ``None`` when the request
        fails, the body is not valid JSON, or the API reports a non-zero
        ``code``. Failures are reported on stdout with an ``Error:`` prefix.
    """
    url = "https://api.bilibili.com/x/web-interface/view"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        # ``params`` URL-encodes the id; the timeout bounds the wait because
        # requests has no default timeout and would otherwise block forever.
        response = requests.get(
            url, params={'bvid': bv_id}, headers=headers, timeout=10
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding errors on older requests versions.
        print("Error:", exc)
        return None

    if data.get('code') != 0:
        print("Error:", data.get('message'))
        return None
    # A "success" reply without the expected payload is treated as a miss.
    return (data.get('data') or {}).get('cid')
def get_danmaku(cid):
    """Download the danmaku (bullet comments) of a video as plain text.

    Requests the ``x/v1/dm/list.so`` endpoint and extracts the text of every
    ``<d p="...">...</d>`` element from the response body.

    Args:
        cid: Value sent as the ``oid`` query parameter.

    Returns:
        The extracted comment texts joined with newlines, with XML character
        entities (``&amp;``, ``&lt;`` ...) decoded. An empty string is
        returned when the request fails or no ``<d>`` element is found.
    """
    from html import unescape  # local import: only this function needs it

    url = 'https://api.bilibili.com/x/v1/dm/list.so'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.bilibili.com/'
    }
    cookies = {
        'cookie_name': 'cookie_value'  # placeholder: replace with a real cookie
    }
    try:
        # Bound the wait: requests has no default timeout.
        response = requests.get(
            url, params={'oid': cid}, headers=headers, cookies=cookies,
            timeout=10,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print("Error:", exc)
        return ''

    # Force UTF-8 instead of relying on requests' charset detection.
    response.encoding = 'utf-8'
    content_list = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
    # The regex returns raw markup text, so entities are still escaped here.
    return '\n'.join(unescape(text) for text in content_list)
def get_search(v_keyword, v_max_page, v_out_file, danmaku_file, max_videos=300):
    """Search Bilibili videos by keyword and save their ids and danmaku.

    Result pages ``1..v_max_page`` of the video search endpoint are requested
    in order. For every hit the ``cid`` is resolved with ``get_cid``; when
    that succeeds a ``mid,bvid,cid`` line is written to ``v_out_file`` and the
    text returned by ``get_danmaku`` is appended to ``danmaku_file``. Pages
    that fail (network error, non-200 status, unusable JSON) are skipped.

    Args:
        v_keyword: Search keyword, sent as the ``keyword`` query parameter.
        v_max_page: Last result page to request (inclusive, starting at 1).
        v_out_file: Path of the id list file; overwritten.
        danmaku_file: Path of the danmaku text file; overwritten.
        max_videos: Stop once this many videos have been saved. Defaults to
            300, the cap that used to be hard-coded.
    """
    from urllib.parse import quote  # local import: only this function needs it

    url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    # Headers do not depend on the page, so they are built once.
    headers = {
        'accept': 'application/json, text/plain, */*',
        # 'br' is intentionally not advertised: requests can only decode
        # Brotli when an optional package is installed, and an undecoded body
        # would make r.json() fail.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'cookie': "",  # fill in your own cookie here
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
        # The keyword is percent-encoded into the referer; previously the
        # string had no placeholder, so .format(v_keyword) did nothing.
        'referer': f"https://search.bilibili.com/all?keyword={quote(v_keyword)}&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=24",
        'origin': 'https://search.bilibili.com',
        'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site'
    }
    # Query parameters captured from the web page; 'page' is overridden for
    # every request and only listed here to keep its position in the query.
    base_params = {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': '5654',
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': 1,
        'page_size': '42',
        'pubtime_begin_s': '0',
        'pubtime_end_s': '0',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': v_keyword,  # was hard-coded, which ignored the argument
        'qv_id': '1P0f9h8c7OOA9SpNbY7Rs6XaEUa80p13',
        'source_tag': '3',
        'gaia_vtoken': '',
        'dynamic_offset': '24',
        'web_location': '1430654',
        # NOTE(review): w_rid/wts look like a captured request signature and
        # timestamp; presumably they must be regenerated for other keywords
        # or pages - confirm against the API's signing rules.
        'w_rid': 'e0021a1eb2c68a9df2fec8a5a287352e',
        'wts': '1726311718',
    }

    with open(v_out_file, 'w', encoding='utf-8') as f, \
            open(danmaku_file, 'w', encoding='utf-8') as df:
        video_count = 0
        for page in range(1, v_max_page + 1):
            if video_count >= max_videos:
                break
            print('开始爬取第{}'.format(page))

            try:
                # Bound the wait: requests has no default timeout.
                r = requests.get(
                    url,
                    headers=headers,
                    params={**base_params, 'page': page},
                    timeout=10,
                )
            except requests.exceptions.RequestException as exc:
                print(f"请求失败: {exc}")
                continue

            # Show the response status code
            print(r.status_code)
            if r.status_code != 200:
                print(f"请求失败,状态码: {r.status_code}")
                continue

            try:
                j_data = r.json()
            except ValueError as exc:
                print(f"响应不是有效的JSON: {exc}")
                continue

            # 'data' may be missing or null, e.g. when the API rejects the
            # request; treat both the same way as a missing 'result'.
            payload = j_data.get('data') if isinstance(j_data, dict) else None
            data_list = payload.get('result') if isinstance(payload, dict) else None
            if data_list is None:
                print("响应中没有找到数据")
                continue
            print('数据长度:', len(data_list))

            for data in data_list:
                if video_count >= max_videos:
                    break
                mid = data['mid']
                bvid = data['bvid']
                cid = get_cid(bvid)
                if not cid:
                    print(f'Failed to get cid for bvid: {bvid}')
                    continue
                f.write(f'{mid},{bvid},{cid}\n')
                print(f'mid: {mid}, bvid: {bvid}, cid: {cid}')
                # Fetch the danmaku and append it to the shared danmaku file
                danmaku_content = get_danmaku(cid)
                df.write(f'弹幕 for cid {cid}:\n{danmaku_content}\n\n')
                video_count += 1
# Run the crawler: search result pages 1-10 for the keyword, writing the
# id list to output.txt and the danmaku text to all_danmaku.txt.
get_search(
    v_keyword='2024巴黎奥运会',
    v_max_page=10,
    v_out_file='output.txt',
    danmaku_file='all_danmaku.txt',
)