|
|
|
|
#软工个人作业——爬虫主程序
|
|
|
|
|
#(结果为文本txt)
|
|
|
|
|
import html  # decode XML entities in danmaku text
import re  # regular expressions, used for data cleaning
from urllib.parse import quote  # percent-encode the search keyword

import requests  # send HTTP requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cid(bv_id):
    """Look up the cid of a Bilibili video from its BV id.

    Queries the ``x/web-interface/view`` endpoint and reads ``data.cid``
    from the JSON reply.

    Args:
        bv_id: BV identifier of the video to look up.

    Returns:
        The value of ``data['cid']`` when the API answers with
        ``code == 0``.  ``None`` when the request fails, the body is not
        valid JSON, or the API reports an error; the reason is printed.
    """
    url = "https://api.bilibili.com/x/web-interface/view"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        # Passing the id through ``params`` lets requests URL-encode it, and
        # the timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(
            url, params={'bvid': bv_id}, headers=headers, timeout=10
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON error does not derive from RequestException.
        print("Error:", exc)
        return None

    if data.get('code') == 0:
        return data['data']['cid']

    print("Error:", data.get('message'))
    return None
|
|
|
|
|
|
|
|
|
|
def get_danmaku(cid):
    """Download the danmaku (bullet comments) of one video as plain text.

    Fetches the XML danmaku list for ``cid`` from ``x/v1/dm/list.so`` and
    extracts the text content of every ``<d p="...">`` element.

    Args:
        cid: cid of the video whose danmaku should be fetched.

    Returns:
        The comment texts joined with newlines, with XML entities such as
        ``&amp;`` decoded back to the characters the viewer typed.  An empty
        string when no comment is found or the request fails (the error is
        printed in that case).
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.bilibili.com/'
    }
    cookies = {
        'cookie_name': 'cookie_value'  # placeholder: replace with a real cookie
    }

    try:
        # The timeout keeps one stalled download from blocking the crawl.
        response = requests.get(
            url, params={'oid': cid}, headers=headers, cookies=cookies,
            timeout=10,
        )
    except requests.RequestException as exc:
        print("Error:", exc)
        return ''

    # Force UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    content_list = re.findall(r'<d p="[^"]*">(.*?)</d>', response.text)

    # The payload is XML, so the captured text is still entity-escaped.
    return '\n'.join(html.unescape(text) for text in content_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_search(v_keyword, v_max_page, v_out_file, danmaku_file, max_videos=300):
    """Search Bilibili for videos and save their ids and danmaku to text files.

    For every search hit the cid is resolved with ``get_cid`` and the
    danmaku is downloaded with ``get_danmaku``.  Both output files are
    opened in write mode, so existing content is overwritten.

    Args:
        v_keyword: Search keyword sent to the search API.
        v_max_page: Number of result pages to request (pages 1..v_max_page).
        v_out_file: Path of the text file receiving one ``mid,bvid,cid``
            line per saved video.
        danmaku_file: Path of the text file receiving one danmaku section
            per saved video.
        max_videos: Stop once this many videos have been saved.  Defaults
            to 300, the limit that used to be hard-coded.
    """
    with open(v_out_file, 'w', encoding='utf-8') as f, \
            open(danmaku_file, 'w', encoding='utf-8') as df:
        video_count = 0
        for page in range(1, v_max_page + 1):
            if video_count >= max_videos:
                break
            print('开始爬取第{}页'.format(page))

            data_list = _fetch_search_page(v_keyword, page)
            if data_list is None:
                # Request failed or carried no results; try the next page.
                continue
            print('数据长度:', len(data_list))

            for data in data_list:
                if video_count >= max_videos:
                    break
                mid = data['mid']
                bvid = data['bvid']
                cid = get_cid(bvid)
                if not cid:
                    print(f'Failed to get cid for bvid: {bvid}')
                    continue

                f.write(f'{mid},{bvid},{cid}\n')
                print(f'mid: {mid}, bvid: {bvid}, cid: {cid}')
                # Fetch the danmaku and append it to the shared danmaku file.
                danmaku_content = get_danmaku(cid)
                df.write(f'弹幕 for cid {cid}:\n{danmaku_content}\n\n')
                video_count += 1


def _fetch_search_page(v_keyword, page):
    """Request one page of video search results.

    Returns the list found under ``data.result`` in the JSON reply, or
    ``None`` (after printing the reason) when the request fails, the status
    is not 200, or the reply does not contain a result list.
    """
    # Endpoint used by the web search page.
    url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    headers = {
        'accept': 'application/json, text/plain, */*',
        # "br" is deliberately not advertised: requests can only decode
        # Brotli when an optional package is installed, and an undecoded
        # body would make the JSON parsing below fail.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'cookie': "",  # fill in your own cookie here
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
        # The keyword is percent-encoded because header values must not
        # contain raw non-ASCII text.
        'referer': (
            f"https://search.bilibili.com/all?keyword={quote(v_keyword)}"
            "&from_source=webtop_search&spm_id_from=333.1007"
            "&search_source=3&page=2&o=24"
        ),
        'origin': 'https://search.bilibili.com',
        'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }
    # Query parameters copied from a request observed on the web page.
    # NOTE(review): qv_id / w_rid / wts look like values captured from one
    # specific browser request (presumably a signature and its timestamp);
    # confirm the API still accepts them for other keywords and pages.
    params = {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': '5654',
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page,
        'page_size': '42',
        'pubtime_begin_s': '0',
        'pubtime_end_s': '0',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': v_keyword,
        'qv_id': '1P0f9h8c7OOA9SpNbY7Rs6XaEUa80p13',
        'source_tag': '3',
        'gaia_vtoken': '',
        'dynamic_offset': '24',
        'web_location': '1430654',
        'w_rid': 'e0021a1eb2c68a9df2fec8a5a287352e',
        'wts': '1726311718',
    }

    try:
        # The timeout keeps one stalled request from hanging the crawl.
        r = requests.get(url, headers=headers, params=params, timeout=10)
    except requests.RequestException as exc:
        print(f"请求失败: {exc}")
        return None

    # Show the HTTP status code of the reply.
    print(r.status_code)
    if r.status_code != 200:
        print(f"请求失败,状态码: {r.status_code}")
        return None

    try:
        j_data = r.json()
    except ValueError:
        # Body was not JSON; treat it like a reply without data.
        j_data = None

    payload = j_data.get('data') if isinstance(j_data, dict) else None
    result = payload.get('result') if isinstance(payload, dict) else None
    if not isinstance(result, list):
        print("响应中没有找到数据")
        return None
    return result
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the crawl only when this file is executed directly,
# so importing the module does not trigger network requests or overwrite the
# output files.
if __name__ == '__main__':
    get_search('2024巴黎奥运会', 10, 'output.txt', 'all_danmaku.txt')
|
|
|
|
|
|