"""Bilibili scraper: search videos by keyword and dump their danmaku to text.

Software-engineering individual assignment -- crawler main program (results
are written as plain-text .txt files).

Recovered from the patch that creates ``bilibili_scraper.py``
(commit 2db6e5bb8e11d4e2cb4383a22c9ac94ea9db76f8, author pxf746fmv,
Wed, 18 Sep 2024 17:36:59 +0800).
"""
import re  # regular expressions, used to pull comment text out of the XML
from urllib.parse import quote

import requests  # HTTP client

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the crawl forever.
REQUEST_TIMEOUT = 10

_SEARCH_URL = 'https://api.bilibili.com/x/web-interface/wbi/search/type'

# The pattern in the recovered source was '(.*?)', which can only ever match
# empty strings; the surrounding tags were evidently lost.
# NOTE(review): the list.so endpoint is expected to return XML in which each
# comment is a <d p="...">text</d> element -- confirm against a live response.
_DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')


def get_cid(bv_id):
    """Return the cid reported by the view API for the video *bv_id*.

    Args:
        bv_id: the video's BV identifier, sent as the ``bvid`` query parameter.

    Returns:
        ``data['data']['cid']`` when the API answers with ``code == 0``;
        ``None`` (after printing the error) when the API reports a failure,
        the request itself fails, or the body is not valid JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(
            'https://api.bilibili.com/x/web-interface/view',
            params={'bvid': bv_id},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or non-JSON body: report it the same way as an
        # API-level error so callers only have to handle ``None``.
        print("Error:", exc)
        return None

    if data['code'] == 0:
        return data['data']['cid']
    print("Error:", data.get('message'))
    return None


def _extract_danmaku(xml_text):
    """Return the text of every ``<d p="...">`` element found in *xml_text*."""
    return _DANMAKU_RE.findall(xml_text)


def get_danmaku(cid):
    """Download the danmaku list for *cid* and return it as one string.

    Args:
        cid: the id passed to the endpoint as the ``oid`` query parameter.

    Returns:
        The extracted comment texts joined with newlines (an empty string
        when nothing matches).

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.bilibili.com/'
    }
    cookies = {
        'cookie_name': 'cookie_value'  # placeholder: replace with a real cookie
    }
    response = requests.get(
        'https://api.bilibili.com/x/v1/dm/list.so',
        params={'oid': cid},
        headers=headers,
        cookies=cookies,
        timeout=REQUEST_TIMEOUT,
    )
    # Force UTF-8 so the comment text is decoded consistently.
    response.encoding = 'utf-8'
    return '\n'.join(_extract_danmaku(response.text))


def _search_headers(v_keyword):
    """Build the browser-like request headers for a search on *v_keyword*."""
    return {
        'accept': 'application/json, text/plain, */*',
        # NOTE(review): advertising 'br' presumably requires brotli support to
        # be installed for requests to decode such a response -- confirm.
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'cookie': "",  # fill in your own cookie here
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
        # The keyword is percent-encoded into the referer; the original string
        # had no placeholder, so .format(v_keyword) silently did nothing.
        'referer': "https://search.bilibili.com/all?keyword={}&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=24".format(quote(v_keyword)),
        'origin': 'https://search.bilibili.com',
        'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site'
    }


def _search_params(v_keyword, page):
    """Build the query parameters for result page *page* of *v_keyword*."""
    # Request parameters as captured from the web page.
    return {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': '5654',
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page,
        'page_size': '42',
        'pubtime_begin_s': '0',
        'pubtime_end_s': '0',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': v_keyword,  # was hard-coded, ignoring the argument
        'qv_id': '1P0f9h8c7OOA9SpNbY7Rs6XaEUa80p13',
        'source_tag': '3',
        'gaia_vtoken': '',
        'dynamic_offset': '24',
        'web_location': '1430654',
        # NOTE(review): w_rid / wts look like a signature and timestamp copied
        # from a single captured request; presumably they go stale -- confirm.
        'w_rid': 'e0021a1eb2c68a9df2fec8a5a287352e',
        'wts': '1726311718',
    }


def get_search(v_keyword, v_max_page, v_out_file, danmaku_file, max_videos=300):
    """Search videos for *v_keyword* and save their ids and danmaku.

    For each search hit the cid is resolved with ``get_cid`` and the danmaku
    fetched with ``get_danmaku``. Videos whose cid or danmaku cannot be
    fetched are skipped and do not count towards *max_videos*.

    Args:
        v_keyword: search keyword.
        v_max_page: number of result pages to request (pages 1..v_max_page).
        v_out_file: path of the text file receiving one ``mid,bvid,cid`` line
            per video; it is overwritten.
        danmaku_file: path of the text file receiving every video's danmaku;
            it is overwritten.
        max_videos: stop after this many videos have been saved.
    """
    with open(v_out_file, 'w', encoding='utf-8') as f, \
            open(danmaku_file, 'w', encoding='utf-8') as df:
        headers = _search_headers(v_keyword)  # identical for every page
        video_count = 0
        for page in range(1, v_max_page + 1):
            if video_count >= max_videos:
                break
            print('开始爬取第{}页'.format(page))

            # Send the search request for this page.
            try:
                r = requests.get(
                    _SEARCH_URL,
                    headers=headers,
                    params=_search_params(v_keyword, page),
                    timeout=REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                print(f"请求失败: {exc}")
                continue

            # Show the response status code.
            print(r.status_code)
            if r.status_code != 200:
                print(f"请求失败,状态码: {r.status_code}")
                continue

            try:
                j_data = r.json()
            except ValueError:
                j_data = {}
            # ``data`` may be missing or null; treat both as "no results".
            data_list = (j_data.get('data') or {}).get('result')
            if data_list is None:
                print("响应中没有找到数据")
                continue

            print('数据长度:', len(data_list))

            for data in data_list:
                if video_count >= max_videos:
                    break
                mid = data['mid']
                bvid = data['bvid']
                cid = get_cid(bvid)
                if not cid:
                    print(f'Failed to get cid for bvid: {bvid}')
                    continue

                # Fetch the danmaku before writing anything so a failed
                # download does not leave a row without matching danmaku.
                try:
                    danmaku_content = get_danmaku(cid)
                except requests.RequestException as exc:
                    print(f'Failed to get danmaku for cid {cid}: {exc}')
                    continue

                f.write(f'{mid},{bvid},{cid}\n')
                print(f'mid: {mid}, bvid: {bvid}, cid: {cid}')
                # All videos' danmaku go into the same file.
                df.write(f'弹幕 for cid {cid}:\n{danmaku_content}\n\n')
                video_count += 1


if __name__ == "__main__":
    # Run the crawl only when executed as a script, not on import.
    get_search('2024巴黎奥运会', 10, 'output.txt', 'all_danmaku.txt')