From 88d834cb96990fbbf003c24a4987d53a818cf536 Mon Sep 17 00:00:00 2001 From: pxf746fmv Date: Wed, 18 Sep 2024 17:35:05 +0800 Subject: [PATCH] =?UTF-8?q?Delete=20'=E8=BD=AF=E5=B7=A5=E4=B8=AA=E4=BA=BA?= =?UTF-8?q?=E4=BD=9C=E4=B8=9A=E2=80=94=E2=80=94=E7=88=AC=E8=99=AB=E4=B8=BB?= =?UTF-8?q?=E7=A8=8B=E5=BA=8F.py'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 软工个人作业——爬虫主程序.py | 125 --------------------- 1 file changed, 125 deletions(-) delete mode 100644 软工个人作业——爬虫主程序.py diff --git a/软工个人作业——爬虫主程序.py b/软工个人作业——爬虫主程序.py deleted file mode 100644 index a876a8c..0000000 --- a/软工个人作业——爬虫主程序.py +++ /dev/null @@ -1,125 +0,0 @@ -#软工个人作业——爬虫主程序 -#(结果为文本txt) -import requests # 发送请求 -import re # 正则表达式,用于数据清洗 - - -#顺序是:获取搜索结果视频的bv_id----->获取视频的cid----->获取视频的api网址----->得到弹幕文本“all_danmaku.txt” -#最终调用get_search函数获取10页的视频,最多300个视频,输出output.txt来确认每个视频的cid参数, -# 最后将所有的弹幕内容按行写入“all_danmaku.txt”当中 - -def get_cid(bv_id): - url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv_id}" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - } - response = requests.get(url, headers=headers) - data = response.json() - if data['code'] == 0: - return data['data']['cid'] - else: - print("Error:", data['message']) - return None - -def get_danmaku(cid): - url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}' - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', - 'Referer': 'https://www.bilibili.com/' - } - cookies = { - 'cookie_name': 'cookie_value' # 请替换为实际的cookie值 - } - response = requests.get(url, headers=headers, cookies=cookies) - response.encoding = 'utf-8' - html_data = response.text - content_list = re.findall('(.*?)', html_data) - return '\n'.join(content_list) - - - -def get_search(v_keyword, v_max_page, v_out_file, danmaku_file): - with open(v_out_file, 'w', encoding='utf-8') as f, open(danmaku_file, 'w', encoding='utf-8') as df: - video_count = 0 - for page in range(1, v_max_page + 1): - if video_count >= 300: - break - print('开始爬取第{}页'.format(page)) - # 必要的请求地址 - url = 'https://api.bilibili.com/x/web-interface/wbi/search/type' - headers = { - 'accept': 'application/json, text/plain, */*', - 'accept-encoding': 'gzip, deflate, br', - 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', - 'cookie': "LIVE_BUVID=AUTO7916121615720683; i-wanna-go-back=-1; buvid_fp_plain=undefined; CURRENT_BLACKGAP=0; is-2022-channel=1; hit-new-style-dyn=1; enable_web_push=DISABLE; dy_spec_agreed=1; rpdid=|(J~|Ru|lJ)k0J'u~|JJumJl); buvid3=8A7F6EAD-00B3-3465-4FEB-A3D76D7D21C787048infoc; b_nut=1706790987; b_ut=5; buvid4=A02BC0EE-6193-B757-5840-72EB7E46883B06616-022012016-hP2UaKuRV%2Bk15H%2FNkVSjAQ%3D%3D; _uuid=25F7DC38-4EDF-9B108-FEA4-CE41102C91DB353019infoc; header_theme_version=CLOSE; hit-dyn-v2=1; DedeUserID=1532836416; DedeUserID__ckMd5=d9fd4d5e1ce0c6b9; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; fingerprint=e98ddd5011762c73f4a1ef10ff98d407; CURRENT_QUALITY=80; CURRENT_FNVAL=4048; SESSDATA=cc51c9ec%2C1741942750%2C04840%2A91CjAvMYw5mnEXum0quVkRd4XUH2R0uj3rS_kZkCCh5dGd-fimmg0rriz85NYy52_bvDUSVmIteEdFeDZmb01NQVNJX2xGNUR2U09XbGdXY2dmNjcwOGJTVW1DQVZfUFFUb2prcmc5ajdYbm9Vd2pOQUZobDdWQ1Vyb0l2YUNJTXlBYmlJVFhoYVFRIIEC; bili_jct=0dd6a38f2af2e487d1c68e0c7eae1002; PVID=1; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzc2MDIsImlhdCI6MTcyNjQ3ODM0MiwicGx0IjotMX0.4mCBE7C6g_7heMqP7kDsqHezof6ROslNUo8c_nClCEE; bili_ticket_expires=1726737542; sid=69rwhw9r; buvid_fp=e98ddd5011762c73f4a1ef10ff98d407; b_lsid=265810EA3_192002B0498; bp_t_offset_1532836416=978142612269563904; home_feed_column=4; browser_resolution=545-706", # 这里填入自己的cookie - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0', - 'referer': "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=24".format(v_keyword), - 'origin': 'https://search.bilibili.com', - 'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-site' - } - # 根据网页获取的请求参数 - params = { - 'category_id':'', - 'search_type': 'video', - 'ad_resource': '5654', - '__refresh__': 'true', - '_extra': '', - 'context': '', - 'page': page, - 'page_size': '42', - 'pubtime_begin_s': '0', - 'pubtime_end_s': '0', - 'from_source':'', - 'from_spmid': '333.337', - 'platform': 'pc', - 'highlight': '1', - 'single_column':'0', - 'keyword': '2024巴黎奥运会', - 'qv_id': '1P0f9h8c7OOA9SpNbY7Rs6XaEUa80p13', - 'source_tag': '3', - 'gaia_vtoken':'', - 'dynamic_offset': '24', - 'web_location': '1430654', - 'w_rid': 'e0021a1eb2c68a9df2fec8a5a287352e', - 'wts': '1726311718', - } - - # 向页面发送请求 - r = requests.get(url, headers=headers, params=params) - # 查看响应码 - print(r.status_code) - if r.status_code != 200: - print(f"请求失败,状态码: {r.status_code}") - continue - - j_data = r.json() - if 'data' not in j_data or 'result' not in j_data['data']: - print("响应中没有找到数据") - continue - - data_list = j_data['data']['result'] - print('数据长度:', len(data_list)) - - for data in data_list: - if video_count >= 300: - break - mid = data['mid'] - bvid = data['bvid'] - cid = get_cid(bvid) - if cid: - f.write(f'{mid},{bvid},{cid}\n') - print(f'mid: {mid}, bvid: {bvid}, cid: {cid}') - # 获取弹幕并保存到同一个文件中 - danmaku_content = get_danmaku(cid) - df.write(f'弹幕 for cid {cid}:\n{danmaku_content}\n\n') - video_count += 1 - else: - print(f'Failed to get cid for bvid: {bvid}') - -# 调用函数 -get_search('2024巴黎奥运会', 10, 'output.txt', 'all_danmaku.txt')