From 2db6e5bb8e11d4e2cb4383a22c9ac94ea9db76f8 Mon Sep 17 00:00:00 2001
From: pxf746fmv <yyansheng144@qq.com>
Date: Wed, 18 Sep 2024 17:36:59 +0800
Subject: [PATCH] =?UTF-8?q?=E8=BD=AF=E5=B7=A5=E4=B8=AA=E4=BA=BA=E4=BD=9C?=
 =?UTF-8?q?=E4=B8=9A=E2=80=94=E2=80=94=E7=88=AC=E8=99=AB=E4=B8=BB=E7=A8=8B?=
 =?UTF-8?q?=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 bilibili_scraper.py | 122 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 bilibili_scraper.py
diff --git a/bilibili_scraper.py b/bilibili_scraper.py
new file mode 100644
index 0000000..8feb59d
--- /dev/null
+++ b/bilibili_scraper.py
@@ -0,0 +1,122 @@
+# Software engineering individual assignment: main crawler program
+# (results are written as plain-text .txt files)
+import html  # unescape XML entities in danmaku text
+import re  # regular expressions, used for data cleaning
+from urllib.parse import quote  # percent-encode the keyword for the Referer header
+
+import requests  # send HTTP requests
+
+
+def get_cid(bv_id):
+    """Look up the cid of a Bilibili video from its BV id.
+
+    Sends a GET request to the ``x/web-interface/view`` endpoint and reads
+    ``data.cid`` from the JSON reply.
+
+    Args:
+        bv_id: BV identifier of the video, sent as the ``bvid`` query
+            parameter.
+
+    Returns:
+        The ``cid`` value from the response, or ``None`` when the request
+        fails, the body is not valid JSON, or the API reports a non-zero
+        ``code``. The reason is printed in every ``None`` case.
+    """
+    url = "https://api.bilibili.com/x/web-interface/view"
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    }
+    try:
+        # A timeout keeps one stalled connection from hanging the whole crawl;
+        # passing ``params`` lets requests do the URL encoding.
+        response = requests.get(url, headers=headers, params={'bvid': bv_id}, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+    except (requests.RequestException, ValueError) as exc:
+        # ValueError covers a body that is not valid JSON.
+        print("Error:", exc)
+        return None
+    if data.get('code') != 0:
+        print("Error:", data.get('message'))
+        return None
+    return data['data']['cid']
+
+def get_danmaku(cid):
+    """Download the danmaku (bullet comments) of one video part as text.
+
+    Fetches the XML document served by ``x/v1/dm/list.so`` for ``cid`` and
+    extracts the text content of every ``<d p="...">`` element.
+
+    Args:
+        cid: Value sent as the ``oid`` query parameter.
+
+    Returns:
+        The comment texts joined with newlines, with XML entities such as
+        ``&amp;`` decoded. An empty string is returned when no ``<d>``
+        element is found or when the request fails (the error is printed).
+    """
+    url = 'https://api.bilibili.com/x/v1/dm/list.so'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+        'Referer': 'https://www.bilibili.com/'
+    }
+    cookies = {
+        'cookie_name': 'cookie_value'  # placeholder: replace with a real cookie
+    }
+    try:
+        # A timeout keeps one stalled connection from hanging the whole crawl.
+        response = requests.get(url, headers=headers, cookies=cookies,
+                                params={'oid': cid}, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as exc:
+        print("Error:", exc)
+        return ''
+    # Force UTF-8 instead of relying on requests' charset guess.
+    response.encoding = 'utf-8'
+    raw_texts = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
+    # The payload is XML, so characters like & and < arrive entity-escaped;
+    # decode them so the saved text matches what viewers actually typed.
+    return '\n'.join(html.unescape(text) for text in raw_texts)
+
+
+
+def get_search(v_keyword, v_max_page, v_out_file, danmaku_file, max_videos=300):
+    """Search Bilibili videos by keyword and save their ids and danmaku.
+
+    For each result page the function resolves every video's cid through
+    ``get_cid`` and downloads its danmaku through ``get_danmaku``.
+
+    Args:
+        v_keyword: Search keyword; sent as the ``keyword`` query parameter
+            and percent-encoded into the Referer header.
+        v_max_page: Last page number to request (pages start at 1).
+        v_out_file: Path of the text file that receives one
+            ``mid,bvid,cid`` line per saved video. Overwritten on each run.
+        danmaku_file: Path of the text file that receives one danmaku
+            section per saved video. Overwritten on each run.
+        max_videos: Stop once this many videos have been saved
+            (default 300, the previously hard-coded limit).
+
+    Returns:
+        None. Progress and errors are printed to stdout.
+    """
+    url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
+    # Header values must be ASCII-safe, so the keyword is percent-encoded.
+    referer = (
+        'https://search.bilibili.com/all?keyword={}'
+        '&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=24'
+    ).format(quote(v_keyword))
+    # 'accept-encoding' is deliberately not set: advertising "br" makes the
+    # body undecodable when no brotli package is installed, so requests is
+    # left to announce only the encodings it can actually decode.
+    headers = {
+        'accept': 'application/json, text/plain, */*',
+        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+        'cookie': "",  # fill in your own cookie here
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
+        'referer': referer,
+        'origin': 'https://search.bilibili.com',
+        'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-site'
+    }
+    # Request parameters copied from a browser session. Only 'page' changes
+    # between requests; it is listed here so it keeps its position in the
+    # query string when overridden below.
+    # NOTE(review): 'w_rid' and 'wts' were captured from a single browser
+    # request; they look like a request signature and timestamp, so they may
+    # not be valid for other keywords or pages. Confirm against the API.
+    base_params = {
+        'category_id': '',
+        'search_type': 'video',
+        'ad_resource': '5654',
+        '__refresh__': 'true',
+        '_extra': '',
+        'context': '',
+        'page': 1,
+        'page_size': '42',
+        'pubtime_begin_s': '0',
+        'pubtime_end_s': '0',
+        'from_source': '',
+        'from_spmid': '333.337',
+        'platform': 'pc',
+        'highlight': '1',
+        'single_column': '0',
+        'keyword': v_keyword,
+        'qv_id': '1P0f9h8c7OOA9SpNbY7Rs6XaEUa80p13',
+        'source_tag': '3',
+        'gaia_vtoken': '',
+        'dynamic_offset': '24',
+        'web_location': '1430654',
+        'w_rid': 'e0021a1eb2c68a9df2fec8a5a287352e',
+        'wts': '1726311718',
+    }
+
+    with open(v_out_file, 'w', encoding='utf-8') as f, open(danmaku_file, 'w', encoding='utf-8') as df:
+        video_count = 0
+        for page in range(1, v_max_page + 1):
+            if video_count >= max_videos:
+                break
+            print('开始爬取第{}页'.format(page))
+            params = {**base_params, 'page': page}
+
+            # Send the search request; a failed page is skipped, not fatal.
+            try:
+                r = requests.get(url, headers=headers, params=params, timeout=10)
+            except requests.RequestException as exc:
+                print(f"请求失败: {exc}")
+                continue
+            print(r.status_code)
+            if r.status_code != 200:
+                print(f"请求失败，状态码: {r.status_code}")
+                continue
+
+            try:
+                j_data = r.json()
+            except ValueError:
+                # Body was not valid JSON.
+                print("响应中没有找到数据")
+                continue
+            # 'data' may be absent or null when the API reports an error.
+            payload = j_data.get('data') or {}
+            if 'result' not in payload:
+                print("响应中没有找到数据")
+                continue
+
+            data_list = payload['result']
+            print('数据长度:', len(data_list))
+
+            for data in data_list:
+                if video_count >= max_videos:
+                    break
+                mid = data['mid']
+                bvid = data['bvid']
+                cid = get_cid(bvid)
+                if cid:
+                    f.write(f'{mid},{bvid},{cid}\n')
+                    print(f'mid: {mid}, bvid: {bvid}, cid: {cid}')
+                    # Fetch the danmaku and append it to the shared danmaku file.
+                    danmaku_content = get_danmaku(cid)
+                    df.write(f'弹幕 for cid {cid}:\n{danmaku_content}\n\n')
+                    video_count += 1
+                else:
+                    print(f'Failed to get cid for bvid: {bvid}')
+
+# Run the crawl only when this file is executed as a script, so importing
+# the module for its helper functions does not start network requests.
+if __name__ == '__main__':
+    get_search('2024巴黎奥运会', 10, 'output.txt', 'all_danmaku.txt')
+