ADD file via upload

2 years ago · 1bc34bff47
parent 58db263363
commit 1bc34bff47
1 changed files with 99 additions and 0 deletions
--- a/爬取前300条视频弹幕.py
+++ b/爬取前300条视频弹幕.py
@ -0,0 +1,99 @@
+import requests
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Running tally of cids collected so far; get_cid() updates it via `global count`.
+count = 0
+
+
+def get_page_url(n):
+    """
+    Build the search-result page URLs for the first ``n`` pages.
+
+    Page 1 is the bare search URL; each later page appends ``page`` (the
+    1-based page number) and ``o`` (36 times the zero-based page index).
+    Returns a list of ``n`` URL strings in page order.
+    """
+    base_url = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
+    return [
+        base_url if index == 0 else f"{base_url}&page={index + 1}&o={index * 36}"
+        for index in range(n)
+    ]
+
+
+# Request headers passed to every requests.get() call in this script:
+# a desktop Chrome/Edge user-agent string.
+header = {
+    "user-agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+    ),
+}
+
+
+def fetch_page(url, timeout=10):
+    """
+    Download ``url`` and return the response body decoded as UTF-8.
+
+    Args:
+        url: Address to request; the module-level ``header`` is sent with it.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout, ``requests`` may block indefinitely on a stalled
+            connection and tie up a worker thread.
+
+    Returns:
+        The response text, or ``""`` when the request fails or the server
+        answers with an HTTP error status (the error is printed).
+    """
+    try:
+        response = requests.get(url=url, headers=header, timeout=timeout)
+        # Treat 4xx/5xx answers as failures instead of parsing an error page.
+        response.raise_for_status()
+        response.encoding = 'utf-8'
+        return response.text
+    except requests.RequestException as e:
+        print(f"请求失败: {e}")
+        return ""
+
+
+def get_cid(page_url_list, limit=300):
+    """
+    Collect video cids from the given search-result pages.
+
+    The pages are downloaded concurrently (5 worker threads). Every BV id
+    linked from a page is looked up through the pagelist API, and the first
+    cid found in that response is kept.
+
+    Args:
+        page_url_list: Search-result page URLs, e.g. from ``get_page_url``.
+        limit: Maximum number of cids to collect (default 300).
+
+    Returns:
+        A list of cid strings, at most ``limit`` long. The module-level
+        ``count`` is left equal to the number of cids collected by this call.
+    """
+    global count
+    # Restart the tally so a repeated call is not cut short by the total
+    # left over from an earlier one.
+    count = 0
+    cid_list = []
+    # BV ids already looked up; the same video can appear on several pages.
+    seen_bvids = set()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(fetch_page, url) for url in page_url_list]
+        for future in as_completed(futures):
+            try:
+                data = future.result()
+                for bvid in re.findall('"//www.bilibili.com/video/(.*?)/"', data):
+                    if count >= limit:
+                        break
+                    if bvid in seen_bvids:
+                        continue
+                    seen_bvids.add(bvid)
+                    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
+                    response = fetch_page(url)
+                    cids = re.findall('{"cid":(.*?),', response)
+                    if cids:
+                        cid_list.append(cids[0])
+                        count += 1
+                        print(f"已获取到 {count} 个cid")
+            except Exception as e:
+                # Best effort: one bad page must not abort the whole crawl.
+                print(f"处理失败: {e}")
+            if count >= limit:
+                # Drop page downloads that have not started yet; otherwise
+                # leaving the `with` block would still wait for them.
+                for pending in futures:
+                    pending.cancel()
+                break
+    return cid_list
+
+
+def get_danmu(cid_list, timeout=10):
+    """
+    Download the danmaku (on-screen comments) for every cid in ``cid_list``.
+
+    Each cid's XML comment file is fetched concurrently (5 worker threads)
+    and the text of every ``<d p="...">`` element is collected.
+
+    Args:
+        cid_list: Iterable of cids, e.g. the list returned by ``get_cid``.
+        timeout: Seconds to wait for each download before giving up, so a
+            stalled connection cannot block a worker thread forever.
+
+    Returns:
+        A flat list of comment strings. Results are appended in the order
+        the downloads finish, not in ``cid_list`` order. A cid whose
+        download fails contributes nothing (the error is printed).
+    """
+    # Local import keeps this block self-contained.
+    from html import unescape
+
+    def fetch_danmu(cid):
+        try:
+            url = f"https://comment.bilibili.com/{cid}.xml"
+            response = requests.get(url=url, headers=header, timeout=timeout)
+            response.encoding = 'utf-8'
+            data = response.text
+            # The text is cut out of raw XML, so entities such as "&amp;"
+            # and "&lt;" have to be decoded back to the original characters.
+            return [unescape(text) for text in re.findall('<d p=".*?">(.*?)</d>', data)]
+        except Exception as e:
+            print(f"请求失败: {e}")
+            return []
+
+    danmu_list = []
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(fetch_danmu, cid) for cid in cid_list]
+        for future in as_completed(futures):
+            try:
+                danmu_list.extend(future.result())
+                print(f"已获取到 {len(danmu_list)} 条弹幕")
+            except Exception as e:
+                print(f"处理失败: {e}")
+    return danmu_list
+
+
+# Script body: gather cids from the first 10 search pages, download their
+# danmaku, then write one comment per line to '弹幕.txt'.
+cid_list = get_cid(get_page_url(10))
+print("开始获取弹幕数据...")
+danmu_list = get_danmu(cid_list)
+print("弹幕数据爬取完成。")
+with open('弹幕.txt', mode='w', encoding='utf-8') as f:
+    f.writelines(danmu + '\n' for danmu in danmu_list)
+
+print("弹幕已保存到 '弹幕.txt'")