From 68973bc7e481dca58d1776f0c1e9058249c95d7f Mon Sep 17 00:00:00 2001
From: pjmw9izve <2308014474@qq.com>
Date: Tue, 17 Sep 2024 12:42:45 +0800
Subject: [PATCH] ADD file via upload

---
 get_danmu.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 get_danmu.py

diff --git a/get_danmu.py b/get_danmu.py
new file mode 100644
index 0000000..1c6d902
--- /dev/null
+++ b/get_danmu.py
@@ -0,0 +1,100 @@
+"""Scrape danmu (bullet comments) from Bilibili videos and save them to Excel."""
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+
+def get_danmu(urls):
+    """Fetch danmu for a list of Bilibili video URLs and save them to an Excel file."""
+    # Extract the BV ids from the URLs
+    bv_ids = extract_bv_ids(urls)
+
+    # Resolve each BV id to a cid
+    cids = fetch_cids(bv_ids)
+
+    # Download the raw danmu XML for each cid
+    danmu_data = fetch_danmu_data(cids)
+
+    # Parse the comment text out of the XML
+    all_danmu = parse_danmu(danmu_data)
+
+    # Save everything to an Excel file
+    save_danmu_to_excel(all_danmu)
+
+    return all_danmu
+
+def extract_bv_ids(urls):
+    """Extract the BV ids from a list of video URLs."""
+    bv_ids = []
+    for url in urls:
+        # a BV id appears as its own path segment, e.g. /video/BV.../
+        parts = url.split('/')
+        bv_ids.extend(part for part in parts if part.startswith('BV'))
+    return bv_ids
+
+def fetch_cids(bv_ids):
+    """Look up the cid for each BV id via the pagelist API."""
+    cids = []
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+    }
+
+    for bv_id in bv_ids:
+        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp"
+        try:
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+            # keep the cid of the first page of the video only
+            if data.get('code') == 0 and data.get('data'):
+                cids.append(data['data'][0]['cid'])
+        except requests.RequestException as e:
+            print(f"Error fetching CID for BV {bv_id}: {e}")
+        time.sleep(0.5)  # throttle to avoid hitting the API too often
+
+    print(f"CID count: {len(cids)}")
+    return cids
+
+def fetch_danmu_data(cids):
+    """Download the raw danmu XML for each cid."""
+    danmu_data = []
+    fail_count = 0
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+    }
+
+    for cid in cids:
+        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
+        try:
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            # the endpoint returns XML; force UTF-8 before reading .text
+            response.encoding = 'utf-8'
+            danmu_data.append(response.text)
+        except requests.RequestException as e:
+            print(f"Error fetching danmu for CID {cid}: {e}")
+            fail_count += 1
+        time.sleep(0.5)  # throttle to avoid hitting the API too often
+
+    print(f"Danmu data count: {len(danmu_data)}")
+    if fail_count > 0:
+        print(f"Failed to fetch {fail_count} danmu data pages")
+
+    return danmu_data
+
+def parse_danmu(danmu_data):
+    """Extract the comment text from each downloaded danmu XML document."""
+    all_danmu = []
+    for xml in danmu_data:
+        # each <d> element holds the text of one comment
+        soup = BeautifulSoup(xml, 'html.parser')
+        all_danmu.extend(d.get_text() for d in soup.find_all('d'))
+
+    print(f"Total danmu count: {len(all_danmu)}")
+    return all_danmu
+
+def save_danmu_to_excel(all_danmu):
+    """Save the collected danmu to an Excel file."""
+    df = pd.DataFrame({'danmu': all_danmu})
+    df.to_excel("all_danmu_data.xlsx", index=False, engine='openpyxl')
+    print("Danmu data saved to all_danmu_data.xlsx")
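
For reference, parse_danmu assumes the classic danmu XML layout served by the
list.so endpoint: a root element containing one <d> element per comment, whose
p attribute carries timing and style metadata and whose text is the comment
itself. A minimal sketch of that layout (attribute values are placeholders;
the script only reads the element text, never the p attribute):

    <?xml version="1.0" encoding="UTF-8"?>
    <i>
        <d p="12.34,1,25,16777215,...">first example comment</d>
        <d p="56.78,1,25,16777215,...">second example comment</d>
    </i>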
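
A minimal usage sketch, assuming the script is saved as get_danmu.py and the
URL below is a placeholder for a real Bilibili video link:

    from get_danmu import get_danmu

    urls = [
        "https://www.bilibili.com/video/BVxxxxxxxxxx",  # placeholder BV id
    ]
    danmu = get_danmu(urls)  # also writes all_danmu_data.xlsx as a side effect
    print(danmu[:5])         # preview the first few comments

Note that fetch_cids keeps only data['data'][0]['cid'], so for multi-part
videos only the first page's danmu are collected.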