diff --git a/get_danmu.py b/get_danmu.py
new file mode 100644
index 0000000..1c6d902
--- /dev/null
+++ b/get_danmu.py
@@ -0,0 +1,103 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+
+# Shared request headers: Bilibili's API rejects requests without a browser-like User-Agent.
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+}
+
+def get_danmu(urls):
+    """Fetch danmu (bullet comments) for the given video URLs and save them to an Excel file."""
+    # Extract the BV IDs from the URLs
+    bv_ids = extract_bv_ids(urls)
+
+    # Resolve each BV ID to its cid
+    cids = fetch_cids(bv_ids)
+
+    # Download the raw danmu XML
+    danmu_data = fetch_danmu_data(cids)
+
+    # Parse the danmu text out of the XML
+    all_danmu = parse_danmu(danmu_data)
+
+    # Save the results to an Excel file
+    save_danmu_to_excel(all_danmu)
+
+    return all_danmu
+
+def extract_bv_ids(urls):
+    """Extract the BV IDs from a list of video URLs."""
+    bv_ids = []
+    for url in urls:
+        parts = url.split('/')
+        bv_ids.extend(part for part in parts if part.startswith('BV'))
+    return bv_ids
+
+def fetch_cids(bv_ids):
+    """Look up the cid for each BV ID via the pagelist API."""
+    cids = []
+    for bv_id in bv_ids:
+        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp"
+        try:
+            response = requests.get(url, headers=HEADERS, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+            if data.get('code') == 0 and data.get('data'):
+                # Take the cid of the video's first page
+                cids.append(data['data'][0]['cid'])
+        except requests.RequestException as e:
+            print(f"Error fetching CID for BV {bv_id}: {e}")
+        time.sleep(0.5)  # Throttle requests to avoid hitting the API too often
+
+    print(f"CID count: {len(cids)}")
+    return cids
+
+def fetch_danmu_data(cids):
+    """Download the raw danmu XML for each cid."""
+    danmu_data = []
+    fail_count = 0
+    for cid in cids:
+        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
+        try:
+            response = requests.get(url, headers=HEADERS, timeout=10)
+            response.raise_for_status()
+            response.encoding = 'utf-8'
+            danmu_data.append(response.text)
+        except requests.RequestException as e:
+            print(f"Error fetching danmu for CID {cid}: {e}")
+            fail_count += 1
+        time.sleep(0.5)  # Throttle requests to avoid hitting the API too often
+
+    print(f"Danmu data count: {len(danmu_data)}")
+    if fail_count > 0:
+        print(f"Failed to fetch {fail_count} danmu data pages")
+
+    return danmu_data
+
+def parse_danmu(danmu_data):
+    """Parse the danmu text out of the raw XML documents."""
+    all_danmu = []
+    for xml in danmu_data:
+        # Each danmu line is the text content of a <d> element
+        soup = BeautifulSoup(xml, 'html.parser')
+        all_danmu.extend(d.get_text() for d in soup.find_all('d'))
+
+    print(f"Total danmu count: {len(all_danmu)}")
+    return all_danmu
+
+def save_danmu_to_excel(all_danmu):
+    """Save the danmu lines to an Excel file."""
+    df = pd.DataFrame({'danmu': all_danmu})
+    df.to_excel("all_danmu_data.xlsx", index=False, engine='openpyxl')
+    print("Danmu data saved to all_danmu_data.xlsx")
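+
+# A minimal usage sketch of the pipeline above. This __main__ block is an assumption,
+# not part of the original script, and the URL below is a placeholder BV link, not a real video.
+if __name__ == "__main__":
+    video_urls = [
+        "https://www.bilibili.com/video/BV1xx411c7mD",  # hypothetical placeholder URL
+    ]
+    danmu = get_danmu(video_urls)
+    print(f"Fetched {len(danmu)} danmu lines")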