import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


def get_danmu(urls):
    # Fetch danmaku data for the given list of video URLs and save it to an Excel file.
    # Extract the BV IDs
    bv_ids = extract_bv_ids(urls)
    # Look up the cid for each BV ID
    cids = fetch_cids(bv_ids)
    # Fetch the raw danmaku data
    danmu_data = fetch_danmu_data(cids)
    # Parse the danmaku text
    all_danmu = parse_danmu(danmu_data)
    # Save to an Excel file
    save_danmu_to_excel(all_danmu)
    return all_danmu


def extract_bv_ids(urls):
    # Extract BV IDs from the URL list.
    bv_ids = []
    for url in urls:
        parts = url.split('/')
        # Strip any trailing query string (e.g. "?p=1") so only the bare BV ID is kept.
        bv_ids.extend(part.split('?')[0] for part in parts if part.startswith('BV'))
    return bv_ids


def fetch_cids(bv_ids):
    # Resolve each BV ID to its first cid via the pagelist API.
    cids = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    for bv_id in bv_ids:
        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp"
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            data = response.json()
            if data.get('code') == 0 and data.get('data'):
                cids.append(data['data'][0]['cid'])
        except requests.RequestException as e:
            print(f"Error fetching CID for BV {bv_id}: {e}")
        time.sleep(0.5)  # Throttle so requests are not sent too frequently
    print(f"CID count: {len(cids)}")
    return cids


def fetch_danmu_data(cids):
    # Fetch the raw danmaku XML for each cid.
    danmu_data = []
    fail_count = 0
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    for cid in cids:
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            danmu_data.append(response.text)
        except requests.RequestException as e:
            print(f"Error fetching danmu for CID {cid}: {e}")
            fail_count += 1
        time.sleep(0.5)  # Throttle so requests are not sent too frequently
    print(f"Danmu data count: {len(danmu_data)}")
    if fail_count > 0:
        print(f"Failed to fetch {fail_count} danmu data pages")
    return danmu_data


def parse_danmu(danmu_data):
    # Parse the danmaku text out of each XML document.
    # html.parser is sufficient here: each danmaku line lives in a simple <d> tag.
    all_danmu = []
    for xml_text in danmu_data:
        soup = BeautifulSoup(xml_text, 'html.parser')
        all_danmu.extend(d.get_text() for d in soup.find_all('d'))
    print(f"Total danmu count: {len(all_danmu)}")
    return all_danmu


def save_danmu_to_excel(all_danmu):
    # Save the danmaku data to an Excel file.
    df = pd.DataFrame({'danmu': all_danmu})
    df.to_excel("all_danmu_data.xlsx", index=False, engine='openpyxl')
    print("Danmu data saved to all_danmu_data.xlsx")
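

# --- Usage sketch (not part of the original script) ---
# A minimal example of driving get_danmu() end to end. The URL below is a
# hypothetical placeholder: "BVxxxxxxxxxx" is not a real BV ID, so swap in
# links to actual bilibili videos before running.
if __name__ == "__main__":
    example_urls = [
        "https://www.bilibili.com/video/BVxxxxxxxxxx",  # placeholder BV ID
    ]
    danmu = get_danmu(example_urls)
    print(f"Fetched {len(danmu)} danmu in total")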