import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Shared request headers: Bilibili's API expects a browser-like User-Agent.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}


def get_danmu(urls):
    """Scrape danmu (bullet comments) for a list of Bilibili video URLs.

    For each URL the BV id is extracted from the path, resolved to the
    first page's cid via the pagelist API, and the danmu XML is fetched
    and parsed. All comment texts are written to ``all_danmu_data.xlsx``
    and returned as a flat list.

    Parameters
    ----------
    urls : iterable of str
        Bilibili video URLs containing a ``BV...`` path segment.

    Returns
    -------
    list of str
        Every danmu text found across all videos.
    """
    # Extract the BV id from each URL's path segments.
    bv_ids = [part for url in urls
              for part in url.split('/')
              if part.startswith('BV')]

    # Resolve each BV id to the cid of the video's first page.
    cids = []
    for bvid in bv_ids:
        api = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
        response = requests.get(api, headers=_HEADERS)
        time.sleep(0.5)  # throttle to avoid triggering rate limiting
        if response.status_code == 200:
            data = response.json()
            if data['code'] == 0 and data['data']:
                cids.append(data['data'][0]['cid'])
    print("cid", len(cids))

    # Fetch the raw danmu XML for each cid.
    danmu_data = []
    fail_count = 0
    for cid in cids:
        api = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        response = requests.get(api, headers=_HEADERS)
        time.sleep(0.5)
        if response.status_code != 200:
            # BUG FIX: the original counted the failure but still appended
            # the error body to danmu_data; skip failed responses entirely.
            fail_count += 1
            continue
        response.encoding = 'utf-8'
        danmu_data.append(response.text)
    print("danmu_data_html", len(danmu_data))
    print("fail_count", fail_count)

    # Each <d> element in the danmu XML holds one comment's text.
    all_danmu = []
    for html in danmu_data:
        soup = BeautifulSoup(html, 'html.parser')
        for d in soup.find_all('d'):
            all_danmu.append(d.get_text())
    print("all_danmu", len(all_danmu))

    df = pd.DataFrame({'danmu': all_danmu})
    df.to_excel("all_danmu_data.xlsx", index=False, engine='openpyxl')
    return all_danmu