parent
c1845db3e5
commit
68973bc7e4
@ -0,0 +1,100 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
import time
|
||||||
|
|
||||||
|
def get_danmu(urls):
    """Run the full danmu pipeline for a list of video URLs.

    The URLs are turned into BV ids, the BV ids into cids, the cids into
    raw danmu payloads, and the payloads into a flat list of danmu texts.
    That list is handed to ``save_danmu_to_excel`` and then returned.

    Args:
        urls: Video URLs to process.

    Returns:
        The list produced by ``parse_danmu`` for all fetched payloads.
    """
    # URL -> BV id -> cid
    video_ids = extract_bv_ids(urls)
    chat_ids = fetch_cids(video_ids)

    # cid -> raw payload -> danmu text
    raw_payloads = fetch_danmu_data(chat_ids)
    danmu_texts = parse_danmu(raw_payloads)

    # Persist before handing the result back to the caller.
    save_danmu_to_excel(danmu_texts)
    return danmu_texts
|
||||||
|
|
||||||
|
def extract_bv_ids(urls):
    """Extract BV ids from a list of video URLs.

    Every ``/``-separated path segment that starts with ``BV`` is collected,
    in the order encountered. Surrounding whitespace, the query string
    (``?...``) and the fragment (``#...``) are removed first, so a URL such
    as ``.../video/BV1xx411c7mD?p=2`` yields ``BV1xx411c7mD`` rather than
    ``BV1xx411c7mD?p=2``.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list of BV id strings; empty if no segment starts with ``BV``.
    """
    bv_ids = []
    for url in urls:
        # Keep only the path part: drop whitespace/newlines, then anything
        # from the first '?' or '#' onwards.
        path = url.strip().split('?', 1)[0].split('#', 1)[0]
        bv_ids.extend(part for part in path.split('/') if part.startswith('BV'))
    return bv_ids
|
||||||
|
|
||||||
|
def fetch_cids(bv_ids, *, timeout=10, delay=0.5):
    """Look up the cid for each BV id via the pagelist API.

    For every BV id one GET request is sent to
    ``https://api.bilibili.com/x/player/pagelist``. When the response has
    ``code == 0`` and a non-empty ``data`` list, the ``cid`` of the first
    entry is collected. Any BV id whose request fails or whose response
    does not have the expected shape is reported on stdout and skipped, so
    one bad id never aborts the whole run.

    Args:
        bv_ids: Iterable of BV id strings.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep after each request to avoid hammering the API.

    Returns:
        A list of the collected cids (possibly shorter than ``bv_ids``).
    """
    cids = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }

    for bv_id in bv_ids:
        try:
            # Pass the id through ``params`` so it is URL-encoded properly.
            response = requests.get(
                "https://api.bilibili.com/x/player/pagelist",
                params={"bvid": bv_id, "jsonp": "jsonp"},
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
            pages = data.get('data') if data.get('code') == 0 else None
            if pages:
                cids.append(pages[0]['cid'])
            else:
                print(f"No CID found for BV {bv_id}")
        except requests.RequestException as e:
            print(f"Error fetching CID for BV {bv_id}: {e}")
        except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
            # Body was not JSON, or the JSON did not have the expected layout.
            print(f"Unexpected CID response for BV {bv_id}: {e}")
        time.sleep(delay)  # throttle consecutive requests

    print(f"CID count: {len(cids)}")
    return cids
|
||||||
|
|
||||||
|
def fetch_danmu_data(cids, *, timeout=10, delay=0.5):
    """Download the raw danmu payload for each cid.

    For every cid one GET request is sent to
    ``https://api.bilibili.com/x/v1/dm/list.so`` with ``oid`` set to the
    cid. The response is decoded as UTF-8 and its text appended to the
    result. Failed requests are reported on stdout, counted, and skipped.

    Args:
        cids: Iterable of cid values.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep after each request to avoid hammering the API.

    Returns:
        A list of response bodies (strings), one per successful request.
    """
    danmu_data = []
    fail_count = 0
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }

    for cid in cids:
        try:
            response = requests.get(
                "https://api.bilibili.com/x/v1/dm/list.so",
                params={"oid": cid},
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            # Force UTF-8 so ``response.text`` is not decoded with a guessed charset.
            response.encoding = 'utf-8'
            danmu_data.append(response.text)
        except requests.RequestException as e:
            print(f"Error fetching danmu for CID {cid}: {e}")
            fail_count += 1
        time.sleep(delay)  # throttle consecutive requests

    print(f"Danmu data count: {len(danmu_data)}")
    if fail_count > 0:
        print(f"Failed to fetch {fail_count} danmu data pages")

    return danmu_data
|
||||||
|
|
||||||
|
def parse_danmu(danmu_data):
    """Collect the text of every ``<d>`` element in the given documents.

    Each entry of ``danmu_data`` is parsed with BeautifulSoup's
    ``html.parser`` and the text of all ``d`` tags is gathered in document
    order. The total count is printed before returning.

    Args:
        danmu_data: Iterable of markup strings.

    Returns:
        A flat list with the text of every ``d`` tag found.
    """
    texts = []
    for document in danmu_data:
        parsed = BeautifulSoup(document, 'html.parser')
        for node in parsed.find_all('d'):
            texts.append(node.get_text())

    print(f"Total danmu count: {len(texts)}")
    return texts
|
||||||
|
|
||||||
|
def save_danmu_to_excel(all_danmu, filename="all_danmu_data.xlsx"):
    """Write the danmu list to an Excel workbook with a single ``danmu`` column.

    String entries have ASCII control characters (other than tab, line feed
    and carriage return) removed before writing, because openpyxl rejects
    cells containing them and the whole export would otherwise fail.
    Non-string entries are written unchanged.

    Args:
        all_danmu: Iterable of danmu values, one per row.
        filename: Path of the ``.xlsx`` file to create or overwrite.
    """
    # Code points 0-8, 11-12 and 14-31: the characters openpyxl refuses.
    illegal_chars = dict.fromkeys(
        code for code in range(32) if code not in (9, 10, 13)
    )
    cleaned = [
        text.translate(illegal_chars) if isinstance(text, str) else text
        for text in all_danmu
    ]

    df = pd.DataFrame({'danmu': cleaned})
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"Danmu data saved to {filename}")
|
Loading…
Reference in new issue