|
|
|
|
@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
import html
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Search Bilibili for videos.
def _extract_video_entries(payload):
    """Return the simplified video dicts found in one search response payload.

    A payload whose ``data`` or ``result`` member is missing or null (as in
    an API error response) yields an empty list instead of raising.
    """
    entries = []
    results = (payload.get('data') or {}).get('result') or []
    for group in results:
        # The combined search mixes several result types; keep videos only.
        if group.get('result_type') != 'video':
            continue
        for video in group.get('data') or []:
            entries.append({
                'title': video['title'],
                'aid': video['aid'],
                'bvid': video['bvid'],
                'danmaku': video['danmaku'],
                'url': video['arcurl'],
            })
    return entries


def search_bilibili_videos(keyword, pages=1):
    """Search Bilibili for ``keyword`` and collect the video results.

    Args:
        keyword: Search term sent to the search endpoint.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys ``title``, ``aid``, ``bvid``,
        ``danmaku`` and ``url``, one per video found. Pages that fail
        (network error, HTTP error status, non-JSON body) are reported on
        stdout and skipped.
    """
    search_url = "https://api.bilibili.com/x/web-interface/search/all/v2"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0',
    }
    # Session cookies are credentials: read them from the environment so
    # they never live in source control. Without the variable the request
    # is sent anonymously.
    cookie = os.environ.get('BILIBILI_COOKIE')
    if cookie:
        headers['cookie'] = cookie

    videos = []
    for page in range(1, pages + 1):
        params = {
            'keyword': keyword,
            'page': page,
            'order': 'totalrank',  # overall ranking
        }
        try:
            response = requests.get(
                search_url, params=params, headers=headers, timeout=10
            )
            response.raise_for_status()
            videos.extend(_extract_video_entries(response.json()))
        except (requests.RequestException, ValueError) as exc:
            print(f"搜索第 {page} 页失败: {exc}")
        # Throttle between pages to avoid being rate-limited; no need to
        # wait once the last page has been fetched.
        if page < pages:
            time.sleep(1)
    return videos
|
|
|
|
|
|
|
|
|
|
def get_cid(aid):
    """Look up the cid of the first page (part) of the video ``aid``.

    Args:
        aid: The video's ``aid`` as returned by the search results.

    Returns:
        The ``cid`` of the first entry in the page list, or ``None`` when
        the request fails, the status is not 200, the body is not JSON, or
        the response carries no page list.
    """
    video_url = "https://api.bilibili.com/x/player/pagelist"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0'}
    try:
        response = requests.get(
            video_url,
            params={'aid': aid, 'jsonp': 'jsonp'},
            headers=headers,
            timeout=10,
        )
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        page_list = response.json().get('data')
    except ValueError:
        return None
    # An error payload has a null/absent ``data``; an empty list means the
    # video has no pages. Either way there is no cid to return.
    if not page_list:
        return None
    return page_list[0].get('cid')
|
|
|
|
|
# Fetch danmaku (bullet comments).
# Matches one ``<d p="...">text</d>`` element of the danmaku XML feed;
# compiled once here instead of on every call.
_DANMAKU_PATTERN = re.compile(r'<d p="(.*?)">(?P<danmaku>.*?)</d>', re.S)


def get_danmaku(cid):
    """Download the danmaku (bullet comments) attached to ``cid``.

    Args:
        cid: The page id obtained from the page list of a video.

    Returns:
        A list with the text of every ``<d>`` element in the response, with
        XML character entities (``&amp;``, ``&lt;``, ``&#...;``) decoded.
        An empty list is returned when the request fails or the status is
        not 200.
    """
    danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0'}

    try:
        response = requests.get(danmaku_url, headers=headers, timeout=10)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []

    # Force UTF-8 decoding of the body rather than relying on the charset
    # requests guesses from the response headers.
    response.encoding = 'utf-8'
    # The regex yields the raw element text, so entity references have to
    # be decoded explicitly to recover what the user actually typed.
    return [
        html.unescape(match.group("danmaku"))
        for match in _DANMAKU_PATTERN.finditer(response.text)
    ]
|
|
|
|
|
def process_video(video):
    """Fetch all danmaku for a single search result.

    Args:
        video: A dict providing at least the ``aid`` and ``title`` keys.

    Returns:
        The list returned by ``get_danmaku`` for the video's cid, or an
        empty list when ``get_cid`` yields no usable cid.
    """
    video_cid = get_cid(video['aid'])
    # Guard clause: without a cid there is nothing to download.
    if not video_cid:
        print(f"无法获取视频 {video['title']} 的 cid")
        return []

    comments = get_danmaku(video_cid)
    print(f"获取视频 {video['title']} 的 {len(comments)} 条弹幕")
    return comments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Main entry point.
def main():
    """Search for videos, download their danmaku concurrently and save them.

    Writes two files to the current directory: ``danmaku.txt`` (one danmaku
    per line) and ``danmaku.csv`` (a single ``danmaku`` column).
    """
    keyword = "2024巴黎奥运会"
    # Original author's note: each page returns 30 videos by default, so
    # 10 pages should give 300 videos.
    pages = 10
    videos = search_bilibili_videos(keyword, pages=pages)

    all_danmakus = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Remember which video each future belongs to so that a failure can
        # be reported against its title.
        futures = {
            executor.submit(process_video, video): video for video in videos
        }
        for future in as_completed(futures):
            try:
                all_danmakus.extend(future.result())
            except Exception as exc:
                # Top-level boundary: one failing video must not discard
                # the danmaku already collected from the others.
                print(f"处理视频 {futures[future]['title']} 时出错: {exc}")

    # Plain-text dump, one danmaku per line. The content is not an Excel
    # workbook, so it is stored with a .txt extension.
    with open('danmaku.txt', 'w', encoding='utf-8') as f:
        f.writelines(danmaku + '\n' for danmaku in all_danmakus)

    # Save as a CSV file; the BOM variant of UTF-8 is used for the encoding.
    df = pd.DataFrame({'danmaku': all_danmakus})
    df.to_csv('danmaku.csv', index=False, encoding='utf-8-sig')
    print(f"共收集到 {len(all_danmakus)} 条弹幕,保存至 danmaku.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|