import requests
import pandas as pd
from bs4 import BeautifulSoup
# Request headers, including the session cookie copied from a logged-in browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    'Cookie': (
        'FEED_LIVE_VERSION=V8; DedeUserID=510350995; DedeUserID__ckMd5=14c636895deb7c29; header_theme_version=CLOSE; '
        'buvid_fp_plain=undefined; CURRENT_FNVAL=4048; fingerprint=392849e442389a5172b1bce7ac7fa5ae; '
        'buvid4=05F09197-CE91-BA95-E588-D919D8039B9465145-023041520-/xwqHe8zHTWav6Q4ZiB1Ag%3D%3D; buvid_fp=392849e442389a5172b1bce7ac7fa5ae; '
        'enable_web_push=DISABLE; hit-dyn-v2=1; LIVE_BUVID=AUTO1717056655135780; buvid3=C6187AC2-E645-2958-DD47-99318D61580704137infoc; '
        'b_nut=1714580404; _uuid=4C67D269-FC41-2610D-36DD-2963C239D54C05142infoc; rpdid=|(u))kkYu|Yl0J\'u~u~Y)uk~m; PVID=1; '
        'SESSDATA=a7fc3017%2C1742138085%2Cc250a%2A92CjCoa3O4aJmYnJtsLjmtfYyMgBe9aSZOcY6c6GlQ_J_ys9NCJO9H_s28C2lPAlqStw0SVkRfZkVxWG5Ea3hxYlRDN1JZY3VoNlFWcUFPSHZhUWRmSzNENlBHejdUdXhyUlo1aWhpV2JFQnN1M29YM2lfc2lJMzRuSm9yaVRqc3dwMmxGTi03dHh3IIEC; '
        'bili_jct=688fa98091b9b81ec231123b7633bd7c; sid=4qz14i8c; b_lsid=2FD10D816_19205443CB5; bmg_af_switch=1; '
        'bmg_src_def_domain=i1.hdslb.com; home_feed_column=4; browser_resolution=1144-907; '
        'bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MjQ0NTEsImlhdCI6MTcyNjY2NTE5MSwicGx0IjotMX0.ybtOtqJwb8nAF5a-4s8wWqPrwsjkjP0doYMDrhUlJvM; '
        'bili_ticket_expires=1726924391; bp_t_offset_510350995=978519676038414336'
    )
}
# Search for the keyword and collect video ids from the result pages
search_url = 'https://api.bilibili.com/x/web-interface/search/all/v2'
params = {
    'keyword': '2024巴黎奥运会',
    'order': 'totalrank',
    'duration': 0,
    'tids_1': 0,
    'tids_2': 0,
    'page': 1
}
video_ids = []
# Crawl the first 5 result pages (roughly 300 videos, assuming about 60 videos per page)
for page in range(1, 6):
    params['page'] = page
    response = requests.get(search_url, headers=headers, params=params)
    data = response.json()
    for result in data['data']['result']:
        if result['result_type'] == 'video':
            for video in result['data']:
                video_ids.append(video['id'])
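# Optional de-duplication: search pages can overlap, so repeated video ids are
# dropped here (dict.fromkeys preserves the original order).
video_ids = list(dict.fromkeys(video_ids))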
# Fetch the danmaku (bullet comments) for one video.
# The danmaku endpoint expects the video's cid rather than the aid returned by
# search, so the cid of each part is looked up first via the player pagelist API.
def get_danmaku(video_id):
    pagelist_url = f'https://api.bilibili.com/x/player/pagelist?aid={video_id}'
    pages = requests.get(pagelist_url, headers=headers).json().get('data') or []
    danmakus = []
    for page in pages:
        danmaku_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={page["cid"]}'
        response = requests.get(danmaku_url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        danmakus.extend(d.text for d in soup.find_all('d'))
    return danmakus
all_danmakus = []
for video_id in video_ids:
    all_danmakus.extend(get_danmaku(video_id))
# Save the collected danmaku to an Excel file
df = pd.DataFrame(all_danmakus, columns=['danmaku'])
df.to_excel('danmakus.xlsx', index=False)
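# Optional sanity check: report how many danmaku rows were written to the file.
print(f'Saved {len(df)} danmaku to danmakus.xlsx')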