You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
82 lines
4.5 KiB
82 lines
4.5 KiB
import requests
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def fetch_comments(search_word, num):
    """Scrape danmaku (bullet comments) for Bilibili videos matching a keyword.

    Searches Bilibili for ``search_word``, resolves up to ``num`` distinct
    videos to their ``cid``, downloads the danmaku XML of each one and writes
    every comment, one per line, to ``{search_word}弹幕.txt`` in the current
    working directory.

    Args:
        search_word: Keyword to search for; it is URL-encoded before use.
        num: Maximum number of distinct videos to collect danmaku from.

    Returns:
        None. Progress and errors are printed; the comments are written to
        the output file.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote

    print("正在爬取弹幕...")

    def get_bvid(url, headers):
        """Return the bvids found on one search-result page, in page order.

        Duplicates are removed while keeping first-seen order. An empty list
        is returned when the request fails.
        """
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # do not regex-scan HTTP error pages
            response.encoding = "utf-8"
            matches = re.finditer(r'aid:.*?bvid:"(?P<bvs>.*?)",', response.text)
            # dict.fromkeys de-duplicates while preserving order.
            return list(dict.fromkeys(it.group("bvs") for it in matches))
        except requests.exceptions.RequestException as e:
            print(f"请求 bvid 失败: {e}")
            return []

    def get_cid(url, headers):
        """Return the cid of the first page of a video, or None on failure."""
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            payload = response.json()
        except requests.exceptions.RequestException as e:
            print(f"请求 cid 失败: {e}")
            return None
        except ValueError as e:
            # Older requests versions raise a plain ValueError for a body
            # that is not valid JSON.
            print(f"请求 cid 失败: {e}")
            return None

        # "data" may be missing, null or an empty list on API errors.
        pages = payload.get("data") if isinstance(payload, dict) else None
        if pages:
            return pages[0]["cid"]
        print("Error: 'data' not found in response")
        return None

    def get_comment(url, headers):
        """Return the text of every <d> element in a danmaku XML document.

        An empty list is returned when the request fails.
        """
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = "utf-8"
            response.raise_for_status()
            xml = BeautifulSoup(response.text, "xml")
            return [d.text for d in xml.find_all("d")]
        except requests.exceptions.RequestException as e:
            print(f"请求评论失败: {e}")
            return []

    # Request headers.
    # NOTE(security): this is a hard-coded logged-in session cookie
    # (SESSDATA / bili_jct). It should be loaded from configuration or the
    # environment instead of living in source control, and rotated.
    headers = {
        'Cookie': 'i-wanna-go-back=-1; LIVE_BUVID=AUTO4416565733586161; buvid_fp_plain=undefined; DedeUserID=438653318; DedeUserID__ckMd5=7c2d23c88aee6f46; hit-new-style-dyn=1; theme_style=light; enable_web_push=DISABLE; header_theme_version=CLOSE; rpdid=|(u))kkYu|JY0J\'u~|JulYmJY; b_ut=5; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=4; CURRENT_QUALITY=80; _uuid=EAB710246-F627-98E6-4529-CDFBD3522961088900infoc; buvid3=2FF036D6-2F9E-BA53-12AC-B8475DFEA7EF68465infoc; b_nut=1722680568; buvid4=04DF8495-DC85-5EB7-DB02-0141ADBD9CA652678-022063015-%2B9JvYED62Pzmtk4PYtAPcQ%3D%3D; hit-dyn-v2=1; CURRENT_FNVAL=4048; is-2022-channel=1; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY2MzI2NzksImlhdCI6MTcyNjM3MzQxOSwicGx0IjotMX0.CU-aVew4u5W9di0OPj_R_gpW_RHKEAUVY21KRU0Eqno; bili_ticket_expires=1726632619; home_feed_column=5; browser_resolution=1528-740; SESSDATA=1374e376%2C1742014673%2Cbbc95%2A91CjBNLZ0d3MNwQsDj3Nw8WKYP82oiAT74WcRbs-4z44DsU1npTEDGVRhrDiwcQ4a7KAYSVkxudHdzWmRENG5XcmtuZWNRSklrbWUyNDV4ajdzT2V5MjBTMDlaTi1YdWdpeGFOY0xNZW55UGF4NGMtNEdDNE4zV3ZSOFlna29JTTlxZEFwOFRZeDlRIIEC; bili_jct=9b8521d7e4cc60a577b9a29ce8ecc6a0; sid=5acjva0u; fingerprint=9bb0311b6cea85ad074caa19c63dee04; buvid_fp=9bb0311b6cea85ad074caa19c63dee04; b_lsid=B5102210DA_191FEFB31DC; bp_t_offset_438653318=978063108129947648',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }

    # Collect bvids for the keyword, page by page.
    # A dict is used as an ordered set: it removes duplicates but keeps the
    # search ranking, so the [:num] slice below picks the top results instead
    # of an arbitrary subset.
    keyword = quote(str(search_word), safe="")  # escape '&', '#', spaces, ...
    page = 1
    bvids = {}
    while len(bvids) < num:
        url = f'https://search.bilibili.com/video?keyword={keyword}&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}'
        known = len(bvids)
        bvids.update(dict.fromkeys(get_bvid(url, headers)))
        if len(bvids) == known:
            # Nothing new on this page: the request failed or the results
            # are exhausted. Stop instead of paging forever.
            break
        page += 1

    # Resolve each bvid to its cid, skipping videos whose lookup failed.
    cid_list = []
    for bv in list(bvids)[:num]:
        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        cid = get_cid(url, headers)
        if cid is not None:
            cid_list.append(cid)

    # Download the danmaku of every resolved video.
    comment_list = []
    for cid in cid_list:
        url = f"https://comment.bilibili.com/{cid}.xml"
        comment_list.extend(get_comment(url, headers))

    # Save to a text file, one comment per line.
    with open(f'{search_word}弹幕.txt', mode='w', encoding='utf-8') as f:
        f.writelines(f"{comment}\n" for comment in comment_list)
    print(f"成功获取{len(comment_list)}条弹幕!")