You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

167 lines
7.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import time
from typing import List
import requests
import json
from urllib import parse
import re
class BilibiliSpider:
    """Small HTTP client for Bilibili search results, page lists and bullet comments.

    Every request is sent with the cookie and user agent supplied at
    construction time.
    """

    # Seconds to wait on any single HTTP request; without a timeout
    # ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Maximum number of attempts for one search page before giving up.
    MAX_SEARCH_RETRIES = 5
    # Captures the text content of every ``<d p="...">text</d>`` element.
    _BULLET_PATTERN = re.compile("<d p=.+?>(.+?)</d>")

    def __init__(self, cookie: str, user_agent):
        """Store the raw ``Cookie`` header value and the ``User-Agent`` string."""
        self.cookie = cookie
        self.user_agent = user_agent

    def get_search_result(self, keyword: str, page: int, page_size: int) -> list:
        """Return the ``id`` of every video on one page of search results.

        Args:
            keyword: Search phrase.
            page: Page number sent as the ``page`` query parameter.
            page_size: Value sent as the ``page_size`` query parameter.

        Returns:
            The ``id`` field of each result item. An empty list is returned
            when the page has no results or when every attempt failed
            (``MAX_SEARCH_RETRIES`` attempts, one second apart).
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            # "br" and "zstd" are deliberately not advertised: requests can
            # only decode them when optional packages are installed, and an
            # undecoded body makes ``.json()`` fail.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Origin": "https://search.bilibili.com",
            "Pragma": "no-cache",
            "Priority": "u=1, i",
            "Referer": f"https://search.bilibili.com/all?vt=71519330&keyword={parse.quote(keyword)}&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=3&o=48",
            "Sec-Ch-Ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
            "User-Agent": self.user_agent,
        }
        # NOTE(review): qv_id, w_rid and wts are hard-coded and look like
        # values captured from one browser request; presumably they go stale
        # -- confirm whether they must be regenerated per request.
        params = {
            "category_id": "",
            "search_type": "video",
            "ad_resource": 5654,
            "__refresh__": True,
            "_extra": "",
            "context": "",
            "page": page,
            "page_size": page_size,
            "from_source": "",
            "from_spmid": "333.337",
            "platform": "pc",
            "highlight": 1,
            "single_column": 0,
            "keyword": keyword,
            "qv_id": "D9L6NRPnDle6B4EA2dJ4hfRjUOIvKeIM",
            "source_tag": 3,
            "gaia_vtoken": "",
            "dynamic_offset": 48,
            "web_location": 1430654,
            "w_rid": "dc50190c40844231b9ad3622eebcc62b",
            "wts": 1724771963,
        }
        url = "https://api.bilibili.com/x/web-interface/search/type"

        for attempt in range(1, self.MAX_SEARCH_RETRIES + 1):
            try:
                payload = requests.get(
                    url,
                    headers=headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                ).json()
                if payload['code'] == 0:
                    return self._extract_aids(payload)
                print(
                    f"Search API returned code {payload['code']} "
                    f"(attempt {attempt}/{self.MAX_SEARCH_RETRIES})"
                )
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # Network failure, non-JSON body, or unexpected payload shape.
                print(e)
            # Back off before the next attempt, but not after the last one.
            if attempt < self.MAX_SEARCH_RETRIES:
                time.sleep(1)

        # Bounded retries: give up instead of looping forever.
        return []

    @staticmethod
    def _extract_aids(payload: dict) -> list:
        """Return the ``id`` of each item under ``payload['data']['result']``.

        A missing or empty ``data`` / ``result`` entry yields an empty list.
        """
        data = payload.get('data') or {}
        return [item['id'] for item in data.get('result') or []]

    def get_cid(self, aid: int) -> int:
        """Return the ``cid`` of the first page of the video identified by ``aid``.

        Raises:
            ValueError: The response contains no page entries.
            Exception: The HTTP status code is not 200.
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            "User-Agent": self.user_agent,
            "Cookie": self.cookie,
        }
        # Presumably this endpoint lists the pages (parts) of a video.
        response = requests.get(
            f"https://api.bilibili.com/x/player/pagelist?aid={aid}",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve CID for aid {aid}. Status code: {response.status_code}")
        return self._first_cid(response.json(), aid)

    @staticmethod
    def _first_cid(payload, aid):
        """Return ``payload['data'][0]['cid']`` or raise ``ValueError`` if there is no page."""
        pages = payload.get('data') if isinstance(payload, dict) else None
        # Covers a missing key, an explicit null and an empty list alike.
        if not pages:
            raise ValueError(f"No video found for aid {aid}.")
        return pages[0]['cid']

    def get_bullet_screen(self, aid: int) -> List[str]:
        """Download a bullet-comment XML document and return the comment texts.

        Args:
            aid: Identifier placed in the ``comment.bilibili.com/<id>.xml``
                URL. Despite the parameter name, the caller in this file
                passes the ``cid`` returned by ``get_cid``.

        Returns:
            The text of every ``<d p=...>`` element, in document order.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            # See get_search_result: only encodings requests always decodes.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Pragma": "no-cache",
            "Priority": "u=0, i",
            "Sec-CH-UA": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-CH-UA-Mobile": "?0",
            "Sec-CH-UA-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": self.user_agent,
        }
        url = 'https://comment.bilibili.com/' + str(aid) + '.xml'
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Force UTF-8 rather than relying on the encoding requests guesses.
        response.encoding = 'utf-8'
        return self._parse_bullet_screen(response.text)

    @classmethod
    def _parse_bullet_screen(cls, xml_text: str) -> List[str]:
        """Return the text of every ``<d p=...>`` element found in ``xml_text``."""
        return cls._BULLET_PATTERN.findall(xml_text)
# print(Bili.get_search_result("巴黎奥运会", 1, 30))
#
# Bili2 = BilibiliSpider(cookies, User_Agent)
# a=1856498793
# print(Bili2.get_cid(int(a)))
# data_list=Bili2.get_bullet_screen(1646728264)
def main():
    """Crawl bullet comments for a fixed search keyword and append them to a file.

    Walks ``total_pages`` pages of search results, resolves each video's
    ``cid``, downloads its bullet comments, and finally appends every
    collected comment (one per line) to ``弹幕.txt``. A failure on a single
    video is printed and skipped so the crawl continues.
    """
    # NOTE(review): the cookie below embeds what look like live session
    # credentials (SESSDATA, bili_jct). Consider loading it from the
    # environment or a config file and rotating the exposed session.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 "
        "Safari/537.36 Core/1.94.265.400 QQBrowser/12.7.5769.400 "
    )
    cookies = "buvid3=0010B368-0E93-5612-1F55-B0AEFA2A788E68736infoc; b_nut=1722828468; _uuid=65A45AC6-10CC5-FD72-3AE5-EDD1D94C6B2A71555infoc; enable_web_push=DISABLE; home_feed_column=5; buvid4=D4A818B4-3DAF-E9E9-CCA6-2292209CA07D70717-024080503-n8yYBXNzLps6TrOphT3zww%3D%3D; header_theme_version=CLOSE; rpdid=|(J|)Rl|kRuk0J'u~kk)k)lJY; CURRENT_QUALITY=80; fingerprint=6d7a6d23f809895ad523f52c214cab31; buvid_fp_plain=undefined; b-user-id=06265419-2000-a180-a632-d8face940e87; CURRENT_BLACKGAP=0; is-2022-channel=1; buvid_fp=6d7a6d23f809895ad523f52c214cab31; bili_ticket_expires=1726804000; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MDQwNjAsImlhdCI6MTcyNjU0NDgwMCwicGx0IjotMX0.GNq3G9bTe-3WQ8VAVc0sI4Qz9p3p7_dhcjMvGWeptHU; browser_resolution=1491-706; bp_t_offset_342699701=978044347712798720; CURRENT_FNVAL=4048; b_lsid=E10FCACF9_191FEFD995E; SESSDATA=a362140e%2C1742111957%2Cf7826%2A91CjBL7AZ1ewk0PKshkKehAyTa-FGUdcmmYfvGIZvLIvE3mrP0Lp1ZFo7Vp-Hg1cnTkFcSVkpPSlUwSkpFWWNodXFmNDdRNnFOdkdfTkpkbmpQNjlDUGkxLXpMRXZIMWpLUkVVSU1sNjM2clZVUmp1dEZDeDFmRTZJS0JObEstb1RVeV94ek91UktnIIEC; bili_jct=2627cc66d6b22d78edd09ea63d44b26e; DedeUserID=3546763816339525; DedeUserID__ckMd5=d856350aecedd530; sid=8ec23v36"
    spider = BilibiliSpider(cookies, user_agent)

    keyword = "2024巴黎奥运会"  # search phrase
    page_size = 30  # results requested per page
    total_pages = 10  # 10 pages x 30 results = up to 300 videos
    data_list = []  # every bullet comment collected so far

    for page in range(1, total_pages + 1):
        print(f"Fetching search results for page {page}...")
        aids = spider.get_search_result(keyword, page, page_size)
        for aid in aids:
            try:
                print(f"Fetching bullet screen for video with aid {aid}...")
                # The comment endpoint is keyed by cid, so resolve it first.
                cid = spider.get_cid(aid)
                bullet_screens = spider.get_bullet_screen(cid)
                data_list.extend(bullet_screens)
                print(f"Fetched {len(bullet_screens)} bullet screens for aid {aid}.")
            except Exception as e:
                # Best effort: report the failure and move on to the next video.
                print(f"An error occurred while fetching data for aid {aid}: {e}")

    print(f"Total bullet screens fetched: {len(data_list)}")

    # Open the output file once (append mode, as before) instead of once per
    # comment; skip it entirely when nothing was fetched so no empty file is
    # created.
    if data_list:
        with open("弹幕.txt", mode='a', encoding="utf-8") as f:
            f.writelines(data + '\n' for data in data_list)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()