"""Crawl bullet-screen (danmaku) comments for Bilibili video search results.

The script searches Bilibili for a keyword, resolves every hit (an ``aid``)
to the ``cid`` of its first page, downloads the comment XML for that ``cid``
and appends every comment to ``弹幕.txt``.
"""
import json
import os
import re
import time
from typing import List
from urllib import parse

import requests

# One ``<d p="...">text</d>`` element per bullet-screen comment.
# NOTE(review): the pattern in the original text was "(.+?)", which matches
# every single character of the response; the surrounding tags look like they
# were stripped. This assumes the endpoint returns <d p="..."> elements -
# confirm against a live response.
_BULLET_SCREEN_PATTERN = re.compile(r'<d p="[^"]*">(.+?)</d>')


class BilibiliSpider:
    """Small HTTP client for the Bilibili endpoints used by this crawler.

    Args:
        cookie: Raw ``Cookie`` header value sent with every request.
        user_agent: ``User-Agent`` header value sent with every request.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, cookie: str, user_agent, timeout: float = 10.0):
        self.cookie = cookie
        self.user_agent = user_agent
        # requests never times out unless told to; a stalled connection would
        # otherwise block the crawl forever.
        self.timeout = timeout

    def get_search_result(self, keyword: str, page: int, page_size: int,
                          max_retries: int = 5) -> list:
        """Return the ids of the videos on one page of search results.

        Args:
            keyword: Search keyword.
            page: Page number sent as the ``page`` query parameter.
            page_size: Value sent as the ``page_size`` query parameter.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The ``id`` of every entry in ``data.result``; an empty list when
            the page has no results or every attempt failed.
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            # NOTE(review): advertising br/zstd presumably requires the
            # matching decoders to be installed for requests - confirm.
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Origin": "https://search.bilibili.com",
            "Pragma": "no-cache",
            "Priority": "u=1, i",
            "Referer": f"https://search.bilibili.com/all?vt=71519330&keyword={parse.quote(keyword)}&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=3&o=48",
            "Sec-Ch-Ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
            "User-Agent": self.user_agent,
        }

        params = {
            "category_id": "",
            "search_type": "video",
            "ad_resource": 5654,
            "__refresh__": True,
            "_extra": "",
            "context": "",
            "page": page,
            "page_size": page_size,
            "from_source": "",
            "from_spmid": "333.337",
            "platform": "pc",
            "highlight": 1,
            "single_column": 0,
            "keyword": keyword,
            "qv_id": "D9L6NRPnDle6B4EA2dJ4hfRjUOIvKeIM",
            "source_tag": 3,
            "gaia_vtoken": "",
            "dynamic_offset": 48,
            "web_location": 1430654,
            # NOTE(review): w_rid / wts look like a request signature and
            # timestamp captured from one browser session; they are constant
            # here - verify the API still accepts them.
            "w_rid": "dc50190c40844231b9ad3622eebcc62b",
            "wts": 1724771963,
        }

        url = "https://api.bilibili.com/x/web-interface/search/type"
        # Bounded retry: the original looped forever on any persistent
        # failure (non-zero code, missing key, network down).
        for attempt in range(1, max_retries + 1):
            try:
                response = requests.get(
                    url, headers=headers, params=params, timeout=self.timeout
                ).json()
                if response['code'] == 0:
                    data = response.get('data') or {}
                    return [item['id'] for item in data.get('result') or []]
                print(f"Search API returned code {response['code']} "
                      f"(attempt {attempt}/{max_retries}).")
            except (requests.RequestException, ValueError, KeyError,
                    TypeError, AttributeError) as e:
                print(e)
            time.sleep(1)

        print(f"Giving up on search page {page} after {max_retries} attempts.")
        return []

    def get_cid(self, aid: int) -> int:
        """Return the ``cid`` of the first page of the video ``aid``.

        Raises:
            ValueError: The response contains no page entries.
            Exception: The HTTP status code is not 200.
        """
        headers = {
            "Accept": "application/json, text/plain, */*",
            "User-Agent": self.user_agent,
            "Cookie": self.cookie,
        }

        # Page-list endpoint; each entry of ``data`` is expected to carry the
        # ``cid`` of one page of the video.
        response = requests.get(
            f"https://api.bilibili.com/x/player/pagelist?aid={aid}",
            headers=headers,
            timeout=self.timeout,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve CID for aid {aid}. Status code: {response.status_code}")

        data = response.json()
        # ``data`` may be missing or null in an error payload; treat that the
        # same as an empty page list instead of failing in len().
        pages = data.get('data') if isinstance(data, dict) else None
        if not pages:
            raise ValueError(f"No video found for aid {aid}.")
        return pages[0]['cid']  # cid of the first page

    def get_bullet_screen(self, aid: int) -> List[str]:
        """Download the comment XML for one video page and return its comments.

        Args:
            aid: Identifier interpolated into the comment URL. The caller in
                this file passes the ``cid`` returned by :meth:`get_cid`; the
                parameter name is kept for compatibility.

        Returns:
            The text of every bullet-screen comment found in the response.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": self.cookie,
            "Pragma": "no-cache",
            "Priority": "u=0, i",
            "Sec-CH-UA": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
            "Sec-CH-UA-Mobile": "?0",
            "Sec-CH-UA-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": self.user_agent,
        }

        url = 'https://comment.bilibili.com/' + str(aid) + '.xml'
        response = requests.get(url, headers=headers, timeout=self.timeout)
        # Force UTF-8 so the comment text is decoded consistently.
        response.encoding = 'utf-8'
        return _BULLET_SCREEN_PATTERN.findall(response.text)


def main():
    """Crawl ten pages of search results and append all comments to a file."""
    User_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 " \
                 "Safari/537.36 Core/1.94.265.400 QQBrowser/12.7.5769.400 "
    # SECURITY: this literal is a real session cookie (SESSDATA / bili_jct)
    # committed to source control. Rotate it and supply the cookie through
    # the BILIBILI_COOKIE environment variable instead; the literal is kept
    # only as a fallback so existing behaviour is unchanged.
    default_cookies = (
        "buvid3=0010B368-0E93-5612-1F55-B0AEFA2A788E68736infoc; "
        "b_nut=1722828468; "
        "_uuid=65A45AC6-10CC5-FD72-3AE5-EDD1D94C6B2A71555infoc; "
        "enable_web_push=DISABLE; "
        "home_feed_column=5; "
        "buvid4=D4A818B4-3DAF-E9E9-CCA6-2292209CA07D70717-024080503-n8yYBXNzLps6TrOphT3zww%3D%3D; "
        "header_theme_version=CLOSE; "
        "rpdid=|(J|)Rl|kRuk0J'u~kk)k)lJY; "
        "CURRENT_QUALITY=80; "
        "fingerprint=6d7a6d23f809895ad523f52c214cab31; "
        "buvid_fp_plain=undefined; "
        "b-user-id=06265419-2000-a180-a632-d8face940e87; "
        "CURRENT_BLACKGAP=0; "
        "is-2022-channel=1; "
        "buvid_fp=6d7a6d23f809895ad523f52c214cab31; "
        "bili_ticket_expires=1726804000; "
        "bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MDQwNjAsImlhdCI6MTcyNjU0NDgwMCwicGx0IjotMX0.GNq3G9bTe-3WQ8VAVc0sI4Qz9p3p7_dhcjMvGWeptHU; "
        "browser_resolution=1491-706; "
        "bp_t_offset_342699701=978044347712798720; "
        "CURRENT_FNVAL=4048; "
        "b_lsid=E10FCACF9_191FEFD995E; "
        "SESSDATA=a362140e%2C1742111957%2Cf7826%2A91CjBL7AZ1ewk0PKshkKehAyTa-FGUdcmmYfvGIZvLIvE3mrP0Lp1ZFo7Vp-Hg1cnTkFcSVkpPSlUwSkpFWWNodXFmNDdRNnFOdkdfTkpkbmpQNjlDUGkxLXpMRXZIMWpLUkVVSU1sNjM2clZVUmp1dEZDeDFmRTZJS0JObEstb1RVeV94ek91UktnIIEC; "
        "bili_jct=2627cc66d6b22d78edd09ea63d44b26e; "
        "DedeUserID=3546763816339525; "
        "DedeUserID__ckMd5=d856350aecedd530; "
        "sid=8ec23v36"
    )
    cookies = os.environ.get("BILIBILI_COOKIE") or default_cookies

    Bili = BilibiliSpider(cookies, User_Agent)

    keyword = "2024巴黎奥运会"  # search keyword
    page_size = 30    # results requested per page
    total_pages = 10  # 10 pages x 30 results = up to 300 videos
    data_list = []    # every comment fetched so far

    for page in range(1, total_pages + 1):
        print(f"Fetching search results for page {page}...")
        aids = Bili.get_search_result(keyword, page, page_size)

        for aid in aids:
            # One failing video must not abort the whole crawl.
            try:
                print(f"Fetching bullet screen for video with aid {aid}...")
                cid = Bili.get_cid(aid)
                bullet_screens = Bili.get_bullet_screen(cid)
                data_list.extend(bullet_screens)
                print(f"Fetched {len(bullet_screens)} bullet screens for aid {aid}.")
            except Exception as e:
                print(f"An error occurred while fetching data for aid {aid}: {e}")

    print(f"Total bullet screens fetched: {len(data_list)}")

    # Append everything in one pass; the file is opened once rather than once
    # per comment, and is not created when nothing was fetched.
    if data_list:
        with open("弹幕.txt", mode='a', encoding="utf-8") as f:
            f.writelines(data + '\n' for data in data_list)


if __name__ == "__main__":
    main()