diff --git a/danmu.py b/danmu.py
new file mode 100644
index 0000000..a6b27d5
--- /dev/null
+++ b/danmu.py
@@ -0,0 +1,105 @@
+import time
+from typing import List
+import requests
+import re
+from urllib import parse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+class BilibiliVideoSpider:
+    def __init__(self, session_cookie: str, user_agent: str):
+        self.session_cookie = session_cookie
+        self.user_agent = user_agent
+
+    def search_videos(self, keyword: str, page: int, page_size: int) -> List[int]:  # return the aids of one page of search results
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Encoding": "gzip, deflate, br, zstd",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Cookie": self.session_cookie,
+            "Origin": "https://search.bilibili.com",
+            "Pragma": "no-cache",
+            "Referer": f"https://search.bilibili.com/all?keyword={parse.quote(keyword)}",
+            "User-Agent": self.user_agent,
+        }
+
+        params = {
+            "search_type": "video",
+            "page": page,
+            "page_size": page_size,
+            "keyword": keyword,
+        }
+
+        while True:  # retry until the search API responds with code 0
+            try:
+                response = requests.get("https://api.bilibili.com/x/web-interface/search/type", headers=headers, params=params).json()
+                if response['code'] == 0:
+                    return [item['id'] for item in response['data']['result']]
+            except Exception as error:
+                print(error)
+            time.sleep(1)
+
+    def retrieve_cid(self, aid: int) -> int:  # resolve a video's aid to the cid of its first page
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "User-Agent": self.user_agent,
+            "Cookie": self.session_cookie,
+        }
+
+        response = requests.get(f"https://api.bilibili.com/x/player/pagelist?aid={aid}", headers=headers)
+        if response.status_code == 200:
+            data = response.json()
+            if data and 'data' in data and len(data['data']) > 0:
+                return data['data'][0]['cid']
+        raise ValueError(f"No video found for aid {aid}.")
+
+    def fetch_danmaku(self, cid: int) -> List[str]:  # download the danmaku XML for a cid
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+            "Cookie": self.session_cookie,
+            "User-Agent": self.user_agent
+        }
+
+        response = requests.get(f'https://comment.bilibili.com/{cid}.xml', headers=headers)
+        response.encoding = 'utf-8'
+        return re.findall(r"<d.*?>(.+?)</d>", response.text)  # extract the text of each <d> element
+
+def fetch_bullet_screen(spider: BilibiliVideoSpider, aid: int) -> List[str]:
+    try:
+        print(f"Fetching bullet screen for video with aid {aid}...")
+        cid = spider.retrieve_cid(aid)
+        return spider.fetch_danmaku(cid)
+    except Exception as error:
+        print(f"Error fetching data for aid {aid}: {error}")
+        return []
+
+def main():
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+    session_cookie = "CURRENT_BLACKGAP=0; buvid_fp_plain=undefined; buvid4=AD99F657-25EE-E722-504A-38818AB2C96431271-022083012-SK3hbof5R8m0%2FMDXrqLXF0ew7%2BFE4Qf8ZlKJTIBem2GNFTrtkIOz1g%3D%3D; hit-dyn-v2=1; DedeUserID=379056927; DedeUserID__ckMd5=f108d9af1bf79bfa; enable_web_push=DISABLE; header_theme_version=CLOSE; rpdid=|(u)luk))YJY0J'u~|JJk)m~|; LIVE_BUVID=AUTO7417034221177410; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; fingerprint=15814142e80dfa9c068eed7a71851bf5; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; buvid3=1ABDB66E-4FED-982B-9E33-0B8252C286A183741infoc; b_nut=1726320983; _uuid=993A102D3-51024-9AFA-A267-911A4D7E1051590626infoc; buvid_fp=15814142e80dfa9c068eed7a71851bf5; SESSDATA=73515621%2C1742135177%2C0e2a5%2A91CjD4Y5RhCOEVNl9wRlHMuu46raFGvX_PIZISKjPfA6kidgWNqhp7ORMi42EVo7IHscoSVmdsTlNiTDltTzFnSEtSam54WnpjVnZvUzdHTGNnd1pUNGZTb1pvYTlmSjdTby0wTVlCMFFvU1lpVGxkd1owTktpY1NDVWlpUTV2b1lmNUJfenpGa0RnIIEC; bili_jct=53fef919101982fb820518642f8ea298; sid=7s0heoa0; b_lsid=5A10234D8_19204359DD8; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MDcxODYsImlhdCI6MTcyNjY0NzkyNiwicGx0IjotMX0.1EN8gxd6-3GfZQ3b4b-Iz-4vmICUJHaoAoKpMbLlRtg; bili_ticket_expires=1726907126; bp_t_offset_379056927=978437006507900928; home_feed_column=4; browser_resolution=1065-941"
+
+    spider = BilibiliVideoSpider(session_cookie, user_agent)
+    keyword = "2024巴黎奥运会"  # "2024 Paris Olympics"
+    results_per_page = 30
+    total_pages = 10
+    all_danmaku = []
+
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        futures = []
+        for page in range(1, total_pages + 1):
+            print(f"Fetching search results for page {page}...")
+            aids = spider.search_videos(keyword, page, results_per_page)
+            for aid in aids:
+                futures.append(executor.submit(fetch_bullet_screen, spider, aid))
+
+        for future in as_completed(futures):
+            all_danmaku.extend(future.result())
+
+    print(f"Total bullet screens fetched: {len(all_danmaku)}")
+
+    # Save the danmaku data to the "弹幕.txt" file
+    with open("弹幕.txt", mode='w', encoding="utf-8") as file:  # open the file in write mode
+        for danmaku in all_danmaku:  # iterate over all collected danmaku
+            file.write(danmaku + '\n')  # write each danmaku on its own line
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file