"""Collect danmaku (bullet comments) for Bilibili search results into a CSV.

For every (page, position) slot of the search results for SEARCH_KEYWORD the
script resolves the video's bvid, then its first cid, downloads the comment
list for that cid and appends the comment texts to OUTPUT_CSV.
"""
import html
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache

import pandas as pd
import requests

os.environ['NO_PROXY'] = 'bilibili.com'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
# NOTE(review): this is a hard-coded logged-in session cookie. It should be
# rotated and loaded from the environment or a secrets store instead of being
# committed with the source.
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"

# Keyword sent to the search endpoint.
SEARCH_KEYWORD = '2024巴黎奥运会'
# Position of the entry read from the search response's "result" list.
# NOTE(review): presumably the video section — confirm against the API.
VIDEO_RESULT_INDEX = 11
# File that receives one row per extracted comment.
OUTPUT_CSV = '2.csv'
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# The comment endpoint is expected to return XML with one <d ...>text</d>
# element per comment; the capture group is the comment text.
_BARRAGE_PATTERN = re.compile(r'<d\b[^>]*>(.*?)</d>', re.S)
# Serialises the "does the file exist?" check and the CSV write across the
# worker threads so the header is written once and rows never interleave.
_CSV_LOCK = threading.Lock()


def _api_headers():
    """Return the headers used for the authenticated JSON endpoints."""
    return {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }


@lru_cache(maxsize=64)
def _search_videos(page, keyword):
    """Fetch one search page and return its list of result entries.

    Results are cached per (page, keyword) so that looking up every position
    of a page does not download the same page again.
    """
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept from the original code, but should be removed if not required.
    response = requests.get(
        url='https://api.bilibili.com/x/web-interface/search/all/v2',
        params={'page': page + 1, 'keyword': keyword},
        headers=_api_headers(),
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    json_dict = json.loads(response.text)
    return json_dict["data"]["result"][VIDEO_RESULT_INDEX]["data"]


def _extract_barrage(xml_text):
    """Return the comment texts found in *xml_text*, XML entities unescaped."""
    return [html.unescape(text) for text in _BARRAGE_PATTERN.findall(xml_text)]


def get_bvid(page, pos, keyword=SEARCH_KEYWORD):
    """Return the bvid of the search result at index *pos* on *page*.

    Args:
        page: Zero-based page index; the API is queried with ``page + 1``.
        pos: Index of the entry within that page's result list.
        keyword: Search keyword; defaults to SEARCH_KEYWORD.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        KeyError, IndexError: If the response lacks the expected fields or
            has no entry at *pos*.
    """
    return _search_videos(page, keyword)[pos]['bvid']


def get_cid(bvid):
    """Return the cid of the first entry in the page list of video *bvid*.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        KeyError, IndexError: If the response lacks the expected fields.
    """
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(
        url=url,
        headers=_api_headers(),
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    page_list = json.loads(response.text)
    cid = page_list['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    """Download the comments for *cid* and append them to OUTPUT_CSV.

    The file is created with a header row on the first write; later calls
    append rows without a header. Safe to call from several threads.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html_doc = response.content.decode('utf-8')
    # Extract the comment texts from the downloaded document.
    barrage = _extract_barrage(html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    # The existence check and the write must happen atomically, otherwise two
    # threads can both see "no file" and each write a header.
    with _CSV_LOCK:
        if not os.path.isfile(OUTPUT_CSV):
            df.to_csv(OUTPUT_CSV, mode='w', index=False, encoding='utf-8-sig')
        else:
            df.to_csv(OUTPUT_CSV, mode='a', index=False, header=False, encoding='utf-8-sig')


def process_page(page, pos):
    """Resolve the video at (*page*, *pos*) and store its comments."""
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    get_barrage(cid)


def main():
    """Process 15 pages x 20 positions on a pool of 10 worker threads."""
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {}
        for page in range(15):
            for pos in range(20):
                futures[executor.submit(process_page, page, pos)] = (page, pos)
        # Surface worker failures; a discarded future hides its exception.
        for future in as_completed(futures):
            page, pos = futures[future]
            try:
                future.result()
            except Exception as exc:  # top-level boundary: report and go on
                print(f'page={page} pos={pos} failed: {exc!r}')


if __name__ == '__main__':
    main()