You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
3.1 KiB

import html
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
# Exclude bilibili.com from any configured HTTP proxy (presumably so that
# `requests` talks to the API directly). This overwrites an existing NO_PROXY.
os.environ['NO_PROXY'] = 'bilibili.com'

# Browser-like User-Agent sent with every request made by this script.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'

# Session cookie sent with the search and page-list requests.
# NOTE(security): this is a logged-in session credential committed to source.
# The embedded `bili_ticket_expires` timestamp suggests it is time-limited, so
# a fresh value can be supplied through the BILIBILI_COOKIE environment
# variable without editing this file; the literal below is only the fallback.
COOKIE = os.environ.get(
    'BILIBILI_COOKIE',
    "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38",
)
def get_bvid(page, pos, keyword='2024巴黎奥运会', timeout=10):
    """Return the ``bvid`` of one entry on a Bilibili search-result page.

    Args:
        page: Zero-based page index; the API is queried with ``page + 1``.
        pos: Index of the wanted entry inside the selected result section.
        keyword: Search keyword. Defaults to the value this script has
            always searched for.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The ``bvid`` field of the selected search result.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON payload does not have
            the expected ``data.result[11].data[pos].bvid`` structure.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2'
    # Let requests build and percent-encode the query string (the keyword
    # may contain non-ASCII characters).
    params = {'page': page + 1, 'keyword': keyword}
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # TLS verification stays at the requests default (enabled): this request
    # carries the session cookie, and the danmaku download from the same
    # host already relies on verified TLS.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    body = response.text
    print(body)
    json_dict = json.loads(body)
    # NOTE(review): index 11 is presumably the video section of the combined
    # search result -- confirm against the API response layout.
    return json_dict["data"]["result"][11]["data"][pos]['bvid']
def get_cid(bvid, timeout=10):
    """Return the ``cid`` of the first page (part) of a video.

    Args:
        bvid: The video's ``bvid`` identifier.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The ``cid`` field of the first entry in the page list.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON payload has no
            ``data[0].cid`` entry.
    """
    url = 'https://api.bilibili.com/x/player/pagelist'
    params = {'bvid': bvid, 'jsonp': 'jsonp'}
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # TLS verification stays at the requests default (enabled) because the
    # session cookie is sent with this request.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    payload = json.loads(response.text)
    cid = payload['data'][0]['cid']
    print(cid)
    return cid
# Serialises CSV writes: get_barrage runs on several worker threads, and the
# "does the file exist?" check must not interleave with another thread's write.
_CSV_LOCK = threading.Lock()


def get_barrage(cid, csv_path='2.csv', timeout=10):
    """Download the danmaku (barrage) of one video part and append it to a CSV.

    The CSV has a single ``barrage`` column. The file is created with a
    header row if it does not exist yet; otherwise rows are appended
    without a header.

    Args:
        cid: The ``cid`` identifying the video part.
        csv_path: Output CSV file. Defaults to ``'2.csv'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so'
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, params={'oid': cid}, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Decode the raw bytes explicitly rather than trusting the guessed encoding.
    html_doc = response.content.decode('utf-8')
    # Matching pattern: the text content of every <d ...>...</d> element.
    pattern = re.compile('<d.*?>(.*?)</d>')
    # Extract the comments and decode markup entities (&amp;, &lt;, ...) so
    # the CSV holds the text as it was written, not its escaped form.
    barrage = [html.unescape(text) for text in pattern.findall(html_doc)]
    df = pd.DataFrame(barrage, columns=['barrage'])
    with _CSV_LOCK:
        # Check-then-write happens under the lock so exactly one thread
        # creates the file (with its header) and appends never interleave.
        is_new_file = not os.path.isfile(csv_path)
        df.to_csv(
            csv_path,
            mode='w' if is_new_file else 'a',
            index=False,
            header=is_new_file,
            encoding='utf-8-sig',
        )
def process_page(page, pos):
    """Scrape the danmaku of the search result at ``pos`` on page ``page``.

    Resolves the search result to its ``bvid``, the ``bvid`` to a ``cid``,
    and hands that ``cid`` to ``get_barrage``, which stores the comments.
    """
    get_barrage(get_cid(get_bvid(page, pos)))
if __name__ == '__main__':
    # Scrape the first 15 search pages, 20 results per page.
    page_count = 15
    results_per_page = 20

    futures = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        for page in range(page_count):
            for pos in range(results_per_page):
                futures[executor.submit(process_page, page, pos)] = (page, pos)

    # Leaving the `with` block waits for every task to finish. Inspect each
    # future so a failed download is reported instead of vanishing silently;
    # one bad item must not abort the rest of the run, so nothing is re-raised.
    for future, (page, pos) in futures.items():
        error = future.exception()
        if error is not None:
            print(f'page {page}, pos {pos} failed: {error!r}')