You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
3.1 KiB
69 lines
3.1 KiB
2 months ago
|
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
|
||
|
|
||
|
# Exclude bilibili.com from any proxy configured through the environment.
os.environ['NO_PROXY'] = 'bilibili.com'

# Browser-like User-Agent header sent with every request made by this script.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'

# Cookie header sent with the search and page-list requests.
# The BILIBILI_COOKIE environment variable, when set, takes precedence over
# the built-in value so credentials do not have to be edited into the source.
# NOTE(review): the fallback below is a hard-coded account session cookie;
# consider rotating it and removing it from version control.
COOKIE = os.environ.get(
    'BILIBILI_COOKIE',
    "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38",
)
|
||
|
|
||
|
|
||
|
def get_bvid(page, pos, timeout=10):
    """Return the ``bvid`` of one search hit for the keyword "2024巴黎奥运会".

    Args:
        page: Zero-based page index; the API is queried with ``page + 1``.
        pos: Index of the entry to pick inside the selected result group.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The ``bvid`` value of the selected search entry.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the JSON lacks the expected structure or
            ``pos`` is past the end of the result group.
    """
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # Let requests build and percent-encode the query string.
    params = {'page': page + 1, 'keyword': '2024巴黎奥运会'}
    # NOTE(review): verify=False turns off TLS certificate validation while
    # the session cookie is being sent; kept as in the original, but confirm
    # it is really required in the target environment.
    response = requests.get(
        url='https://api.bilibili.com/x/web-interface/search/all/v2',
        params=params,
        headers=headers,
        verify=False,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of on a confusing parse error later.
    response.raise_for_status()
    json_dict = response.json()
    # NOTE(review): result group 11 is hard-coded; presumably it is the video
    # group of the combined search response -- verify against the API.
    return json_dict["data"]["result"][11]["data"][pos]['bvid']
|
||
|
|
||
|
|
||
|
def get_cid(bvid, timeout=10):
    """Look up the ``cid`` of the first entry in a video's page list.

    Args:
        bvid: The video's ``bvid`` identifier.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The ``cid`` of the first element of the response's ``data`` list.
        The value is also printed to stdout.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the JSON lacks the expected structure.
    """
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # NOTE(review): verify=False turns off TLS certificate validation while
    # the session cookie is being sent; kept as in the original, but confirm
    # it is really required in the target environment.
    response = requests.get(
        url='https://api.bilibili.com/x/player/pagelist',
        params={'bvid': bvid, 'jsonp': 'jsonp'},
        headers=headers,
        verify=False,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of on a confusing parse error later.
    response.raise_for_status()
    page_list = response.json()
    cid = page_list['data'][0]['cid']
    print(cid)
    return cid
|
||
|
|
||
|
|
||
|
# Guards the "does 2.csv exist?" check together with the write that follows.
# The script calls get_barrage from several pool threads; without the lock two
# threads can both see a missing file and truncate each other's output, or
# write the header more than once.
_CSV_LOCK = threading.Lock()

# Captures the text between <d ...> and </d> in the downloaded document.
_BARRAGE_PATTERN = re.compile(r'<d.*?>(.*?)</d>')


def get_barrage(cid, timeout=10):
    """Download the barrage list for ``cid`` and append it to ``2.csv``.

    The response body is decoded as UTF-8, every ``<d ...>text</d>`` element
    is extracted with a regular expression, and the texts are written as a
    single ``barrage`` column. The file is created with a header row when it
    does not exist yet; otherwise the rows are appended without a header.

    Args:
        cid: Identifier passed to the API as the ``oid`` query parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Do not write an error page's contents into the CSV.
    response.raise_for_status()
    html_doc = response.content.decode('utf-8')
    # Extract the barrage texts from the document using the pattern.
    barrage = _BARRAGE_PATTERN.findall(html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    # Existence check and write must be one atomic step across threads.
    with _CSV_LOCK:
        if not os.path.isfile('2.csv'):
            df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
        else:
            df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')
|
||
|
|
||
|
|
||
|
def process_page(page, pos):
    """Fetch and store the barrage of a single search hit.

    Resolves the search entry at (``page``, ``pos``) to a ``bvid`` with
    ``get_bvid``, resolves that to a ``cid`` with ``get_cid``, and hands the
    ``cid`` to ``get_barrage``.

    Args:
        page: Page index forwarded to ``get_bvid``.
        pos: Entry index forwarded to ``get_bvid``.
    """
    get_barrage(get_cid(get_bvid(page, pos)))
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Fan out one task per (page, position) pair: 15 pages x 20 entries.
    # The futures are kept so that failures can be reported; exceptions
    # raised inside a worker are otherwise stored on the future and never
    # surface.
    futures = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(15):
            for j in range(20):
                futures[executor.submit(process_page, i, j)] = (i, j)

    # Leaving the ``with`` block waits for every task, so each future is done.
    for future, (i, j) in futures.items():
        error = future.exception()
        if error is not None:
            print(f'process_page({i}, {j}) failed: {error!r}')
|