# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

# 62 lines
# 2.9 KiB

import os
import time
import requests
import re
import json
import pandas as pd
# Exclude bilibili.com from any proxy configured in the environment.
# NOTE: this overwrites whatever NO_PROXY value the process inherited.
os.environ['NO_PROXY'] = 'bilibili.com'

# Browser identity sent in the User-Agent header of every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'

# Cookie header sent with the search and page-list requests.
# Set the BILIBILI_COOKIE environment variable to supply a fresh session
# without editing this file.
# SECURITY: the fallback literal is a captured browser session committed to
# source control. Its bili_ticket_expires field is 1694088185 (presumably a
# Unix timestamp, i.e. September 2023), so it is most likely stale; it is kept
# only for backward compatibility and should be removed once callers set the
# environment variable.
COOKIE = os.environ.get(
    'BILIBILI_COOKIE',
    "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38",
)
def get_bvid(page, pos, keyword='2024巴黎奥运会'):
    """Return the BV id of one video hit from a Bilibili search-results page.

    Args:
        page: Zero-based page index; the API is queried with ``page + 1``.
        pos: Zero-based position of the wanted hit inside the page's video
            section.
        keyword: Search phrase. Defaults to the phrase this script was
            originally written for.

    Returns:
        The ``bvid`` value of the selected hit.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON does not have the
            expected layout, or the page holds fewer than ``pos + 1`` hits.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2'
    # Passing the query via ``params`` lets requests percent-encode the
    # non-ASCII keyword.
    params = {'page': page + 1, 'keyword': keyword}
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # TLS verification is left enabled (as in get_barrage), and a timeout
    # keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    payload = json.loads(response.text)

    sections = payload["data"]["result"]
    # NOTE(review): each section is believed to carry a ``result_type`` tag,
    # with the video hits under 'video' -- confirm against the API docs.
    # Looking the section up by tag avoids depending on its position; if no
    # section is tagged that way, fall back to the fixed index 11.
    video_section = next(
        (
            section
            for section in sections
            if isinstance(section, dict)
            and section.get('result_type') == 'video'
        ),
        None,
    )
    if video_section is None:
        video_section = sections[11]
    return video_section["data"][pos]['bvid']
def get_cid(bvid):
    """Return the ``cid`` of the first entry in a video's page list.

    Args:
        bvid: BV id of the video, as returned by ``get_bvid``.

    Returns:
        The ``cid`` value of the first element of the response's ``data``
        list. The value is also printed as progress output.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON does not have the
            expected layout (for example an empty page list).
    """
    url = 'https://api.bilibili.com/x/player/pagelist'
    params = {'bvid': bvid, 'jsonp': 'jsonp'}
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    # TLS verification is left enabled (as in get_barrage), and a timeout
    # keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    payload = json.loads(response.text)
    cid = payload['data'][0]['cid']
    print(cid)
    return cid
def get_barrage(cid, path='barrage.csv'):
    """Download the barrage (overlay comment) texts for *cid* into a CSV file.

    The response body is scanned for ``<d ...>text</d>`` elements and each
    captured text becomes one row of a single ``barrage`` column. The file is
    created with a header row when it does not exist yet; otherwise the rows
    are appended without a header.

    Args:
        cid: Identifier passed to the endpoint as ``oid`` (the value returned
            by ``get_cid``).
        path: Destination CSV file. Defaults to ``barrage.csv`` in the
            current working directory.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status. Without the status check an error page would simply
            produce zero rows.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so'
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, params={'oid': cid}, headers=headers, timeout=10)
    response.raise_for_status()
    html_doc = response.content.decode('utf-8')

    # Capture the text between every <d ...> opening tag and its </d>.
    # NOTE(review): the captured text is stored as-is; if the payload escapes
    # characters as XML entities they are not decoded here.
    barrage = re.findall(r'<d.*?>(.*?)</d>', html_doc)

    df = pd.DataFrame(barrage, columns=['barrage'])
    is_new_file = not os.path.isfile(path)
    df.to_csv(
        path,
        mode='w' if is_new_file else 'a',
        index=False,
        header=is_new_file,  # write the header only once, on creation
        encoding='utf-8-sig',
    )
def main():
    """Crawl 15 search pages x 20 hits and store each video's barrage.

    For every (page, position) pair the video's bvid is looked up, resolved
    to a cid, and its barrage is appended to the CSV. A failure on one video
    is reported and skipped so that it does not abort the remaining crawl.
    """
    for page in range(15):
        for pos in range(20):
            try:
                get_barrage(get_cid(get_bvid(page, pos)))
            except (
                requests.RequestException,  # network error / bad HTTP status
                LookupError,  # KeyError / IndexError: unexpected JSON shape or short page
                TypeError,  # e.g. a null where a list or dict was expected
                ValueError,  # body was not valid JSON / not valid UTF-8
            ) as err:
                print(f'skipped page={page} pos={pos}: {err!r}')
            # Pause between videos to limit the request rate, even after a
            # failure.
            time.sleep(1)


if __name__ == '__main__':
    main()