You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paris_2024/爬前300条视频弹幕.py

57 lines
3.3 KiB

import json
import os
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
# Crawl the danmaku (bullet comments) of the first videos returned by a
# Bilibili keyword search and store them in one text file per video
# ("<n>个视频弹幕.txt" in the current working directory).

SEARCH_API = "https://api.bilibili.com/x/web-interface/search/type"
PAGELIST_API = "https://api.bilibili.com/x/player/pagelist"

KEYWORD = "巴黎奥运会"  # search keyword
MAX_VIDEOS = 300  # stop once this many videos have been crawled
MAX_PAGES = 21  # search-result pages 1..21 are visited at most
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled socket hangs forever

# NOTE(security): this is a logged-in session cookie committed to source
# control.  It should be rotated and removed; until then it is only used as a
# fallback when the BILIBILI_COOKIE environment variable is not set.
_FALLBACK_COOKIE = "buvid4=7D777810-9AD0-8C21-1D6A-C607F528C7B427206-022123000-Sfw%2Bq8N2F39WtfvTG9WlSA%3D%3D; DedeUserID=352875468; DedeUserID__ckMd5=0a90e72ce13d5f80; buvid_fp_plain=undefined; is-2022-channel=1; FEED_LIVE_VERSION=V8; hit-new-style-dyn=1; enable_web_push=DISABLE; header_theme_version=CLOSE; buvid3=D0F7F1A0-043F-F252-3B6E-407C2F74F3E233731infoc; b_nut=1703867934; _uuid=10782D5106-6BD5-2F25-DB56-93210AD5B1077B97901infoc; hit-dyn-v2=1; rpdid=|(u))kkYuu|u0J'u~|)~)k)Ju; LIVE_BUVID=AUTO3717091338559729; CURRENT_QUALITY=80; fingerprint=77f845d4623f8049224f6d42350abef3; PVID=1; home_feed_column=5; browser_resolution=1897-998; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTAzOTIsImlhdCI6MTcyNTg5MTEzMiwicGx0IjotMX0.-PIHJf-y8eiCUpgjbSr80_6MMNByawusvATxL1TlWEg; bili_ticket_expires=1726150332; SESSDATA=e4358344%2C1741449782%2Cf61a4%2A92CjCRkwSuqlWD2A7212ZB1TNWsYRH10J2J7J5MA_OLtgkpVB-NiWTShPTTV1Uvij4R28SVjRCcl9yUW9sREZnQWNxMXpsQ3IxdC03QVJsMURLUTBuMjJkRVVZYjlNVUZfVUhNYjZlUEgtdDI0aVlfZnZRUjBqb01EZjFMWDAyaW9LQW9XWDJPdnBnIIEC; bili_jct=a84563cf63b9e39aae094c54f58ac264; sid=690q9qio; buvid_fp=77f845d4623f8049224f6d42350abef3; CURRENT_FNVAL=4048; b_lsid=883A9109D_191E0390D0E; bp_t_offset_352875468=975839826899107840"

# Request headers that make the crawler look like a regular browser session.
# Built once: they are identical for every request.
HEADERS = {
    "cookie": os.environ.get("BILIBILI_COOKIE", _FALLBACK_COOKIE),
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}


def search_videos(keyword, page):
    """Return the video entries listed on one page of the search results.

    Args:
        keyword: Search term sent to the video search API.
        page: 1-based result page number.

    Returns:
        The list stored under ``data.result`` in the JSON reply, or an empty
        list when the reply carries no such list.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
        ValueError: If the reply body is not valid JSON.
    """
    response = requests.get(
        SEARCH_API,
        params={"search_type": "video", "keyword": keyword, "page": page},
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    data = response.json().get("data") or {}
    return data.get("result") or []


def fetch_cid(bvid):
    """Look up the cid of the first part of the video identified by *bvid*.

    Returns:
        The ``cid`` of the first entry in the page list, or ``None`` when the
        API reports no parts for the video.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
        ValueError: If the reply body is not valid JSON.
    """
    response = requests.get(
        PAGELIST_API,
        params={"bvid": bvid},
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    response.encoding = "utf-8"
    parts = response.json().get("data") or []
    return parts[0]["cid"] if parts else None


def fetch_danmaku(cid):
    """Download the danmaku XML for *cid* and return the text of every <d> tag.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
    """
    response = requests.get(
        f"https://comment.bilibili.com/{cid}.xml",
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "xml")
    return [node.text for node in soup.find_all("d")]


def save_danmaku(index, lines):
    """Append *lines* to "<index>个视频弹幕.txt", one danmaku per line.

    The file is opened once per video rather than once per danmaku.  Nothing
    is written (and no file is created) when *lines* is empty.
    """
    if not lines:
        return
    with open(f"{index}个视频弹幕.txt", mode="a", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)


def main(keyword=KEYWORD, max_videos=MAX_VIDEOS, max_pages=MAX_PAGES):
    """Crawl danmaku for up to *max_videos* search hits and save them locally.

    Videos whose cid or danmaku cannot be fetched are reported and skipped;
    they do not count toward *max_videos*.

    Returns:
        The number of videos whose danmaku were fetched.
    """
    crawled = 0  # number of videos crawled so far
    if max_videos <= 0:
        return crawled

    for page in range(1, max_pages + 1):
        videos = search_videos(keyword, page)
        if not videos:
            print(f"page {page}: no search results, stopping")
            break

        for video in videos:
            bvid = video.get("bvid") if isinstance(video, dict) else None
            if not bvid:
                continue

            try:
                # bvid -> cid -> danmaku XML
                cid = fetch_cid(bvid)
                if cid is None:
                    print(f"{bvid}: no cid found, skipping")
                    continue
                lines = fetch_danmaku(cid)
            except (requests.RequestException, ValueError) as exc:
                # One broken video must not abort the whole crawl.
                print(f"{bvid}: request failed ({exc}), skipping")
                continue

            crawled += 1
            # Persist to disk so later analysis steps can read it back.
            save_danmaku(crawled, lines)

            if crawled >= max_videos:
                return crawled

    return crawled


if __name__ == "__main__":
    main()