You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
3.3 KiB
57 lines
3.3 KiB
import re
|
|
import requests
|
|
import json
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
from collections import Counter
|
|
|
|
|
|
# Crawl Bilibili search results for the keyword "巴黎奥运会" and save each
# video's danmaku (bullet comments) to its own local text file, one comment per
# line, so later steps can read and filter them.

# Stop once danmaku for this many videos have been collected.
MAX_VIDEOS = 300
# Search-result pages to walk: 1 through 21 inclusive.
SEARCH_PAGES = range(1, 22)
# Seconds to wait on any single HTTP request; without a timeout one stalled
# connection would hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# Request headers that make the requests look like they come from a browser.
# Built once here because they are identical for every request.
# NOTE(security): the cookie below embeds a logged-in session (SESSDATA,
# bili_jct). It is kept verbatim so the script behaves as before, but it should
# be moved out of source control (environment variable / local config) and the
# exposed session should be revoked.
headers = {
    "cookie": "buvid4=7D777810-9AD0-8C21-1D6A-C607F528C7B427206-022123000-Sfw%2Bq8N2F39WtfvTG9WlSA%3D%3D; DedeUserID=352875468; DedeUserID__ckMd5=0a90e72ce13d5f80; buvid_fp_plain=undefined; is-2022-channel=1; FEED_LIVE_VERSION=V8; hit-new-style-dyn=1; enable_web_push=DISABLE; header_theme_version=CLOSE; buvid3=D0F7F1A0-043F-F252-3B6E-407C2F74F3E233731infoc; b_nut=1703867934; _uuid=10782D5106-6BD5-2F25-DB56-93210AD5B1077B97901infoc; hit-dyn-v2=1; rpdid=|(u))kkYuu|u0J'u~|)~)k)Ju; LIVE_BUVID=AUTO3717091338559729; CURRENT_QUALITY=80; fingerprint=77f845d4623f8049224f6d42350abef3; PVID=1; home_feed_column=5; browser_resolution=1897-998; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTAzOTIsImlhdCI6MTcyNTg5MTEzMiwicGx0IjotMX0.-PIHJf-y8eiCUpgjbSr80_6MMNByawusvATxL1TlWEg; bili_ticket_expires=1726150332; SESSDATA=e4358344%2C1741449782%2Cf61a4%2A92CjCRkwSuqlWD2A7212ZB1TNWsYRH10J2J7J5MA_OLtgkpVB-NiWTShPTTV1Uvij4R28SVjRCcl9yUW9sREZnQWNxMXpsQ3IxdC03QVJsMURLUTBuMjJkRVVZYjlNVUZfVUhNYjZlUEgtdDI0aVlfZnZRUjBqb01EZjFMWDAyaW9LQW9XWDJPdnBnIIEC; bili_jct=a84563cf63b9e39aae094c54f58ac264; sid=690q9qio; buvid_fp=77f845d4623f8049224f6d42350abef3; CURRENT_FNVAL=4048; b_lsid=883A9109D_191E0390D0E; bp_t_offset_352875468=975839826899107840",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}


def _search_page(page):
    """Return the video entries listed on one search-result page.

    Args:
        page: 1-based page number of the keyword search.

    Returns:
        The value of ``data.result`` from the JSON response, or an empty
        list when the response carries no such field.

    Raises:
        requests.RequestException: the HTTP request failed or timed out.
        ValueError: the response body was not valid JSON.
    """
    url = f'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page={page}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    payload = response.json()
    # Error payloads may lack "data" or "result"; treat them as an empty page
    # rather than failing on the missing key.
    return (payload.get('data') or {}).get('result') or []


def _fetch_danmaku(bvid):
    """Return the text of every danmaku attached to one video.

    Args:
        bvid: the video's BV id as found in a search-result entry.

    Returns:
        A list with the text of each ``<d>`` element in the video's
        danmaku XML document, in document order.

    Raises:
        requests.RequestException: an HTTP request failed or timed out.
        ValueError: the page-list response was not valid JSON.
        KeyError, IndexError, TypeError: the page-list response did not
            contain a cid for the video.
    """
    # Resolve the BV id to the cid of the video's first part.
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    cid = response.json()['data'][0]['cid']

    # The danmaku for a cid are served as an XML document.
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'xml')
    return [node.text for node in soup.find_all('d')]


def _crawl():
    """Walk the search pages and save danmaku for up to MAX_VIDEOS videos.

    Each successfully fetched video is numbered in crawl order and its
    danmaku are appended to ``第{n}个视频弹幕.txt``. A page or video that
    cannot be fetched or parsed is reported on stdout and skipped instead
    of aborting the whole crawl.

    Returns:
        The number of videos whose danmaku were fetched.
    """
    saved = 0
    for page in SEARCH_PAGES:
        try:
            results = _search_page(page)
        except (requests.RequestException, ValueError, AttributeError) as err:
            print(f"skipping search page {page}: {err!r}")
            continue

        for result in results:
            try:
                lines = _fetch_danmaku(result['bvid'])
            except (requests.RequestException, ValueError, KeyError,
                    IndexError, TypeError) as err:
                print(f"skipping video: {err!r}")
                continue

            saved += 1
            # Open the file once per video rather than once per danmaku line.
            # A video with no danmaku still counts but produces no file.
            if lines:
                with open(f"第{saved}个视频弹幕.txt", mode='a', encoding='utf-8') as f:
                    f.writelines(line + '\n' for line in lines)

            if saved == MAX_VIDEOS:
                return saved
    return saved


# Number of videos crawled; the crawl runs as soon as the script executes.
cnt = _crawl()