import json
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Crawl the danmaku (bullet comments) of the Bilibili videos returned by a
# keyword search and store them in one text file per video, so that they can
# be read back and filtered later.

_SEARCH_API = "https://api.bilibili.com/x/web-interface/search/type"
_PAGELIST_API = "https://api.bilibili.com/x/player/pagelist"
_SEARCH_KEYWORD = "巴黎奥运会"
_LAST_SEARCH_PAGE = 21      # search pages 1..21 are requested
_MAX_VIDEOS = 300           # stop once this many videos have been crawled
_REQUEST_TIMEOUT = 10       # seconds; without it a stalled socket hangs forever

# Request headers that make the crawler look like a regular browser session.
# NOTE(security): the cookie embeds live session credentials (SESSDATA,
# bili_jct). It is kept verbatim so the script behaves as before, but it
# should be rotated and loaded from a private location instead of being
# committed with the source.
headers = {
    "cookie": "buvid4=7D777810-9AD0-8C21-1D6A-C607F528C7B427206-022123000-Sfw%2Bq8N2F39WtfvTG9WlSA%3D%3D; DedeUserID=352875468; DedeUserID__ckMd5=0a90e72ce13d5f80; buvid_fp_plain=undefined; is-2022-channel=1; FEED_LIVE_VERSION=V8; hit-new-style-dyn=1; enable_web_push=DISABLE; header_theme_version=CLOSE; buvid3=D0F7F1A0-043F-F252-3B6E-407C2F74F3E233731infoc; b_nut=1703867934; _uuid=10782D5106-6BD5-2F25-DB56-93210AD5B1077B97901infoc; hit-dyn-v2=1; rpdid=|(u))kkYuu|u0J'u~|)~)k)Ju; LIVE_BUVID=AUTO3717091338559729; CURRENT_QUALITY=80; fingerprint=77f845d4623f8049224f6d42350abef3; PVID=1; home_feed_column=5; browser_resolution=1897-998; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTAzOTIsImlhdCI6MTcyNTg5MTEzMiwicGx0IjotMX0.-PIHJf-y8eiCUpgjbSr80_6MMNByawusvATxL1TlWEg; bili_ticket_expires=1726150332; SESSDATA=e4358344%2C1741449782%2Cf61a4%2A92CjCRkwSuqlWD2A7212ZB1TNWsYRH10J2J7J5MA_OLtgkpVB-NiWTShPTTV1Uvij4R28SVjRCcl9yUW9sREZnQWNxMXpsQ3IxdC03QVJsMURLUTBuMjJkRVVZYjlNVUZfVUhNYjZlUEgtdDI0aVlfZnZRUjBqb01EZjFMWDAyaW9LQW9XWDJPdnBnIIEC; bili_jct=a84563cf63b9e39aae094c54f58ac264; sid=690q9qio; buvid_fp=77f845d4623f8049224f6d42350abef3; CURRENT_FNVAL=4048; b_lsid=883A9109D_191E0390D0E; bp_t_offset_352875468=975839826899107840",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}


def _get_json(url, params):
    """Return the decoded JSON body of ``GET url``, or None if it fails.

    Network errors, non-2xx statuses and undecodable bodies are reported on
    stdout and turned into ``None`` so one bad request does not abort the
    whole crawl.
    """
    try:
        response = requests.get(
            url, params=params, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"request to {url} failed: {exc}")
        return None


def _iter_search_results():
    """Yield the result entry of every video found by the keyword search.

    Pages are requested lazily, so a consumer that stops early does not
    trigger requests for the remaining pages.
    """
    for page in range(1, _LAST_SEARCH_PAGE + 1):
        payload = _get_json(
            _SEARCH_API,
            {"search_type": "video", "keyword": _SEARCH_KEYWORD, "page": page},
        )
        try:
            results = payload['data']['result']
        except (KeyError, TypeError):
            # Error payloads carry no 'data'/'result'.
            results = None
        if not results:
            print(f"search page {page} returned no results, skipping")
            continue
        yield from results


def _fetch_cid(bvid):
    """Return the cid of the first part of video *bvid*, or None."""
    payload = _get_json(_PAGELIST_API, {"bvid": bvid})
    try:
        return payload['data'][0]['cid']
    except (KeyError, IndexError, TypeError):
        return None


def _fetch_danmaku(cid):
    """Return the text of every ``<d>`` element for *cid*, or None on failure."""
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(
            url=url, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"request to {url} failed: {exc}")
        return None
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'xml')
    return [node.text for node in soup.find_all('d')]


cnt = 0  # number of videos crawled so far
for result in _iter_search_results():
    bvid = result.get('bvid')
    if not bvid:
        continue
    # The bvid identifies the video; its cid locates the danmaku document.
    cid = _fetch_cid(bvid)
    if cid is None:
        continue
    danmaku = _fetch_danmaku(cid)
    if danmaku is None:
        continue
    cnt += 1
    if danmaku:
        # One file per video, opened once; append mode is kept so existing
        # files are extended exactly as before.
        with open(f"第{cnt}个视频弹幕.txt", mode='a', encoding='utf-8') as f:
            f.writelines(text + '\n' for text in danmaku)
    if cnt >= _MAX_VIDEOS:
        # Enough videos collected: stop the whole crawl.
        break