# Paris_2024/爬前300条视频弹幕.py

import re
import requests
import json
import time
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter


# Crawl the danmaku (bullet comments) of the first videos returned by a
# Bilibili keyword search and store them in one text file per video.

SEARCH_URL = 'https://api.bilibili.com/x/web-interface/search/type'
PAGELIST_URL = 'https://api.bilibili.com/x/player/pagelist'

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# Request headers that make the crawler look like a logged-in browser.
# NOTE(security): the cookie below is a real session credential (SESSDATA /
# bili_jct) committed to source. Anyone with read access to this file can act
# as that account. Rotate it and load it from an environment variable or an
# untracked config file instead of hard-coding it here.
HEADERS = {
    "cookie": "buvid4=7D777810-9AD0-8C21-1D6A-C607F528C7B427206-022123000-Sfw%2Bq8N2F39WtfvTG9WlSA%3D%3D; DedeUserID=352875468; DedeUserID__ckMd5=0a90e72ce13d5f80; buvid_fp_plain=undefined; is-2022-channel=1; FEED_LIVE_VERSION=V8; hit-new-style-dyn=1; enable_web_push=DISABLE; header_theme_version=CLOSE; buvid3=D0F7F1A0-043F-F252-3B6E-407C2F74F3E233731infoc; b_nut=1703867934; _uuid=10782D5106-6BD5-2F25-DB56-93210AD5B1077B97901infoc; hit-dyn-v2=1; rpdid=|(u))kkYuu|u0J'u~|)~)k)Ju; LIVE_BUVID=AUTO3717091338559729; CURRENT_QUALITY=80; fingerprint=77f845d4623f8049224f6d42350abef3; PVID=1; home_feed_column=5; browser_resolution=1897-998; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTAzOTIsImlhdCI6MTcyNTg5MTEzMiwicGx0IjotMX0.-PIHJf-y8eiCUpgjbSr80_6MMNByawusvATxL1TlWEg; bili_ticket_expires=1726150332; SESSDATA=e4358344%2C1741449782%2Cf61a4%2A92CjCRkwSuqlWD2A7212ZB1TNWsYRH10J2J7J5MA_OLtgkpVB-NiWTShPTTV1Uvij4R28SVjRCcl9yUW9sREZnQWNxMXpsQ3IxdC03QVJsMURLUTBuMjJkRVVZYjlNVUZfVUhNYjZlUEgtdDI0aVlfZnZRUjBqb01EZjFMWDAyaW9LQW9XWDJPdnBnIIEC; bili_jct=a84563cf63b9e39aae094c54f58ac264; sid=690q9qio; buvid_fp=77f845d4623f8049224f6d42350abef3; CURRENT_FNVAL=4048; b_lsid=883A9109D_191E0390D0E; bp_t_offset_352875468=975839826899107840",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}


def _fetch(url, params=None):
    """GET *url* with the shared headers; return the response, or None on failure."""
    try:
        response = requests.get(
            url, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"request failed, skipping {url}: {exc}")
        return None
    response.encoding = 'utf-8'
    return response


def _fetch_json_data(url, params):
    """Return the ``data`` field of a JSON response, or None if it is unavailable."""
    response = _fetch(url, params)
    if response is None:
        return None
    try:
        payload = response.json()
    except ValueError as exc:  # body was not valid JSON
        print(f"invalid JSON from {url}: {exc}")
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('data')


def _search_bvids(keyword, page):
    """Return the bvids listed on one page of video search results for *keyword*."""
    data = _fetch_json_data(
        SEARCH_URL,
        {"search_type": "video", "keyword": keyword, "page": page},
    )
    results = data.get('result') if isinstance(data, dict) else None
    if not isinstance(results, list):
        return []
    return [
        item['bvid']
        for item in results
        if isinstance(item, dict) and item.get('bvid')
    ]


def _first_cid(bvid):
    """Return the cid of the first entry in the video's page list, or None."""
    data = _fetch_json_data(PAGELIST_URL, {"bvid": bvid})
    try:
        return data[0]['cid']
    except (TypeError, IndexError, KeyError):
        # Missing, empty or unexpectedly shaped page list.
        return None


def _fetch_danmaku(cid):
    """Return the text of every ``<d>`` element in the danmaku XML for *cid*.

    Returns None when the XML could not be downloaded.
    """
    response = _fetch(f"https://comment.bilibili.com/{cid}.xml")
    if response is None:
        return None
    soup = BeautifulSoup(response.text, 'xml')
    return [node.text for node in soup.find_all('d')]


def crawl_danmaku(keyword='巴黎奥运会', max_videos=300, max_pages=21):
    """Save the danmaku of the first *max_videos* search hits for *keyword*.

    Search result pages 1..*max_pages* are walked in order. For each video the
    danmaku are appended, one per line, to ``第{n}个视频弹幕.txt`` in the current
    working directory, where ``n`` is the 1-based index of the video among the
    successfully crawled ones. A video whose cid or danmaku cannot be fetched
    is skipped and does not consume an index. A video with no danmaku is
    counted but produces no file.

    Returns:
        The number of videos whose danmaku were fetched.
    """
    crawled = 0
    if max_videos <= 0:
        return crawled

    for page in range(1, max_pages + 1):
        for bvid in _search_bvids(keyword, page):
            cid = _first_cid(bvid)
            if cid is None:
                print(f"no cid found for {bvid}, skipping")
                continue

            danmaku = _fetch_danmaku(cid)
            if danmaku is None:
                continue

            crawled += 1
            if danmaku:
                # Open the file once per video rather than once per line.
                # Append mode is kept, so re-running the script adds to
                # files left over from an earlier run.
                with open(f"第{crawled}个视频弹幕.txt", mode='a', encoding='utf-8') as f:
                    f.writelines(text + '\n' for text in danmaku)

            if crawled >= max_videos:
                return crawled
    return crawled


# Number of videos crawled so far.
cnt = crawl_danmaku()