# 102201518/提取300个视频的弹幕并以文本形式存入.py — extract the danmaku of 300 videos and save them as text

import requests
from bs4 import BeautifulSoup
import re
import jieba
import wordcloud
from collections import Counter
import imageio
from openpyxl import Workbook,load_workbook
from PIL import Image
import numpy as np
from pyinstrument import Profiler
# Create a pyinstrument Profiler and start it so the rest of the script is profiled.
t=Profiler()
t.start()
# Request headers sent with the Bilibili page requests.
# The original literal had the browser string with no key, which mixes set and
# dict syntax and is a SyntaxError; it is the User-Agent value. The Referer
# URL also contained stray spaces, which made it malformed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105",
    "Referer": "https://www.bilibili.com/",
    "Origin": "https://search.bilibili.com",
}
# Walk the search-result pages and collect every BV id found in their HTML.
# `page` is the result page number and `o` the result offset sent with it.
page = 0
o = 0
# Every BV id found, duplicates included.
list_ = []
# The original URL contained no "{}" placeholders, so .format(page, o) was a
# no-op and the same first page was downloaded ten times. The page/offset
# query parameters are now part of the URL.
# NOTE(review): assumes Bilibili search pages are 1-based with 36 results per
# page (page=2 <-> o=36) — confirm against the live site.
for page in range(1, 11):
    o = (page - 1) * 36
    url = 'https://search.bilibili.com/all?vt=94833807&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={}&o={}'.format(page, o)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    text1 = response.text
    # A BV id is matched as "BV" followed by 10 characters.
    html = re.findall(r'(BV.{10})', text1)
    list_.extend(html)
# Drop duplicate BV ids while keeping first-seen order.
# dict.fromkeys keeps insertion order and does this in one linear pass,
# replacing the quadratic list.count() check per element.
ll = list(dict.fromkeys(list_))


# Download the danmaku (bullet comments) of the first 300 videos and append
# them to 弹幕.txt, one comment per line.

# Kept from the original script; nothing below adds to it. The original write
# loop iterated this always-empty list by mistake, so no danmaku was written.
all_danmu = []

# Open the output file once instead of reopening it for every comment.
with open('弹幕.txt', 'a', newline='', encoding='utf-8-sig') as file:
    for j in ll[:300]:
        link = "https://www.bilibili.com/video/{}/".format(j)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=link, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        html1 = response.text
        # The cid found in the video page is used to build the danmaku XML URL.
        cid_match = re.search(r'"cid":(\d+),', html1)
        if cid_match is None:
            # No cid in this page (e.g. an error page): skip the video rather
            # than crash on None.group().
            continue
        cid = cid_match.group(1)
        link = "https://comment.bilibili.com/{}.xml".format(cid)
        res = requests.get(link, timeout=10)
        res.encoding = 'utf-8'
        soup2 = BeautifulSoup(res.text, 'xml')
        # Each <d> element of the XML is one danmaku.
        all_barrage = soup2.find_all("d")
        # Write the danmaku that were just fetched.
        for danmu in all_barrage:
            # .string is None for an empty element or one with nested tags.
            if danmu.string is None:
                continue
            file.write(danmu.string)
            file.write("\n")
# Stop the profiler started near the top of the script and print its report.
t.stop()
t.print()