# -*- coding: utf-8 -*-
"""Scrape Bilibili danmaku (bullet comments) for "2024 Paris Olympics" search results.

Flow:
  1. Walk 10 pages of Bilibili search results and harvest video BV ids.
  2. For each unique video (first 300), resolve its cid and fetch the
     danmaku XML from comment.bilibili.com.
  3. Append every danmaku line to 弹幕.txt.

Wall-clock profiling is done with pyinstrument.
"""

import re
from collections import Counter  # NOTE(review): unused here — presumably for later word-frequency analysis

import imageio          # NOTE(review): unused in this script — kept, may be used elsewhere
import jieba            # NOTE(review): unused in this script
import numpy as np      # NOTE(review): unused in this script
import requests
import wordcloud        # NOTE(review): unused in this script
from bs4 import BeautifulSoup
from openpyxl import Workbook, load_workbook  # NOTE(review): unused in this script
from PIL import Image   # NOTE(review): unused in this script
from pyinstrument import Profiler

profiler = Profiler()
profiler.start()

# Request headers.
# BUG FIX: the original dict literal mixed a bare string with key:value
# pairs (a SyntaxError) — the UA string was missing its "User-Agent" key.
# The Referer URL also contained stray spaces.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 "
        "SLBrowser/9.0.3.5211 SLBChan/105"
    ),
    "Referer": "https://www.bilibili.com/",
    "Origin": "https://search.bilibili.com",
}

# Search-results URL template.
# BUG FIX: the original URL had no {} placeholders, so .format(page, o)
# was a no-op and every one of the 10 iterations fetched the same page.
search_url = (
    "https://search.bilibili.com/all?vt=94833807"
    "&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
    "&from_source=webtop_search&spm_id_from=333.1007&search_source=3"
    "&page={}&o={}"
)

# Collect BV ids from 10 result pages (36 results per page -> o offset).
bv_ids = []
for page in range(10):
    offset = page * 36
    response = requests.get(url=search_url.format(page + 1, offset), headers=headers)
    bv_ids.extend(re.findall(r"(BV.{10})", response.text))

# De-duplicate while preserving first-seen order.
# (Replaces the original O(n^2) list.count() scan with an O(n) pass.)
unique_ids = list(dict.fromkeys(bv_ids))

# Pre-compile the cid pattern: it is applied once per video page.
cid_pattern = re.compile(r'"cid":(\d*),')

# Fetch danmaku for the first 300 unique videos.
# BUG FIX: the original parsed the <d> tags into `all_barrage` but then
# iterated the never-populated `all_danmu`, so nothing was ever written.
all_danmu = []
for bv in unique_ids[:300]:
    video_url = "https://www.bilibili.com/video/{}/".format(bv)
    response = requests.get(url=video_url, headers=headers)
    response.encoding = "utf-8"

    # BUG FIX: guard against a missing cid (unavailable/redesigned page)
    # instead of crashing on .group() of None.
    match = cid_pattern.search(response.text)
    if match is None:
        continue

    # The cid indexes the video's danmaku XML feed.
    danmaku_url = "https://comment.bilibili.com/{}.xml".format(match.group(1))
    res = requests.get(danmaku_url, headers=headers)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "xml")
    all_danmu.extend(soup.find_all("d"))  # each <d> holds one danmaku line

# Write every danmaku on its own line.
# Open the output file once, not once per comment as the original did.
with open("弹幕.txt", "a", newline="", encoding="utf-8-sig") as file:
    for danmu in all_danmu:
        if danmu.string:  # empty <d> tags have .string == None
            file.write(danmu.string)
            file.write("\n")

profiler.stop()
profiler.print()