# Source: Gitea repository 102201518
# Script: 提取300个视频的弹幕并以文本形式存入.py
#         (Extract the danmaku of 300 videos and store them as text)
# NOTE: the original web-page header ("25 topics" notice, "62 lines",
# "1.9 KiB") was scrape residue and has been converted into this comment
# so the file is valid Python.
import requests
from bs4 import BeautifulSoup
import re
import jieba
import wordcloud
from collections import Counter
import imageio
from openpyxl import Workbook,load_workbook
from PIL import Image
import numpy as np
from pyinstrument import Profiler
# Start a pyinstrument profiler to time the whole scrape; the report is
# printed at the end of the script via t.stop() / t.print().
t=Profiler()
t.start()
# Request headers: a realistic browser User-Agent plus Referer/Origin so
# bilibili's endpoints do not reject the scraper outright.
# BUG FIX: the original dict had the User-Agent string as a bare value
# with no "User-Agent" key (a SyntaxError when mixed with dict items),
# and the Referer URL contained stray spaces.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105",
    "Referer": "https://www.bilibili.com/",
    "Origin": "https://search.bilibili.com",
}
# Pagination state: `page` counts search-result pages, `o` is bilibili's
# result offset (36 results per page).
page = 0
o = 0
# Collected BV ids (may contain duplicates until deduplicated below).
list_ = []
# Page through bilibili search results and collect BV ids.
# BUG FIX: the original URL literal contained no {} placeholders, so
# .format(page, o) was a no-op and the same first page was requested ten
# times.  The page/o query parameters now actually paginate.
url_template = (
    'https://search.bilibili.com/all?vt=94833807'
    '&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
    '&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
    '&page={}&o={}'
)
while page < 10:
    # Search pages are 1-based on bilibili, hence page + 1.
    response = requests.get(url=url_template.format(page + 1, o), headers=headers)
    # A BV id is "BV" followed by 10 alphanumeric characters.
    list_.extend(re.findall(r'(BV.{10})', response.text))
    page = page + 1
    o = o + 36
# Deduplicate while preserving first-seen order.
# (dict.fromkeys is O(n); the original list.count scan was O(n^2).)
ll = list(dict.fromkeys(list_))
# For each of the first 300 unique videos: resolve its cid, download the
# danmaku XML, and append every danmaku line to 弹幕.txt.
all_danmu = []
for bv in ll[:300]:
    link = "https://www.bilibili.com/video/{}/".format(bv)
    response = requests.get(url=link, headers=headers)
    response.encoding = 'utf-8'
    # The video page embeds its cid in an inline JSON blob; the cid keys
    # the comment (danmaku) XML endpoint.
    cid_match = re.search(r'"cid":(\d*),', response.text)
    if cid_match is None:
        # Page layout changed or the video is unavailable — skip it
        # instead of crashing on .group() of None (original bug).
        continue
    danmu_url = "https://comment.bilibili.com/{}.xml".format(cid_match.group(1))
    res = requests.get(danmu_url, headers=headers)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'xml')
    # Each <d> element holds one danmaku comment.
    all_danmu.extend(soup2.findAll("d"))

# Write the collected danmaku, one per line.
# BUG FIX: the original stored matches in `all_barrage` but iterated the
# never-filled `all_danmu`, so nothing was ever written.  The file is
# also opened once instead of once per danmaku.
with open('弹幕.txt', 'a', newline='', encoding='utf-8-sig') as file:
    for danmu in all_danmu:
        if danmu.string:  # skip empty <d> elements (.string can be None)
            file.write(danmu.string)
            file.write("\n")
t.stop()
t.print()