You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
1.9 KiB
62 lines
1.9 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
import re
|
|
import jieba
|
|
import wordcloud
|
|
from collections import Counter
|
|
import imageio
|
|
from openpyxl import Workbook,load_workbook
|
|
from PIL import Image
|
|
import numpy as np
|
|
from pyinstrument import Profiler
|
|
# Create the pyinstrument profiler and start it before any work is done,
# so the report printed at the end of the script covers the whole run.
t = Profiler()
t.start()
# Request headers sent with the Bilibili page requests.
# The browser string must be stored under the "User-Agent" key: a bare string
# next to key/value pairs is not a valid dict literal. The Referer is written
# without embedded spaces so that it is a well-formed URL.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105",
    "Referer": "https://www.bilibili.com/",
    "Origin": "https://search.bilibili.com",
}
# Pagination state: `page` counts the result pages fetched so far and `o` is
# the result offset that goes with the page (it advances by 36 per page).
page = 0
o = 0
# Every BV id found on the search pages, duplicates included.
list_ = []
# A BV id is "BV" followed by 10 alphanumeric characters; compiled once
# because the pattern is reused for every page.
bv_pattern = re.compile(r'BV[0-9A-Za-z]{10}')
# Walk the search result pages and collect the BV ids they mention.
while page < 10:
    # The page number and offset have to be part of the query string,
    # otherwise .format() has nothing to fill in and the same first page is
    # downloaded on every iteration.
    # NOTE(review): page is sent 1-based so that it lines up with the offsets
    # 0, 36, 72, ... - confirm against the live search URL.
    url = (
        'https://search.bilibili.com/all?vt=94833807&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
        '&page={}&o={}'
    ).format(page + 1, o)
    response = requests.get(url=url, headers=headers)
    text1 = response.text
    html = bv_pattern.findall(text1)
    list_.extend(html)
    page = page + 1
    o = o + 36
# Drop duplicate BV ids while keeping the order of first appearance.
# dict.fromkeys() keeps insertion order and does the membership test in O(1),
# instead of rescanning the result list with list.count() for every id.
ll = list(dict.fromkeys(list_))
# Download the danmaku (bullet comments) of the first 300 unique videos.
# all_danmu collects the parsed <d> elements of every video; the code that
# saves them reads each element's .string.
all_danmu = []
# Requires at least one digit, so an empty cid can never be turned into a URL.
cid_pattern = re.compile(r'"cid":(\d+),')
for j in ll[:300]:
    link = "https://www.bilibili.com/video/{}/".format(j)
    response = requests.get(url=link, headers=headers)
    response.encoding = 'utf-8'
    html1 = response.text
    # The cid embedded in the video page identifies its danmaku XML document.
    match = cid_pattern.search(html1)
    if match is None:
        # No cid in this page: skip the video instead of aborting the whole
        # crawl with an AttributeError on None.
        continue
    cid = match.group(1)
    link = "https://comment.bilibili.com/{}.xml".format(cid)
    res = requests.get(link)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'xml')
    all_barrage = soup2.find_all("d")
    # Keep this video's comments; without this step all_danmu stays empty and
    # nothing is ever written out.
    all_danmu.extend(all_barrage)
# Append every collected danmaku to the output file, one comment per line.
# Elements whose .string is None (no single text child) are skipped, because
# file.write(None) would raise a TypeError.
danmu_texts = [
    text
    for text in (danmu.string for danmu in all_danmu)
    if text is not None
]
# Open the file once for the whole batch instead of once per comment, and only
# when there is something to write so that no empty file is created.
if danmu_texts:
    with open('弹幕.txt', 'a', newline='', encoding='utf-8-sig') as file:
        file.writelines(text + "\n" for text in danmu_texts)
# Stop the profiler started at the top of the script and print its report.
t.stop()
t.print()