You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

138 lines
5.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from email.mime import image
import re #使用正则表达式分割获取弹幕
from tkinter import Image
import requests
from bs4 import BeautifulSoup
import jieba #结巴分词pip install jieba
import wordcloud #制作词云图
import imageio
from wordcloud.wordcloud import np #读取本地图片,修改词云图形
def get_outer_urls(n):
    """Build the list of search-result page URLs to crawl.

    Parameters
    ----------
    n : int
        Number of result pages to cover (page 1 through page n).

    Returns
    -------
    list[str]
        One URL per page. Page 1 uses the bare search URL; pages 2..n
        append ``page`` and the 30-per-page offset ``o``.
    """
    base = 'https://search.bilibili.com/video?vt=77234042&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
    # First page has no paging parameters; later pages carry page index
    # and item offset (30 results per page, so offset = (page-1)*30).
    pages = [base]
    pages.extend(f'{base}&page={k}&o={(k - 1) * 30}' for k in range(2, n + 1))
    return pages
def get_inter_urls(ui, d_h, d_c):
    """Collect the video-page URLs listed on one search-result page.

    Parameters
    ----------
    ui : str
        URL of a search-result page (from ``get_outer_urls``).
    d_h : dict
        Request headers (User-Agent) so the request looks like a browser.
    d_c : dict
        Cookies for the session.

    Returns
    -------
    list[str]
        Absolute ``https:`` URLs of the videos found on the page.
    """
    ri = requests.get(ui, headers=d_h, cookies=d_c)
    soupi = BeautifulSoup(ri.text, 'lxml')
    # NOTE(review): assumes the result grid lives in <div class="video-list row">;
    # if the page layout changes, .find() returns None and this raises.
    lis = soupi.find('div', class_="video-list row").find_all('div')
    lst = []
    # Idiomatic for-loop replaces the original manual while/index loop;
    # `is None` replaces `== None`.
    for div in lis:
        anchor = div.a
        if anchor is None:
            # Many wrapper <div>s carry no link — skip them.
            continue
        # href is protocol-relative ("//www.bilibili.com/..."), so prefix the scheme.
        lst.append('https:' + anchor['href'])
    return lst
def get_data(ui, d_h, d_c):
    """Download the danmaku (bullet comments) of one video and append them
    to ``D:/弹幕.txt``, one comment per line.

    Parameters
    ----------
    ui : str
        Video page URL.
    d_h : dict
        Request headers (User-Agent).
    d_c : dict
        Cookies for the session.

    Returns
    -------
    int
        Number of danmaku lines written.

    Raises
    ------
    AttributeError
        If no ``"cid":<digits>,`` token is found in the video page HTML
        (``re.search`` returns None).
    """
    # Extract the video's cid from the page HTML to build the danmaku API URL.
    ri = requests.get(url=ui, headers=d_h, cookies=d_c)
    cid = re.search(r'"cid":(\d*),', ri.text).group(1)
    cid_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    # BUG FIX: this request previously used the module-level globals
    # `dic_headers` / `dic_cookies`, silently ignoring the d_h / d_c
    # parameters the caller passed in.
    r2 = requests.get(cid_url, headers=d_h, cookies=d_c)
    r2.encoding = r2.apparent_encoding
    # Each danmaku is wrapped in <d p="...">text</d> in the XML response.
    dmlst = re.findall('<d p=".*?">(.*?)</d>', r2.text)
    # Open the output file once and append all lines, instead of
    # reopening it for every single danmaku as before.
    with open('D:/弹幕.txt', 'a', encoding='UTF-8') as f:
        for dm in dmlst:
            f.write(dm)
            f.write('\n')
    return len(dmlst)
if __name__ == '__main__':
    ### Headers and cookies so the requests look like a normal browser session.
    dic_headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}
    cookies = "buvid4=DCB17204-EBFC-9664-4E97-3795181B43A418148-022080117-DNbKVr13tEGcPKkOr4lL%2Fg%3D%3D; buvid3=CDE73727-EF10-ED0B-4A16-BA5D759B014208137infoc; b_nut=1698373908; _uuid=16CA10BCD-10538-D8310-8ABA-85D99510C966509403infoc; rpdid=|(u))kkYu|~Y0J'u~uJ|mYlYu; fingerprint=160ca50e14386abcf76b36d0fdd4b02d; buvid_fp_plain=undefined; buvid_fp=160ca50e14386abcf76b36d0fdd4b02d; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1455-755; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzc4NjQsImlhdCI6MTcyNjQ3ODYwNCwicGx0IjotMX0.VqGzlkNCgqurTbc0ruEj0IDl8eqJBRRhBq86ARF3Kfk; bili_ticket_expires=1726737804; bp_t_offset_1323959786=977791090671222784; b_lsid=6D45B513_191FE02FA1B; bsource=search_bing; SESSDATA=009ba743%2C1742100653%2C4a273%2A92CjAuo1OGrcH-PW3p0fHH1AKVrtbV3FsE-4oxnyl2hYcGWvzUpi1fSUrIGfYRLBOiSS8SVlhVZmxWSy1QS2ZuWGRpbDRWNjdHZmNKVVpNYjcwOEJ1UjI1R0JJNnZVTUZTczNaUmlPbk1jMlZKQnBFMEJzVmotbV9RcXdfVEVhVFloMzNBd01nYS1RIIEC; bili_jct=4314c0fd203c95101efcce860ce01817; DedeUserID=1323959786; DedeUserID__ckMd5=7e17b1885b370fe5; sid=omn05a3k"
    # BUG FIX: split each "key=value" pair on the FIRST '=' only, so cookie
    # values that themselves contain '=' (e.g. base64 padding) stay intact.
    dic_cookies = {}
    for item in cookies.split("; "):
        key, _, value = item.partition("=")
        dic_cookies[key] = value
    urllst = get_outer_urls(10)         # URLs of the first 10 result pages
    u1 = urllst[0]                      # crawl only page 1 (up to 30 videos)
    url_inter = get_inter_urls(u1, dic_headers, dic_cookies)
    ### Total number of danmaku collected across all videos.
    count = 0
    # NOTE(review): the original kept an `m` counter meant to "avoid
    # exceptions" by skipping some videos, but tracing it shows the skip
    # condition (m == 5) was never reached, so nothing was ever skipped.
    # A try/except per video achieves the stated intent directly.
    for u in url_inter:
        try:
            count += get_data(u, dic_headers, dic_cookies)
        except Exception as e:
            # Best-effort: one bad video page must not abort the whole run.
            print(f'skip {u}: {e}')
    print(f'数据采集并存入成功,总共采集{count}条数据')
    ### Build the word cloud shaped by a local mask image.
    img = imageio.imread('D:/2.png')
    with open('D:/弹幕.txt', 'r', encoding='UTF-8') as f1:
        text = f1.read()
    text_list = jieba.lcut(text)        # jieba word segmentation for Chinese text
    text_str = ' '.join(text_list)      # WordCloud expects space-separated tokens
    wc = wordcloud.WordCloud(
        ### word-cloud rendering parameters
        width=1000,
        height=800,
        background_color='white',
        mask=img,                       # shape the cloud like D:/2.png
        font_path=r'C:\Windows\Fonts\FZSTK.TTF',  # a CJK-capable font is required
        # NOTE(review): the original listed twelve empty strings here, which
        # collapse to this single empty entry — fill in real stop words.
        stopwords={''},
    )
    wc.generate(text_str)
    wc.to_file('D:/词云3.png')