"""Scrape the danmaku (bullet comments) of one bilibili video and draw a word cloud.

Pipeline:

1. Download the ibilibili.com page for the video and read the href of the
   third link inside the ``btn-group`` div (used here as the danmaku URL).
2. Download that URL and extract the text of every comment.
3. Append the comments, one per line, to ``弹幕.txt``.
4. Segment the accumulated text with jieba and render ``弹幕词云.jpg``.

Recovered from git format-patch 4006830f05eebe67aa4fd07e48ca82ebd3a2f7fa
(2023-04-25), which added this script as ``爬取弹幕并做词云可视化处理.py``.
"""

import html
import re
import urllib.request

import jieba
import requests
from lxml import etree
from wordcloud import WordCloud

PAGE_URL = 'https://www.ibilibili.com/video/BV1o24y1F7wV/?spm_id_from=333.999.0.0&vd_source=59fecc30e7f4791084968599ca1f8b82'

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# XPath of the link whose href is fetched as the danmaku document.
DANMAKU_LINK_XPATH = '//div[@class="btn-group"]/a[3]/@href'

# NOTE(review): the recovered source had the pattern '(.*?)', which can only
# ever match empty strings, so every written line would be blank. The element
# markup looks like it was stripped from the literal. This assumes the danmaku
# document wraps each comment as <d p="...">text</d> -- confirm against a live
# response before relying on it.
DANMAKU_PATTERN = re.compile(r'<d\b[^>]*>(.*?)</d>')

DANMAKU_FILE = '弹幕.txt'
WORDCLOUD_FILE = '弹幕词云.jpg'
FONT_PATH = 'simkai.ttf'

# Seconds to wait on each network call, so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 10


def find_danmaku_url(page_url):
    """Return the danmaku URL advertised on the video page at *page_url*.

    Raises:
        RuntimeError: if the page contains no link matching DANMAKU_LINK_XPATH.
    """
    request = urllib.request.Request(url=page_url, headers=HEADERS)
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        page = response.read().decode('utf-8')

    tree = etree.HTML(page)
    # etree.HTML returns None for an empty document; treat that as "no link".
    hrefs = tree.xpath(DANMAKU_LINK_XPATH) if tree is not None else []
    if not hrefs:
        raise RuntimeError(
            f'no danmaku link found on {page_url} (XPath {DANMAKU_LINK_XPATH})'
        )
    return hrefs[0]


def fetch_danmaku(danmaku_url):
    """Download *danmaku_url* and return the comment texts as a list of str.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(
        url=danmaku_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    response.encoding = 'utf-8'
    # The text is taken from markup by regex, so character references such as
    # &amp; are still encoded; decode them to keep them out of the word cloud.
    return [html.unescape(text) for text in DANMAKU_PATTERN.findall(response.text)]


def append_lines(path, lines):
    """Append every item of *lines* to the UTF-8 text file *path*, one per line.

    Append mode is kept from the original script, so repeated runs accumulate
    comments in the same file.
    """
    with open(path, mode='a', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in lines)


def render_word_cloud(text_path, image_path):
    """Segment the text in *text_path* with jieba and save a word cloud image."""
    with open(text_path, mode='r', encoding='UTF-8') as f:
        text = f.read()

    # WordCloud splits on whitespace, so join the jieba tokens with spaces.
    segmented = ' '.join(jieba.lcut(text))
    cloud = WordCloud(
        font_path=FONT_PATH,
        background_color='white',
        height=400,
        width=600,
        max_font_size=100,
        max_words=200,
    ).generate(segmented)
    cloud.to_file(image_path)


def main():
    """Run the scrape -> save -> word-cloud pipeline for PAGE_URL."""
    danmaku_url = find_danmaku_url(PAGE_URL)
    append_lines(DANMAKU_FILE, fetch_danmaku(danmaku_url))
    render_word_cloud(DANMAKU_FILE, WORDCLOUD_FILE)


if __name__ == '__main__':
    main()