"""Scrape the danmaku (bullet comments) of one video and draw a word cloud.

Pipeline:
1. Fetch the video page and read the danmaku download link from it.
2. Download the danmaku document and pull out the comment texts.
3. Append the comments, one per line, to ``弹幕.txt``.
4. Segment the saved text with jieba and render ``弹幕词云.jpg``.

Third-party packages (lxml, requests, jieba, wordcloud) are imported inside
the functions that use them, so the pure text helper ``extract_danmaku``
stays importable when the scraping/rendering stack is not installed.
"""
import re
import urllib.request

VIDEO_URL = (
    'https://www.ibilibili.com/video/BV1o24y1F7wV/'
    '?spm_id_from=333.999.0.0&vd_source=59fecc30e7f4791084968599ca1f8b82'
)

HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    ),
}

# Third link of the page's button group is taken as the danmaku download link.
DANMAKU_LINK_XPATH = '//div[@class="btn-group"]/a[3]/@href'

# NOTE(review): assumes the danmaku endpoint returns XML in which every
# comment is a `<d p="...">text</d>` element -- confirm against a live
# response. The previous pattern '(.*?)' had no delimiters and therefore
# only ever matched empty strings.
DANMAKU_PATTERN = re.compile(r'<d p="[^"]*">(.*?)</d>')

DANMAKU_FILE = '弹幕.txt'
WORDCLOUD_FILE = '弹幕词云.jpg'
FONT_PATH = 'simkai.ttf'
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging forever


def fetch_danmaku_url(page_url=VIDEO_URL, headers=HEADERS,
                      timeout=REQUEST_TIMEOUT):
    """Return the danmaku download URL found on the video page.

    Args:
        page_url: Address of the video page to fetch.
        headers: HTTP headers sent with the request.
        timeout: Socket timeout in seconds.

    Raises:
        RuntimeError: If the page contains no link matching
            ``DANMAKU_LINK_XPATH``.
    """
    from lxml import etree

    request = urllib.request.Request(url=page_url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        html = response.read().decode('utf-8')

    hrefs = etree.HTML(html).xpath(DANMAKU_LINK_XPATH)
    if not hrefs:
        raise RuntimeError(
            'No danmaku link found on the page; its layout may have changed.'
        )
    return hrefs[0]


def extract_danmaku(xml_text):
    """Return the non-blank comment texts found in a danmaku document.

    Args:
        xml_text: Text of the downloaded danmaku document.

    Returns:
        A list of comment strings in document order; empty when nothing
        matches ``DANMAKU_PATTERN``.
    """
    return [text for text in DANMAKU_PATTERN.findall(xml_text) if text.strip()]


def download_danmaku(danmaku_url, headers=HEADERS, timeout=REQUEST_TIMEOUT):
    """Download the danmaku document and return its comment texts.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import requests

    response = requests.get(url=danmaku_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 rather than relying on requests' guess from the headers.
    response.encoding = 'utf-8'
    return extract_danmaku(response.text)


def append_danmaku(comments, path=DANMAKU_FILE):
    """Append each comment on its own line to *path*.

    The file is opened once in append mode, so repeated runs keep
    accumulating comments in the same file, as the script always did.
    """
    with open(path, mode='a', encoding='utf-8') as f:
        f.writelines(f'{comment}\n' for comment in comments)


def render_wordcloud(text_path=DANMAKU_FILE, image_path=WORDCLOUD_FILE):
    """Segment the text in *text_path* and save a word cloud to *image_path*."""
    import jieba
    from wordcloud import WordCloud

    with open(text_path, mode='r', encoding='UTF-8') as f:
        text = f.read()

    # WordCloud splits on whitespace, so the jieba tokens are space-joined.
    segmented = ' '.join(jieba.lcut(text))
    cloud = WordCloud(
        font_path=FONT_PATH,
        background_color='white',
        height=400,
        width=600,
        max_font_size=100,
        max_words=200,
    ).generate(segmented)
    cloud.to_file(image_path)


def main():
    """Run the whole scrape -> save -> word cloud pipeline."""
    comments = download_danmaku(fetch_danmaku_url())
    if not comments:
        # Stop early with a readable message instead of failing later
        # inside the word cloud rendering.
        raise SystemExit('No danmaku were extracted; nothing to visualise.')
    append_danmaku(comments)
    render_wordcloud()


if __name__ == '__main__':
    main()