|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import re
|
|
|
|
|
import time
|
|
|
|
|
import jieba
|
|
|
|
|
import wordcloud
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from pandas import ExcelWriter
|
|
|
|
|
from collections import Counter
|
|
|
|
|
# HTTP request headers: present a desktop-browser User-Agent so the target
# site serves the normal page instead of rejecting the scraper.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
|
|
|
|
|
# Keyword whitelist used by sort() to decide whether a danmaku (bullet
# comment) is AI-related: a comment is kept if any of its jieba-cut tokens
# appears in this list.
keywords = [
    'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自动化',
    # BUG FIX: the original wrote `'数据科学' '自然语言处理'` (missing comma),
    # which Python implicitly concatenates into the single string
    # '数据科学自然语言处理' — so neither keyword could ever match a token.
    '算法', '数据科学', '自然语言处理', '计算机视觉', '人工智能技术', 'AI技术', 'AI应用', 'AI模型',
    '大数据', '预测分析', '机器视觉', '自动驾驶',
    '智能推荐', '计算机科学', '人工智能应用',
    '数据分析', '智能化', '情感计算', 'ai',
]
|
|
|
|
|
|
|
|
|
|
def chuli(etxt):
    """Render and display a word cloud built from an iterable of strings.

    Parameters
    ----------
    etxt : iterable of str
        Danmaku texts.  In this script it receives the Counter returned by
        sort(); iterating a Counter yields its keys, i.e. the distinct
        matching comment lines.

    Side effects: generates a wordcloud image and shows it in a matplotlib
    window.  Returns None.
    """
    # Removed commented-out dead code and a leftover debug `print(wc)`
    # (it only printed the WordCloud object's repr).
    # Drop single-character entries to filter out noise such as "?", "哈", "啊".
    words = [i for i in etxt if len(i) > 1]
    # simsun.ttc: a CJK-capable font is required, otherwise Chinese text
    # renders as empty boxes in the cloud.
    wc = wordcloud.WordCloud(background_color='white', height=1000,
                             width=1000, font_path='simsun.ttc')
    wc.generate(' '.join(words))  # build the image from space-joined tokens
    plt.imshow(wc)
    plt.show()
|
|
|
|
|
|
|
|
|
|
def sort(txt, keywords):
    """Count comment lines that contain at least one watched keyword.

    A line matches when any token produced by jieba.cut(line) is present
    in `keywords`.

    Parameters
    ----------
    txt : iterable of str
        Raw comment lines.
    keywords : iterable of str
        Keywords to match against the tokenized line.

    Returns
    -------
    collections.Counter
        Maps each matching (stripped) line to its occurrence count.
    """
    # Build the membership set once: O(1) lookups instead of the original
    # O(len(keywords)) list scan per token.
    keyword_set = set(keywords)
    comment_counter = Counter()
    for line in txt:
        # BUG FIX: the original called line.strip() and discarded the
        # result (str.strip returns a new string; it does not mutate).
        line = line.strip()
        if any(word in keyword_set for word in jieba.cut(line)):
            comment_counter[line] += 1
    return comment_counter
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # BUG(review): `altxt` is never defined in this file, so running this
    # script raises NameError.  It presumably should hold the list of
    # scraped danmaku strings — wire it up to the scraping step upstream.
    # Guarding under __main__ at least keeps the module importable.
    AIdanmu = sort(altxt, keywords)  # Counter of AI-related comment lines
    chuli(AIdanmu)                   # display them as a word cloud
|