You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
1.9 KiB

import numpy as np
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import plotly.express as px
# 读取AI技术相关弹幕的CSV文件
def read_data(file_path):
    """Load the danmaku column from a CSV file as strings.

    Reads the CSV at ``file_path`` with pandas, selects the
    ``ai_related_danmakus`` column, discards missing entries and converts
    the remaining values to ``str``.

    Args:
        file_path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        A pandas Series of strings with missing rows removed.

    Raises:
        KeyError: If the CSV has no ``ai_related_danmakus`` column.
    """
    frame = pd.read_csv(file_path)
    danmaku_column = frame['ai_related_danmakus']
    present_only = danmaku_column.dropna()
    return present_only.astype(str)
# 筛选与统计关键词
def filter_keywords(text_list, stopwords):
    """Drop stopwords from whitespace-tokenised texts and join what is left.

    Each text is split on whitespace with ``str.split()``; every token that
    is not in ``stopwords`` is kept, in its original order, and all kept
    tokens are joined into one space-separated string.

    Args:
        text_list: Iterable of strings to tokenise.
        stopwords: Collection of tokens to discard. A ``set`` or
            ``frozenset`` is used as is; any other iterable of strings is
            converted to a ``frozenset`` once so each lookup is O(1).

    Returns:
        A single string containing the kept tokens separated by one space
        (an empty string when nothing is kept).
    """
    # Build the lookup set once, outside the loop, so membership tests do
    # not degrade to a linear scan when a list or tuple is passed in.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = frozenset(stopwords)

    kept_words = []
    for text in text_list:
        kept_words.extend(
            word for word in text.split() if word not in stopword_set
        )
    return ' '.join(kept_words)
# 生成词云并统计频率
def generate_wordcloud(text, stopwords, colormap='viridis', *,
                       font_path='/usr/share/fonts/truetype/simhei.ttf',
                       width=800, height=400):
    """Build a word cloud from ``text``.

    Args:
        text: Text handed to ``WordCloud.generate``.
        stopwords: Words passed to ``WordCloud`` as its ``stopwords``
            argument.
        colormap: Matplotlib colormap name used to colour the words.
        font_path: Font file used for rendering. Defaults to the SimHei
            path the script originally hard-coded; pass another path on
            machines where that file does not exist.
        width: Canvas width in pixels.
        height: Canvas height in pixels.

    Returns:
        The generated ``WordCloud`` object.
    """
    cloud_builder = WordCloud(
        width=width,
        height=height,
        background_color='white',
        stopwords=stopwords,
        colormap=colormap,
        font_path=font_path,
    )
    return cloud_builder.generate(text)
# 显示词云和频率统计
def show_wordcloud(wordcloud, freq_dist, top_n=8):
    """Display the word cloud and a bar chart of the most frequent words.

    Args:
        wordcloud: Image-like object passed to ``plt.imshow`` (the result
            of ``generate_wordcloud``).
        freq_dist: Object exposing ``most_common(n)`` that yields
            ``(word, count)`` pairs, such as a ``collections.Counter``.
        top_n: Number of most frequent words shown in the bar chart.
            Defaults to 8, the value that was previously hard-coded.

    Returns:
        None. Both figures are shown as a side effect.
    """
    # Draw the word cloud without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # Draw the keyword-frequency bar chart with Plotly.
    freq_df = pd.DataFrame(
        freq_dist.most_common(top_n), columns=['Word', 'Frequency']
    )
    fig = px.bar(
        freq_df,
        x='Word',
        y='Frequency',
        title=f'Top {top_n} ai_related_danmakus',
    )
    fig.show()
def cloud(file_path):
    """Run the whole pipeline: load danmakus, filter, count and display.

    Reads the danmaku column from the CSV at ``file_path``, removes the
    stopwords, counts the remaining whitespace-separated tokens, builds a
    word cloud from the filtered text and shows it together with the
    frequency chart.

    Args:
        file_path: Path of the CSV file passed on to ``read_data``.

    Returns:
        Always ``0``.
    """
    # Custom stopwords; extend this collection as needed.
    # NOTE(review): the repeated '' entries collapse to a single element in
    # the set — confirm whether other stopwords were intended here.
    stopwords = set(['', '', '', '', '','5', ''])

    # Load the danmakus and strip the stopwords out of them.
    kept_text = filter_keywords(read_data(file_path), stopwords)

    # Count how often each remaining token occurs.
    frequencies = Counter(kept_text.split())

    # Build the word cloud, then show it alongside the frequency chart.
    show_wordcloud(generate_wordcloud(kept_text, stopwords), frequencies)
    return 0