You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
61 lines
1.9 KiB
61 lines
1.9 KiB
import numpy as np
|
|
import pandas as pd
|
|
from wordcloud import WordCloud
|
|
import matplotlib.pyplot as plt
|
|
from collections import Counter
|
|
import plotly.express as px
|
|
|
|
# Read the CSV file containing the AI-technology-related danmaku
def read_data(file_path):
    """Load the danmaku text column from a CSV file.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must contain an
            ``ai_related_danmakus`` column.

    Returns:
        A pandas Series with the non-missing entries of the
        ``ai_related_danmakus`` column, each converted to ``str``.
    """
    frame = pd.read_csv(file_path)
    danmaku_column = frame['ai_related_danmakus']
    # Discard missing cells first so they never become the literal text "nan".
    present_only = danmaku_column.dropna()
    return present_only.astype(str)
|
|
|
|
# Filter stopwords out of the danmaku texts
def filter_keywords(text_list, stopwords):
    """Remove stopwords from whitespace-separated texts and re-join the rest.

    Args:
        text_list: Iterable of strings. Each one is split with
            ``str.split()``, i.e. on runs of whitespace only, so a text
            that contains no whitespace is treated as a single token.
        stopwords: Container of tokens to discard (membership is tested
            with ``in``).

    Returns:
        A single string made of every kept token, in input order,
        separated by single spaces. Empty when nothing is kept.
    """
    kept_tokens = (
        token
        for sentence in text_list
        for token in sentence.split()
        if token not in stopwords
    )
    return ' '.join(kept_tokens)
|
|
|
|
# Generate the word cloud
def generate_wordcloud(text, stopwords, colormap='viridis',
                       font_path='/usr/share/fonts/truetype/simhei.ttf'):
    """Build a word cloud from already-prepared text.

    Args:
        text: Source text handed to ``WordCloud.generate``.
        stopwords: Words the ``WordCloud`` instance should ignore.
        colormap: Name of the colormap used to colour the words.
        font_path: Path of the font file used for rendering. The default
            keeps the location that was previously hard-coded (a SimHei
            font file, presumably chosen so Chinese glyphs render; confirm
            the file exists on the target machine). Pass another path on
            systems where that file is absent.

    Returns:
        The ``WordCloud`` object returned by ``WordCloud.generate``, on a
        800x400 white canvas.

    Raises:
        Whatever ``WordCloud`` raises for a missing font file or for text
        with no usable words; nothing is caught here.
    """
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=stopwords,
        colormap=colormap,
        font_path=font_path,
    )
    return cloud_builder.generate(text)
|
|
|
|
# Display the word cloud and the keyword-frequency chart
def show_wordcloud(wordcloud, freq_dist, top_n=8):
    """Show the word cloud image, then a bar chart of the most frequent words.

    Args:
        wordcloud: Image-like object accepted by ``plt.imshow`` (the caller
            in this file passes the result of ``generate_wordcloud``).
        freq_dist: Object exposing ``most_common(n)`` that yields
            ``(word, count)`` pairs, e.g. a ``collections.Counter``.
        top_n: How many of the most frequent words to chart. Defaults to 8,
            the value that was previously fixed in the code.

    Returns:
        None. The function only displays two figures.
    """
    # Draw the word cloud without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # Plot the keyword frequencies as a Plotly bar chart.
    top_words = freq_dist.most_common(top_n)
    freq_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
    # The title is derived from top_n so it cannot drift from the data shown.
    fig = px.bar(freq_df, x='Word', y='Frequency',
                 title=f'Top {top_n} ai_related_danmakus')
    fig.show()
|
|
|
|
def cloud(file_path):
    """Run the whole pipeline: load danmaku, filter, count, and display.

    Args:
        file_path: CSV file location, forwarded to ``read_data``.

    Returns:
        Always ``0``.
    """
    # Custom stopwords; more can be added to this set as needed.
    stopwords = {'的', '是', '在', '我', '你', '5', '哈'}

    # Read the danmaku texts and strip the stopwords from them.
    filtered_text = filter_keywords(read_data(file_path), stopwords)

    # Count how often each remaining keyword occurs.
    freq_dist = Counter(filtered_text.split())

    # Build the word cloud, then show it together with the frequency chart.
    show_wordcloud(generate_wordcloud(filtered_text, stopwords), freq_dist)

    return 0