|
|
import pandas as pd
|
|
|
import jieba # 结巴分词
|
|
|
import wordcloud # 词云图模块
|
|
|
import matplotlib.pyplot as plt
|
|
|
import imageio # 读取本地图片,修改词云图形
|
|
|
import time
|
|
|
|
|
|
def break_down_words(file_path, stopwords='', output_txt='stop.txt', output_turly=False):
    """Extract the set of effective words from the danmaku column of an Excel file.

    The column ``包含关键词的弹幕`` is read, its non-empty cells are joined into
    one string and segmented with jieba. A token is kept when it is at least
    two characters long, is not whitespace-only and is not a stopword.

    Args:
        file_path: Path of the Excel workbook to read.
        stopwords: Words to exclude. Any iterable of strings is accepted. A
            plain string is treated as a whitespace-separated list of words,
            so the default ``''`` means "no stopwords".
        output_txt: Path of the UTF-8 text file the words are written to (one
            per line) when ``output_turly`` is true. NOTE: the default is the
            same path ``main_word_could`` loads its stopwords from, so
            enabling output without overriding it overwrites that file.
        output_turly: Whether to write the words to ``output_txt``.

    Returns:
        set[str]: The distinct effective words.
    """
    # 1. Read the danmaku sentences from the Excel file.
    df = pd.read_excel(file_path)

    # 2. Merge all sentences into one string. Empty cells are dropped first,
    #    because astype(str) would turn NaN into the literal word "nan".
    sentences = df['包含关键词的弹幕'].dropna().astype(str)
    text = ' '.join(sentences)

    # 3. Segment the text into tokens.
    tokens = jieba.lcut(text)

    # 4. Normalise the stopwords to a set: `in` on a str is a substring test,
    #    and a set gives constant-time lookups for list inputs.
    if isinstance(stopwords, str):
        stopword_set = set(stopwords.split())
    else:
        stopword_set = set(stopwords)

    # Keep tokens of length >= 2; a whitespace-only token such as "\r\n"
    # would pass the length check, so it is excluded explicitly.
    filtered_words = {
        word
        for word in tokens
        if len(word) >= 2 and not word.isspace() and word not in stopword_set
    }

    # 5. Optionally save the words, sorted so the file is reproducible
    #    (set iteration order is not).
    if output_turly:
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.writelines(f"{word}\n" for word in sorted(filtered_words))
        print(f"有效词已保存到 {output_txt}")

    return filtered_words  # returned for the later word-cloud steps
|
|
|
|
|
|
# 从TXT文件中加载停用词
|
|
|
def load_stopwords_from_txt(file_path):
    """Load stopwords from a UTF-8 text file that holds one word per line.

    Surrounding whitespace is stripped from every line and blank lines are
    ignored. A byte-order mark at the start of the file is ignored as well.

    Args:
        file_path: Path of the stopword text file.

    Returns:
        set[str]: The distinct stopwords (empty for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 too, but also drops a leading BOM. With
    # 'utf-8' the BOM would stay glued to the first word, because str.strip()
    # does not treat U+FEFF as whitespace, and that stopword would never match.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return {word for word in stripped_lines if word}
|
|
|
|
|
|
# 根据有效词的出现次数生成字符串
|
|
|
def generate_weighted_text_from_counts(file_path, stopwords):
    """Build a space-separated text in which earlier keywords repeat more often.

    The ``关键词`` column of the workbook is read row by row. The number of
    repeats depends only on the row position, not on any count column: with
    ``n`` rows, the row at position ``i`` (0-based) gets ``n - i`` repeats,
    halved (rounded down) whenever that value is greater than 2.
    NOTE(review): this presumably expects the rows to be sorted by descending
    frequency — confirm against whatever produces the workbook.

    Args:
        file_path: Path of the Excel workbook containing a ``关键词`` column.
        stopwords: Collection of words to leave out (tested with ``in``).

    Returns:
        str: The repeated keywords joined by single spaces; empty when no
        keyword qualifies.
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]  # number of rows
    weighted_words = []

    # enumerate() gives the row position even if the index is not 0..n-1.
    for position, raw_word in enumerate(df['关键词']):
        # Empty cells come back as NaN, which has no len() and is not a word.
        if pd.isna(raw_word):
            continue
        # Numeric cells are not str either; convert instead of crashing.
        word = str(raw_word).strip()
        if len(word) < 2 or word in stopwords:
            continue

        count = total_count - position
        if count > 2:
            count //= 2
        weighted_words.extend([word] * count)  # repeat the word by its weight

    return ' '.join(weighted_words)
|
|
|
|
|
|
# 根据有效词集合和词频生成并显示词云图
|
|
|
def generate_wordcloud(filtered_words, weighted_text, output_path='词云.png',
                       font_path='msyh.ttc', show=True):
    """Render a word cloud from the weighted text plus the effective words.

    The effective words are appended once each to ``weighted_text``. The
    combined text is rendered, saved to ``output_path`` and, if requested,
    displayed in a matplotlib window.

    Args:
        filtered_words: Iterable of effective words.
        weighted_text: Space-separated text in which words are repeated
            according to their weight.
        output_path: Image file the word cloud is written to.
        font_path: Font file passed to ``WordCloud``. The default keeps the
            previously hard-coded ``'msyh.ttc'``; pass another font file
            where that one is not available.
        show: Whether to display the image with matplotlib after saving.
            ``plt.show()`` is called in that case, so pass ``False`` for
            batch or headless runs.

    Returns:
        None
    """
    # Combine the weighted text with the space-separated effective words.
    combined_text = weighted_text + ' ' + ' '.join(filtered_words)

    # NOTE(review): WordCloud's collocations option is left at its default;
    # confirm whether bigram counting is wanted for text built by repetition.
    wc = wordcloud.WordCloud(
        width=600,
        height=300,
        background_color='white',
        font_path=font_path,
        max_font_size=200,
        min_font_size=10,
        colormap='cool',
        scale=4,
    )
    wc.generate(combined_text)

    # Save first so the file exists even if the display step is skipped.
    wc.to_file(output_path)

    if show:
        plt.figure(figsize=(10, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')  # hide the axes
        plt.show()
|
|
|
|
|
|
def main_word_could():
    """Run the pipeline: stopwords, effective words, weighted text, word cloud.

    Reads ``stop.txt``, ``ai_sentences.xlsx`` and ``keyword_counts.xlsx`` from
    the working directory and prints a progress message after the two
    text-preparation steps.
    """
    stop_set = load_stopwords_from_txt('stop.txt')

    # Effective words segmented from the AI-related sentences.
    word_set = break_down_words('ai_sentences.xlsx', stop_set)
    print('词语已生成')

    # Weighted text derived from the rows of keyword_counts.xlsx.
    repeated_text = generate_weighted_text_from_counts(
        'keyword_counts.xlsx', stop_set
    )
    print('加权文本已生成')

    # Render, save and display the word cloud.
    generate_wordcloud(word_set, repeated_text)
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_word_could()
|
|
|
|
|
|
|
|
|
# 获取相关函数运行时间
|
|
|
def main_word_could2():
    """Run the same pipeline as ``main_word_could`` and print each step's duration.

    Every step is timed separately and reported as
    ``<function name>函数耗时: <seconds> 秒`` with four decimal places.
    """

    def _timed(func, *args):
        """Call ``func(*args)``, print its elapsed time and return its result."""
        # perf_counter() is the clock intended for measuring intervals;
        # time.time() is wall-clock time and can jump if the clock is adjusted.
        started = time.perf_counter()
        result = func(*args)
        elapsed = time.perf_counter() - started
        print(f"{func.__name__}函数耗时: {elapsed:.4f} 秒")
        return result

    stopwords = _timed(load_stopwords_from_txt, 'stop.txt')

    # Effective words segmented from the AI-related sentences.
    filtered_words = _timed(break_down_words, 'ai_sentences.xlsx', stopwords)

    # Weighted text derived from keyword_counts.xlsx.
    weighted_text = _timed(
        generate_weighted_text_from_counts, 'keyword_counts.xlsx', stopwords
    )

    # generate_wordcloud also displays the image, so this figure includes
    # however long the display call takes to return.
    _timed(generate_wordcloud, filtered_words, weighted_text)
|