
import pandas as pd
import jieba  # Chinese word segmentation
import wordcloud  # word-cloud module
import matplotlib.pyplot as plt
import imageio  # read a local image to reshape the word cloud (see mask sketch below)
import time
def break_down_words(file_path, stopwords=frozenset(), output_txt='valid_words.txt', save_output=False):
    """
    Extract valid words from an Excel file of danmaku (bullet comments)
    and optionally save them to a TXT file. Returns the set of valid words.
    """
    # 1. Read the danmaku data from the Excel file
    df = pd.read_excel(file_path)
    # 2. Join all sentences into a single string
    #    (the column name means "danmaku containing the keyword")
    text = ' '.join(df['包含关键词的弹幕'].astype(str))
    # 3. Segment the text into individual words
    text_list = jieba.lcut(text)
    # 4. Drop stopwords and keep only words at least 2 characters long
    filtered_words = set(word for word in text_list if word not in stopwords and len(word) >= 2)
    # 5. Optionally save the valid words to a TXT file
    if save_output:
        with open(output_txt, 'w', encoding='utf-8') as f:
            for word in filtered_words:
                f.write(f"{word}\n")
        print(f"Valid words saved to {output_txt}")
    return filtered_words  # set of valid words for later use
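
# Note on step 3 above: jieba.lcut returns a plain list of tokens, e.g.
# jieba.lcut('机器学习很有趣') might yield ['机器学习', '很', '有趣']
# (illustrative output only; actual segmentation depends on jieba's dictionary).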

# Load stopwords from a TXT file
def load_stopwords_from_txt(file_path):
    """
    Load stopwords from a TXT file, one word per line.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip whitespace/newlines and keep only non-empty lines
        stopwords = {line.strip() for line in f if line.strip()}
    return stopwords
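
# A minimal sketch of the stop.txt layout this loader expects: one stopword per
# line, blank lines ignored. The entries below are illustrative, not the
# project's actual list:
#
#     的
#     了
#     就是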

# Build a weighted text string from keyword occurrence counts
def generate_weighted_text_from_counts(file_path, stopwords):
    """
    Arg 1: file_path: path to the Excel file of keyword counts
    Arg 2: stopwords: collection of stopwords
    Returns: the generated weighted text
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]  # total number of rows
    weighted_words = []
    for index, row in df.iterrows():
        word = row['关键词']  # column name means "keyword"
        # Rank-based weight: earlier rows get larger counts
        # (assumes a default 0..n-1 integer index)
        count = total_count - index
        if count > 2:  # halve larger weights to dampen the spread
            count //= 2
        if word not in stopwords and len(word) >= 2:  # skip stopwords and single characters
            weighted_words.extend([word] * count)  # repeat each word by its weight
    return ' '.join(weighted_words)
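
# An alternative sketch: instead of repeating each word `count` times and letting
# WordCloud re-tokenize the combined string, the same rank-based weighting can be
# expressed as a {word: weight} dict and passed to generate_from_frequencies().
# The helper name build_frequency_dict is an assumption; it mirrors the logic of
# generate_weighted_text_from_counts above.
def build_frequency_dict(file_path, stopwords):
    df = pd.read_excel(file_path)
    total_count = df.shape[0]
    freqs = {}
    for index, row in df.iterrows():
        word = row['关键词']
        if word not in stopwords and len(word) >= 2:
            freqs[word] = total_count - index  # rank-based weight, as above
    return freqs
# Usage sketch: wc.generate_from_frequencies(build_frequency_dict('keyword_counts.xlsx', stopwords))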

# Generate and display a word cloud from the valid words and the weighted text
def generate_wordcloud(filtered_words, weighted_text, output_path='词云.png'):
    """
    Arg 1: filtered_words: set of valid words
    Arg 2: weighted_text: weighted text string
    """
    # Join the valid-word set into a space-separated string
    filtered_text_str = ' '.join(filtered_words)
    # Combine the weighted text with the valid-word string
    combined_text = weighted_text + ' ' + filtered_text_str
    # Configure the word cloud
    wc = wordcloud.WordCloud(
        width=600,
        height=300,
        background_color='white',
        font_path='msyh.ttc',  # font file path (a Chinese-capable font is required)
        max_font_size=200,
        min_font_size=10,
        colormap='cool',
        scale=4
    )
    # Generate the word cloud
    wc.generate(combined_text)
    # Save the word cloud image to a file
    wc.to_file(output_path)
    # Display the word cloud
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()
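
# A minimal sketch of the shaped word cloud the imageio import is reserved for.
# The mask filename 'mask.png' is an assumption; WordCloud places words only in
# the non-white regions of the mask image.
def generate_masked_wordcloud(text, mask_path='mask.png', output_path='词云_shaped.png'):
    mask = imageio.imread(mask_path)  # load the mask image as a pixel array
    wc = wordcloud.WordCloud(
        background_color='white',
        font_path='msyh.ttc',
        mask=mask,  # word placement follows the mask's shape
        scale=4
    )
    wc.generate(text)
    wc.to_file(output_path)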

def main_word_cloud():
    stopwords = load_stopwords_from_txt('stop.txt')
    # Process the AI-related sentences to produce the valid-word set
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    print('Word list generated')
    # Read keyword counts from keyword_counts.xlsx and build the weighted text
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    print('Weighted text generated')
    # Generate the word cloud
    generate_wordcloud(filtered_words, weighted_text)

if __name__ == '__main__':
    main_word_cloud()

# Timed variant: report how long each step takes
def main_word_cloud2():
    start_time = time.time()
    stopwords = load_stopwords_from_txt('stop.txt')
    end_time = time.time()
    print(f"load_stopwords_from_txt took {end_time - start_time:.4f} s")
    # Process the AI-related sentences to produce the valid-word set
    start_time = time.time()
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    end_time = time.time()
    print(f"break_down_words took {end_time - start_time:.4f} s")
    # Read keyword counts from keyword_counts.xlsx and build the weighted text
    start_time = time.time()
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    end_time = time.time()
    print(f"generate_weighted_text_from_counts took {end_time - start_time:.4f} s")
    # Generate the word cloud
    start_time = time.time()
    generate_wordcloud(filtered_words, weighted_text)
    end_time = time.time()
    print(f"generate_wordcloud took {end_time - start_time:.4f} s")