|
|
|
@ -0,0 +1,124 @@
|
|
|
|
|
# 程序标题:小组作业1
|
|
|
|
|
# 开发时间:2022/3/23 22:37
|
|
|
|
|
# 程序说明:统计词语数,并生成词云
|
|
|
|
|
"""
|
|
|
|
|
程序要点说明:
|
|
|
|
|
"""
|
|
|
|
|
import jieba
|
|
|
|
|
import os
|
|
|
|
|
import wordcloud
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
* @name : Count_Word
|
|
|
|
|
* @brief : 函数实现对文本内容词语个数的统计,并且对词语个数进行降序排序
|
|
|
|
|
* @param[in] : 函数的输入参数为 文件名 file_name
|
|
|
|
|
* @param[out] : 函数的输出名为 words_out 是一个列表对象,格式为[('人生', 3), ('没有', 2), ('白走', 1)]
|
|
|
|
|
* @return : 函数返回词语的排序列表对象,列表中的元素是元组类型的
|
|
|
|
|
* @others : 无
|
|
|
|
|
'''
|
|
|
|
|
def Count_Word(file_name):
    """Count how often each word occurs in a UTF-8 text file.

    The file is read in full, punctuation and line-break characters are
    replaced by spaces, the text is segmented with ``jieba.lcut`` and the
    resulting tokens are tallied.

    Args:
        file_name: Path of the text file to analyse; it is opened as UTF-8.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order, e.g. ``[('人生', 3), ('没有', 2), ('白走', 1)]``.  Words with
        the same count keep the order in which they first appeared.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    from collections import Counter

    # Characters that must not end up inside (or as) a word.
    symbol = '`~!@#$%^&*()_+-={[}]:;"\'<,>.?/《,》。?、:;“’{【}】·@¥……()\n\r\t'
    # The set above only had one half of each curly-quote pair and no
    # full-width exclamation mark, so those leaked into the statistics.
    symbol += '”‘!'
    # One translation pass instead of one str.replace() call per character.
    blank_out = str.maketrans(dict.fromkeys(symbol, " "))

    with open(file_name, "r", encoding="utf-8") as fp:
        txt_str = fp.read().translate(blank_out)

    # Segment the cleaned text into words.
    words_list = jieba.lcut(txt_str)

    # Skip every whitespace-only token, not just the single ASCII space:
    # other blanks (e.g. the full-width space U+3000) are not words either.
    words_count = Counter(word for word in words_list if word.strip())

    # most_common() sorts by count, descending, and is stable for ties.
    return words_count.most_common()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
* @name : File_Search
|
|
|
|
|
* @brief : 函数实现对目录下的所有txt文件进行搜索
|
|
|
|
|
* @param[in] : 函数的输入参数为 目录路径 path_name
|
|
|
|
|
* @param[out] : 函数的输出名为 file_path 是一个列表对象,格式为['./test\\D1\\1.txt', './test\\D1\\F1\\3.txt']
|
|
|
|
|
* @return : 函数返回txt文件的路径,列表元素的类型为 string
|
|
|
|
|
* @others : 无
|
|
|
|
|
'''
|
|
|
|
|
def File_Search(path_name):
    """Collect every ``.txt`` file below a directory and print a numbered list.

    The tree is walked with ``os.walk`` (top-down: a directory's own files
    are reported before those of its sub-directories).  Directory and file
    names are visited in sorted order so the numbering shown to the user is
    the same on every run.  The extension is matched case-insensitively, so
    ``NOTES.TXT`` is found as well.

    For each match a line ``"<index>:<file name>"`` is printed, where
    ``index`` starts at 1 and equals the file's position in the returned
    list plus one.

    Args:
        path_name: Root directory of the search.

    Returns:
        A list of path strings (``os.path.join(root, name)``), e.g.
        ``['./test\\D1\\1.txt', './test\\D1\\F1\\3.txt']`` on Windows.
        The list is empty when nothing matches or the directory does not
        exist.
    """
    file_path = []
    for root, dirs, files in os.walk(path_name):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(".txt"):
                file_path.append(os.path.join(root, file))
                # The 1-based index is simply the current list length.
                print("{0}:{1}".format(len(file_path), file))
    return file_path
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
* @name : Word_Cloud
|
|
|
|
|
* @brief : 函数实现对统计的词语进行保存和生成词云图
|
|
|
|
|
* @param[in] : 函数的输入参数为 单词统计生成的列表数据
|
|
|
|
|
* @param[out] : 无
|
|
|
|
|
* @return : 无返回值
|
|
|
|
|
* @others : 无
|
|
|
|
|
'''
|
|
|
|
|
def Word_Cloud(word_list):
    """Save word counts to a text file and render them as a word cloud.

    The user is prompted for a target path until one ending in ``.txt`` can
    be written.  Each entry is stored on its own line with every field
    followed by a tab (``word<TAB>count<TAB>``).  The cloud is then built
    directly from the counts, written to ``wordcloud.png`` in the current
    working directory and opened in the default image viewer.

    Args:
        word_list: ``(word, count)`` pairs as returned by ``Count_Word``.

    Returns:
        None.
    """
    # Materialise once: the entries are needed for the file and the cloud,
    # and possibly again if the first save attempt fails.
    entries = list(word_list)

    while True:
        wordcloud_path = input('请输入数据保存的路径,结尾加上 .txt :')
        if not wordcloud_path.endswith(".txt"):
            print("请输入正确的txt文件文件格式")
            continue
        try:
            # Save the statistics file.
            with open(wordcloud_path, "w", encoding="utf-8") as save_fp:
                for entry in entries:
                    fields = "".join(str(field) + '\t' for field in entry)
                    save_fp.write(fields + "\n")
        except OSError as err:
            # Unwritable path (missing directory, no permission, ...):
            # ask again instead of aborting the whole program.
            print("无法写入文件:{0}".format(err))
            continue
        break

    frequencies = {word: count for word, count in entries}
    if not frequencies:
        # wordcloud raises ValueError when it has no word to draw.
        print("没有可用于生成词云的词语")
        return

    # Background colour, canvas size and word limit of the cloud.
    wordcloud_fp = wordcloud.WordCloud(
        font_path="/Windows/Fonts/simfang.ttf",
        background_color="black",
        width=600,
        height=300,
        max_words=50,
    )
    # Feed the counts directly.  Re-reading the saved file through
    # generate() would re-tokenise it, and because every word occurs there
    # exactly once the real frequencies would be thrown away.
    wordcloud_fp.generate_from_frequencies(frequencies)

    # Write the picture to disk, then display it.
    wordcloud_fp.to_file("wordcloud.png")
    wordcloud_fp.to_image().show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the interactive word-cloud session.

    Repeatedly asks whether to run (``y``) or quit (``n``).  On ``y`` it
    lists the ``.txt`` files below the current directory, lets the user
    pick one by its 1-based number, counts its words and hands the result
    to ``Word_Cloud``.  The loop only ends through ``sys.exit()`` on ``n``.
    """
    while True:
        system_status = input("是否运行词云系统 y/n:")

        if system_status == 'n':
            sys.exit()
        if system_status != 'y':
            print("输入指令错误,请输入正确的指令!!!")
            continue

        # Search the whole tree below the working directory for txt files.
        file_path = File_Search("./")
        if not file_path:
            # Without this guard no index could ever be valid and the
            # selection prompt below would loop forever.
            print("当前目录下没有找到txt文件")
            continue

        # Keep asking until the answer is a number within 1..len(file_path).
        while True:
            file_index = input("请输入要操作的文件的序号:")
            if file_index.isdecimal() and 0 < int(file_index) <= len(file_path):
                break
            print("输入文件序号错误,请重新输入!!!")

        # Count the words of the chosen file, then save and draw them.
        file_word = Count_Word(file_path[int(file_index) - 1])
        Word_Cloud(file_word)


if __name__ == "__main__":
    main()
|