# -*- coding: utf-8 -*-
"""Word-frequency statistics for Chinese and English text files.

all_path()  - list the text files directly under a directory.
Copenfile() - segment a Chinese file with jieba and count word frequencies.
Eopenfile() - split an English file on punctuation and count word frequencies.

Each counting function writes its result table to ``"res" + name`` and
returns the tokens joined by spaces (input for word-cloud generation).
"""
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir

# Accepted file extensions for all_path().
# NOTE: the name shadows the builtin ``filter``; it is kept unchanged for
# backward compatibility with code that may import it from this module.
filter = [".txt"]

# Noise stripped from Chinese text before segmentation: ASCII letters/digits
# plus common CJK punctuation. Compiled once instead of per line.
_CN_NOISE_RE = re.compile(r"[A-Za-z0-9:·?!?;、—,。“ ”]")


def all_path(dirname):
    """Return a dict ``{index: filename}`` of text files directly under
    *dirname*, printing the directory and each entry.

    Only files whose extension appears in ``filter`` are included;
    subdirectories are skipped. Filenames are relative to *dirname*.
    """
    print(dirname)
    found = {}
    index = 0
    for subpath in listdir(dirname):
        path = join(dirname, subpath)
        # Only plain files directly under dirname; recurse never happens.
        if isfile(path) and os.path.splitext(path)[1] in filter:
            index += 1
            # subpath is already relative to dirname — no fragile
            # replace()/lstrip() path surgery needed.
            found[index] = subpath
    for key, value in found.items():
        print(str(key) + " : " + value)
    return found


def Copenfile(name):
    """Segment the Chinese UTF-8 text file *name* with jieba, print a
    word-frequency table, write it to ``"res" + name`` (one ``word:count``
    line per entry) and return all tokens joined by spaces.
    """
    # Imported lazily so the module (and Eopenfile/all_path) stays usable
    # when the third-party jieba package is not installed.
    import jieba

    tokens = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Original called line.strip('\n') and discarded the result;
            # strip first, then remove noise characters.
            cleaned = _CN_NOISE_RE.sub("", line.strip('\n'))
            tokens.extend(jieba.cut(cleaned, cut_all=False))

    # Re-split the joined token string, matching the original's whitespace
    # normalization, then build the word-cloud string (trailing space kept).
    all_words = " ".join(tokens).split()
    strcloud = "".join(word + ' ' for word in all_words)

    # Count only words longer than one character (drops stray punctuation
    # remnants and line-break artifacts).
    counts = Counter(
        word for word in all_words if len(word) > 1 and word != '\r\n'
    )

    resfile = "res" + name
    print("-----------------------统计词频中--------------------------")
    print('词频统计结果:')
    with open(resfile, 'w', encoding='utf-8') as out:
        # most_common() with no argument yields every entry, most frequent
        # first — equivalent to the original most_common(len(all_words)).
        for word, freq in counts.most_common():
            print("%s:%d" % (word, freq))
            out.write(word + ':' + str(freq) + '\n')
    return strcloud


def Eopenfile(name):
    """Count word frequencies in the English UTF-8 text file *name*, print
    the ``(word, count)`` pairs sorted by descending count, write them to
    ``"res" + name`` and return all words joined by spaces.
    """
    counts = {}
    words_in_order = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Lower-case BEFORE splitting — the original called
            # line.lower() and discarded the result, so mixed-case
            # duplicates were counted separately. Also drop the empty
            # strings re.split() yields between adjacent delimiters.
            tokens = [
                w
                for w in re.split('[ ,.?!;:"]', line.strip('\n').lower())
                if w
            ]
            words_in_order.extend(tokens)
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1

    # Word-cloud string: every word followed by a space, in file order.
    strcloud = "".join(word + ' ' for word in words_in_order)

    # Sort by frequency, highest first (stable, so ties keep file order).
    items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

    resfile = "res" + name
    print("-----------------------统计词频中--------------------------")
    print('词频统计结果:')
    with open(resfile, 'w', encoding='utf-8') as out:
        for item in items:
            print(item)
            out.write(str(item) + "\n")
    return strcloud