# File handling: list text files and compute word-frequency statistics.
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir

try:
    import jieba
except ImportError:
    # jieba is only needed by Copenfile; the other helpers stay usable
    # without it and Copenfile reports the missing dependency itself.
    jieba = None

# File extensions accepted by all_path; several may be listed.
# NOTE: this shadows the builtin `filter`; the name is kept because it is
# part of the module's existing interface.
filter = [".txt"]

# Characters removed from Chinese text before segmentation (ASCII letters,
# digits, a space and assorted punctuation).
_CN_NOISE = re.compile(r"[A-Za-z0-9\:\·\?\!\?\;\、\—\,\。\“ \”]")

# Delimiters used to split English text into words.
_EN_DELIMS = re.compile(r'[ ,.?!;:"]')


def _result_path(name):
    """Return the report path for *name*: same directory, "res" + file name."""
    return join(os.path.dirname(name), "res" + os.path.basename(name))


def all_path(dirname):
    """List the text files located directly inside *dirname*.

    Prints *dirname*, then one "index : file name" line per match, and
    returns a dict mapping a 1-based index to the file name (not the full
    path). Only extensions listed in ``filter`` are accepted;
    subdirectories are skipped and never descended into. Entries are sorted
    by name so the numbering is stable between calls.
    """
    print(dirname)
    # Use the directory entry itself as the file name. Deriving it by
    # stripping `dirname` out of the joined path corrupts names whenever
    # `dirname` also occurs inside the file name (e.g. dirname=".").
    names = sorted(
        entry
        for entry in listdir(dirname)
        if isfile(join(dirname, entry))
        and os.path.splitext(entry)[1] in filter
    )
    adict = dict(enumerate(names, start=1))
    for key, value in adict.items():
        print(str(key) + " : " + value)
    return adict


def Copenfile(name):
    """Compute word frequencies for the Chinese text file *name*.

    Each line is stripped of the characters in ``_CN_NOISE`` and segmented
    with jieba. Words longer than one character are counted; the counts are
    printed and written, most frequent first, as "word:count" lines to a
    report file named "res" + file name next to the input file.

    Returns all segmented words (including single-character ones) as one
    string, each word followed by a space, for word-cloud generation.

    Raises ImportError if jieba is not installed.
    """
    if jieba is None:
        raise ImportError("jieba is required to process Chinese text files")

    all_words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            cleaned = _CN_NOISE.sub("", line)  # drop noise characters
            segments = jieba.cut(cleaned, cut_all=False)
            # Joining and re-splitting discards whitespace tokens such as
            # the trailing newline.
            all_words.extend(" ".join(segments).split())

    # Space-separated words, convenient for building a word cloud.
    strcloud = "".join(word + ' ' for word in all_words)

    # split() never yields whitespace tokens, so only the length matters.
    counts = Counter(word for word in all_words if len(word) > 1)

    # Store the frequency report in a new file.
    with open(_result_path(name), 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for word, freq in counts.most_common():
            print("%s:%d" % (word, freq))
            out.write(word + ':' + str(freq) + '\n')
    return strcloud


def Eopenfile(name):
    """Compute word frequencies for the English text file *name*.

    Lines are lowercased and split on the delimiters in ``_EN_DELIMS``;
    empty fragments produced by adjacent delimiters or blank lines are
    ignored. The (word, count) pairs are printed and written, most frequent
    first, one tuple per line, to a report file named "res" + file name
    next to the input file.

    Returns all words as one string, each word followed by a space, for
    word-cloud generation.
    """
    words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Lowercase so that "The" and "the" count as the same word.
            fragments = _EN_DELIMS.split(line.lower().strip('\n'))
            words.extend(frag for frag in fragments if frag)

    # Space-separated words, convenient for building a word cloud.
    strcloud = "".join(word + ' ' for word in words)

    # Sorted by frequency, highest first; ties keep first-seen order.
    items = Counter(words).most_common()

    # Store the frequency report in a new file.
    with open(_result_path(name), 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for item in items:
            print(item)
            out.write(str(item) + "\n")
    return strcloud