You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

93 lines
3.2 KiB

#文件的处理
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir
import jieba
filter=[".txt"] # File extensions accepted when listing files; more than one type may be listed. NOTE: this name shadows the builtin filter() for the whole module.
def all_path(dirname, extensions=None):
    """List the files directly inside *dirname* whose extension is accepted.

    The directory name and every numbered match are printed to stdout.

    Args:
        dirname: Directory to scan. Sub-directories are not descended into.
        extensions: Collection of accepted extensions, each including the
            leading dot (e.g. ``[".txt"]``). When omitted, the module-level
            ``filter`` list is used.

    Returns:
        A dict mapping a 1-based running index to the bare file name of each
        match, in the order ``os.listdir`` reported them.
    """
    if extensions is None:
        # Fall back to the module-level list of accepted extensions.
        extensions = filter
    print(dirname)
    adict = {}
    for subpath in listdir(dirname):
        path = join(dirname, subpath)
        # Only regular files are considered; directories are skipped.
        if not isfile(path):
            continue
        if os.path.splitext(path)[1] in extensions:
            # Store the directory entry name as-is. Deriving it by deleting
            # `dirname` from the joined path and stripping the first
            # character corrupts names when `dirname` ends with a separator,
            # is ".", or happens to occur inside the file name.
            adict[len(adict) + 1] = subpath
    for key, value in adict.items():
        print(str(key) + " : " + value)
    return adict
def Copenfile(name):
    """Segment a Chinese text file with jieba and record word frequencies.

    Each line has ASCII letters, digits and a few stray punctuation marks
    removed, is segmented with ``jieba.cut`` (accurate mode), and the
    resulting tokens are collected. Tokens longer than one character are
    counted; the counts are printed and written as ``word:count`` lines,
    most frequent first, to a file named ``"res" + <file name>`` placed next
    to the input file.

    Args:
        name: Path of the UTF-8 encoded text file to process.

    Returns:
        All tokens (including single-character ones) as one string, each
        token followed by a single space, for use as word-cloud input.
    """
    all_words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Remove ASCII alphanumerics, '·', '?', '!', backslashes and '”'.
            line = re.sub(r"[A-Za-z0-9·?!\\”]", "", line)
            seg_list = jieba.cut(line, cut_all=False)
            # Join then split so whitespace tokens (e.g. the line's trailing
            # newline) are discarded and never glued onto a real word.
            all_words.extend(" ".join(seg_list).split())

    # Space-separated token string for word-cloud generation.
    strcloud = "".join(word + ' ' for word in all_words)

    # Only tokens longer than one character take part in the statistics.
    c = Counter(word for word in all_words if len(word) > 1)

    # Prefix only the file name so a `name` with a directory part still
    # yields a valid path in that same directory.
    resfile = os.path.join(os.path.dirname(name), "res" + os.path.basename(name))
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for k, v in c.most_common():
            print("%s:%d" % (k, v))
            out.write(k + ':' + str(v) + '\n')
    return strcloud
def Eopenfile(name):
    """Split an English text file into words and record word frequencies.

    Each line is lower-cased and split on spaces and the punctuation marks
    ``, . ? ! ; : "``. Empty pieces produced by adjacent delimiters are
    dropped. The ``(word, count)`` pairs are printed and written, one tuple
    per line and most frequent first, to a file named
    ``"res" + <file name>`` placed next to the input file.

    Args:
        name: Path of the UTF-8 encoded text file to process.

    Returns:
        All words as one string, each word followed by a single space, for
        use as word-cloud input.
    """
    words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # str.lower() returns a new string; it must be re-assigned for
            # the case folding to take effect.
            line = line.lower()
            pieces = re.split('[ ,.?!;:"]', line.strip('\n'))
            # Adjacent delimiters (", ") yield '' pieces, which are not words.
            words.extend(piece for piece in pieces if piece)

    # Space-separated word string for word-cloud generation.
    strcloud = "".join(word + ' ' for word in words)

    # Sorted by descending count; ties keep first-seen order.
    items = Counter(words).most_common()

    # Prefix only the file name so a `name` with a directory part still
    # yields a valid path in that same directory.
    resfile = os.path.join(os.path.dirname(name), "res" + os.path.basename(name))
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for item in items:
            print(item)
            out.write(str(item) + "\n")
    return strcloud