You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

93 lines
3.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# File handling: list text files and count word frequencies in Chinese and English text
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir
import jieba
# File extensions kept when listing files; add more entries to accept several types.
# NOTE: this module-level name shadows the built-in ``filter`` inside this module.
filter = [".txt"]
# Find every text file located directly inside the given directory
def all_path(dirname: str, extensions=None) -> dict:
    """List the files directly inside ``dirname`` whose extension is accepted.

    Sub-directories are not descended into.  The directory name and then a
    numbered ``"<n> : <file name>"`` line per match are printed.

    Args:
        dirname: Directory to scan.
        extensions: Collection of accepted extensions including the leading
            dot (e.g. ``[".txt"]``).  When omitted, the module-level
            ``filter`` list is used.

    Returns:
        A dict mapping 1-based sequence numbers (in ``listdir`` order) to the
        matching file names, relative to ``dirname``.

    Raises:
        OSError: If ``dirname`` cannot be listed.
    """
    if extensions is None:
        extensions = filter  # module-level list of accepted extensions

    print(dirname)
    found = {}
    for entry in listdir(dirname):
        # Only regular files count; directories (even ones named "x.txt") are skipped.
        if not isfile(join(dirname, entry)):
            continue
        if os.path.splitext(entry)[1] in extensions:
            # Use the entry name itself rather than cutting ``dirname`` out of
            # the joined path: str.replace/lstrip would also remove matching
            # characters from inside the file name.
            found[len(found) + 1] = entry

    for number, filename in found.items():
        print(str(number) + " : " + filename)
    return found
# Process a Chinese text file
def Copenfile(name):
    """Segment a Chinese text file with jieba and record its word frequencies.

    Each line of ``name`` (read as UTF-8) has ASCII letters, digits and a few
    punctuation marks removed and is then segmented with
    ``jieba.cut(..., cut_all=False)``.  Tokens longer than one character are
    counted; the counts are printed and also written, most frequent first, as
    ``word:count`` lines to a result file whose name is ``"res"`` followed by
    the input's base name, placed next to the input file.

    Args:
        name: Path of the UTF-8 text file to analyse.

    Returns:
        Every token (single-character ones included) joined into one string,
        each followed by a single space, for use as word-cloud input.

    Raises:
        OSError: If the input cannot be read or the result file cannot be
            written.
    """
    # Raw string with the same character class as before, minus the invalid
    # "\?" / "\!" escapes: letters, digits, "·", "?", "!", backslash and "”".
    noise = re.compile(r"[A-Za-z0-9·?!\\”]")

    all_words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            cleaned = noise.sub("", line.strip('\n'))  # remove stray characters
            segmented = " ".join(jieba.cut(cleaned, cut_all=False))
            # split() discards whitespace-only tokens and guarantees that
            # words from adjacent lines never run together.
            all_words.extend(segmented.split())

    # Space-separated token string, convenient for word-cloud generation.
    strcloud = "".join(word + " " for word in all_words)

    # Only tokens longer than one character are counted; split() above has
    # already removed any line-break tokens.
    counts = Counter(word for word in all_words if len(word) > 1)

    # Prefix the base name only, so an input path containing a directory
    # still yields a valid result path in that same directory.
    folder, base = os.path.split(name)
    resfile = os.path.join(folder, "res" + base)
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for word, count in counts.most_common():
            entry = "%s:%d" % (word, count)
            print(entry)
            out.write(entry + '\n')  # persist the result
    return strcloud
# Process an English text file
def Eopenfile(name):
    """Count case-insensitive word frequencies in an English text file.

    Each line of ``name`` (read as UTF-8) is lower-cased and split on
    whitespace and the punctuation marks ``, . ? ! ; : "``; empty fragments
    produced by adjacent separators are ignored.  The ``(word, count)`` pairs
    are printed and also written, most frequent first (ties keep
    first-occurrence order), one tuple per line, to a result file whose name
    is ``"res"`` followed by the input's base name, placed next to the input
    file.

    Args:
        name: Path of the UTF-8 text file to analyse.

    Returns:
        All words joined into one string, each followed by a single space,
        for use as word-cloud input.

    Raises:
        OSError: If the input cannot be read or the result file cannot be
            written.
    """
    separators = re.compile(r'[\s,.?!;:"]+')

    words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Lower-case first so "Hello" and "hello" are the same word, and
            # drop the empty strings re.split yields around separators.
            words.extend(tok for tok in separators.split(line.lower()) if tok)

    # Space-separated word string, convenient for word-cloud generation.
    strcloud = "".join(word + " " for word in words)

    # Counter keeps first-occurrence order; most_common() sorts stably by
    # descending count.
    counts = Counter(words)

    # Prefix the base name only, so an input path containing a directory
    # still yields a valid result path in that same directory.
    folder, base = os.path.split(name)
    resfile = os.path.join(folder, "res" + base)
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果:')
        for item in counts.most_common():
            print(item)
            out.write(str(item) + "\n")  # persist the result
    return strcloud