|
|
|
|
|
|
|
|
# File processing utilities
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from os import listdir
|
|
|
|
|
from os.path import join, isfile, isdir
|
|
|
|
|
import jieba
|
|
|
|
|
|
|
|
|
|
filter=[".txt"] #设置过滤后的文件类型 当然可以设置多个类型
|
|
|
|
|
|
|
|
|
|
# Find all text files directly under the given directory path
|
|
|
|
|
def all_path(dirname):
|
|
|
|
|
print(dirname)
|
|
|
|
|
xf = 0
|
|
|
|
|
adict = {}
|
|
|
|
|
for subpath in listdir(dirname):
|
|
|
|
|
path = join(dirname, subpath)
|
|
|
|
|
if isfile(path):
|
|
|
|
|
ext = os.path.splitext(path)[1]
|
|
|
|
|
if ext in filter: #筛选文件中以.txt结尾的文件
|
|
|
|
|
xf = xf+1
|
|
|
|
|
path = path.replace(dirname, "")
|
|
|
|
|
path = path.lstrip(path[0:1])
|
|
|
|
|
adict[xf] = path
|
|
|
|
|
elif isdir(path): #跳过文件夹,只查找目录下的文件
|
|
|
|
|
continue
|
|
|
|
|
for key, value in adict.items():
|
|
|
|
|
print(str(key)+" : "+value)
|
|
|
|
|
return adict
|
|
|
|
|
|
|
|
|
|
# Process a Chinese-language text file
|
|
|
|
|
def Copenfile(name):
|
|
|
|
|
strcloud = ""
|
|
|
|
|
cut_words = ""
|
|
|
|
|
for line in open(name, encoding='utf-8'):
|
|
|
|
|
line.strip('\n')
|
|
|
|
|
line = re.sub("[A-Za-z0-9\:\·\?\!\?\;\、\—\,\。\“ \”]", "", line) # 去除杂乱字符
|
|
|
|
|
seg_list = jieba.cut(line, cut_all=False)
|
|
|
|
|
cut_words += (" ".join(seg_list))
|
|
|
|
|
all_words = cut_words.split()
|
|
|
|
|
#将分词结果,以空格为间隔形成字符串,方便进行词云的生成
|
|
|
|
|
for s in range(0, len(all_words)):
|
|
|
|
|
strcloud = strcloud + all_words[s] + ' '
|
|
|
|
|
c = Counter()
|
|
|
|
|
for x in all_words:
|
|
|
|
|
if len(x) > 1 and x != '\r\n': # 长度大于一,并且不为换行等字符
|
|
|
|
|
c[x] += 1
|
|
|
|
|
#新建文件存放词频统计结果
|
|
|
|
|
resfile = "res" + name
|
|
|
|
|
f = open(resfile, 'w', encoding='utf-8')
|
|
|
|
|
print("-----------------------统计词频中--------------------------")
|
|
|
|
|
print('词频统计结果:')
|
|
|
|
|
for (k, v) in c.most_common(len(all_words)):
|
|
|
|
|
print("%s:%d" % (k, v))
|
|
|
|
|
f.write(k + ':' + str(v) + '\n')#将结果存入文件
|
|
|
|
|
f.close() #关闭文件
|
|
|
|
|
return strcloud
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Process an English-language text file
|
|
|
|
|
def Eopenfile(name):
|
|
|
|
|
strcloud = ""
|
|
|
|
|
dic = {}#设置字典,存放结果
|
|
|
|
|
allword = []
|
|
|
|
|
for line in open(name, encoding='utf-8'):
|
|
|
|
|
line.lower()#转换为小写字母
|
|
|
|
|
newline = re.split('[ ,.?!;:"]', line.strip('\n'))
|
|
|
|
|
#将分词结果,以空格为间隔形成字符串,方便进行词云的生成
|
|
|
|
|
for s in range(0, len(newline)):
|
|
|
|
|
strcloud = strcloud + newline[s] + ' '
|
|
|
|
|
#统计单词出现次数
|
|
|
|
|
for i in newline:
|
|
|
|
|
if i in dic:
|
|
|
|
|
dic[i] += 1
|
|
|
|
|
else:
|
|
|
|
|
dic[i] = 1
|
|
|
|
|
allword +=newline
|
|
|
|
|
# 新建文件存放词频统计结果
|
|
|
|
|
resfile = "res" + name
|
|
|
|
|
f = open(resfile, 'w', encoding='utf-8')
|
|
|
|
|
# 为便于排序,将词典转化为列表
|
|
|
|
|
items = list(dic.items())
|
|
|
|
|
# 根据单词的频数从高到低排序
|
|
|
|
|
items.sort(key=lambda x: x[1], reverse=True)
|
|
|
|
|
print("-----------------------统计词频中--------------------------")
|
|
|
|
|
print('词频统计结果:')
|
|
|
|
|
for i in range(len(items)):
|
|
|
|
|
print(items[i])
|
|
|
|
|
f.write(str(items[i]) + "\n")#将结果存入文件
|
|
|
|
|
f.close()
|
|
|
|
|
return strcloud
|
|
|
|
|
|