# our_wordsystem/FileProcess.py

#文件的处理
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir
import jieba

# File extensions (with the leading dot) that all_path() accepts.
# NOTE(review): this name shadows the builtin filter() inside this module.
filter=[".txt"] # the list may hold more than one extension

# List the text files found directly inside a directory.
def all_path(dirname, extensions=None):
    """Print and return a numbered listing of the matching files in a directory.

    Only regular files located directly inside ``dirname`` are considered;
    sub-directories are skipped and never descended into.  The directory
    name and every ``number  :  file name`` pair are printed as a side effect.

    Args:
        dirname: Directory to scan.
        extensions: Optional collection of accepted file extensions, each
            including the leading dot (for example ``".txt"``).  When omitted,
            the module-level ``filter`` list is used.

    Returns:
        A dict mapping a 1-based sequence number to the bare file name
        (without any directory part), in the order ``listdir`` reported them.
    """
    print(dirname)
    if extensions is None:
        extensions = filter  # module-level list of accepted extensions
    adict = {}
    for subpath in listdir(dirname):
        if not isfile(join(dirname, subpath)):
            continue  # directories and other non-files are ignored
        # ``subpath`` already is the bare file name.  Recovering it from the
        # joined path with str.replace/str.lstrip mangled names that contain
        # the directory string or start with a repeated character.
        if os.path.splitext(subpath)[1] in extensions:
            adict[len(adict) + 1] = subpath
    for key, value in adict.items():
        print(str(key) + "  :  " + value)
    return adict

# Process a Chinese text file.
def Copenfile(name):
    """Segment a Chinese text file with jieba and record word frequencies.

    Every line has ASCII letters, digits, spaces and a fixed set of
    punctuation marks removed and is then segmented with
    ``jieba.cut(..., cut_all=False)``.  Words longer than one character are
    counted; the counts are printed and written, most frequent first, as
    ``word:count`` lines to a UTF-8 file whose name is the input file name
    prefixed with ``res``, placed in the same directory as the input.

    Args:
        name: Path of the UTF-8 encoded text file to analyse.

    Returns:
        All segmented words (including single-character ones), each followed
        by one space, as a single string for word-cloud generation.
    """
    # Characters that must not reach the segmenter.
    noise = re.compile(r"[A-Za-z0-9：·?!？；、—，。“ ”]")
    all_words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            seg_list = jieba.cut(noise.sub("", line), cut_all=False)
            # Joining on spaces and splitting again drops the pure-whitespace
            # tokens (such as the line break) that come back from the cut.
            all_words.extend(" ".join(seg_list).split())
    # Single-character tokens are left out of the frequency table.
    counts = Counter(word for word in all_words if len(word) > 1)
    # Prefix only the file name, so a path with a directory part stays valid.
    folder, filename = os.path.split(name)
    resfile = join(folder, "res" + filename)
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果：')
        for word, count in counts.most_common():
            print("%s:%d" % (word, count))
            out.write(word + ':' + str(count) + '\n')
    return "".join(word + ' ' for word in all_words)


# Process an English text file.
def Eopenfile(name):
    """Split an English text file into words and record word frequencies.

    Every line is lower-cased and split on spaces and the punctuation marks
    ``, . ? ! ; : "``.  Empty fragments produced by adjacent delimiters are
    discarded.  The counts are printed and written, most frequent first
    (ties keep first-appearance order), as ``('word', count)`` lines to a
    UTF-8 file whose name is the input file name prefixed with ``res``,
    placed in the same directory as the input.

    Args:
        name: Path of the UTF-8 encoded text file to analyse.

    Returns:
        All words in reading order, each followed by one space, as a single
        string for word-cloud generation.
    """
    delimiters = re.compile('[ ,.?!;:"]')
    words = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # str.lower() returns a new string; its result has to be used.
            pieces = delimiters.split(line.lower().strip('\n'))
            # Adjacent delimiters yield '' fragments, which are not words.
            words.extend(piece for piece in pieces if piece)
    counts = Counter(words)
    # Prefix only the file name, so a path with a directory part stays valid.
    folder, filename = os.path.split(name)
    resfile = join(folder, "res" + filename)
    with open(resfile, 'w', encoding='utf-8') as out:
        print("-----------------------统计词频中--------------------------")
        print('词频统计结果：')
        for item in counts.most_common():
            print(item)
            out.write(str(item) + "\n")
    return "".join(word + ' ' for word in words)