# -*- coding: utf-8 -*-
"""Word-frequency statistics for Chinese and English text files.

all_path()  - list the text files directly under a directory.
Copenfile() - segment a Chinese file with jieba and count word frequencies.
Eopenfile() - split an English file on punctuation and count word frequencies.

Each counting function writes its result table to ``"res" + name`` and
returns the tokens joined by spaces (input for word-cloud generation).
"""
import os
import re
from collections import Counter
from os import listdir
from os.path import join, isfile, isdir

# Accepted file extensions for all_path().
# NOTE: the name shadows the builtin ``filter``; it is kept unchanged for
# backward compatibility with code that may import it from this module.
filter = [".txt"]

# Noise stripped from Chinese text before segmentation: ASCII letters/digits
# plus common CJK punctuation. Compiled once instead of per line.
_CN_NOISE_RE = re.compile(r"[A-Za-z0-9:·?!?;、—,。“ ”]")


def all_path(dirname):
    """Return a dict ``{index: filename}`` of text files directly under
    *dirname*, printing the directory and each entry.

    Only files whose extension appears in ``filter`` are included;
    subdirectories are skipped. Filenames are relative to *dirname*.
    """
    print(dirname)
    found = {}
    index = 0
    for subpath in listdir(dirname):
        path = join(dirname, subpath)
        # Only plain files directly under dirname; recurse never happens.
        if isfile(path) and os.path.splitext(path)[1] in filter:
            index += 1
            # subpath is already relative to dirname — no fragile
            # replace()/lstrip() path surgery needed.
            found[index] = subpath
    for key, value in found.items():
        print(str(key) + " : " + value)
    return found


def Copenfile(name):
    """Segment the Chinese UTF-8 text file *name* with jieba, print a
    word-frequency table, write it to ``"res" + name`` (one ``word:count``
    line per entry) and return all tokens joined by spaces.
    """
    # Imported lazily so the module (and Eopenfile/all_path) stays usable
    # when the third-party jieba package is not installed.
    import jieba

    tokens = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Original called line.strip('\n') and discarded the result;
            # strip first, then remove noise characters.
            cleaned = _CN_NOISE_RE.sub("", line.strip('\n'))
            tokens.extend(jieba.cut(cleaned, cut_all=False))

    # Re-split the joined token string, matching the original's whitespace
    # normalization, then build the word-cloud string (trailing space kept).
    all_words = " ".join(tokens).split()
    strcloud = "".join(word + ' ' for word in all_words)

    # Count only words longer than one character (drops stray punctuation
    # remnants and line-break artifacts).
    counts = Counter(
        word for word in all_words if len(word) > 1 and word != '\r\n'
    )

    resfile = "res" + name
    print("-----------------------统计词频中--------------------------")
    print('词频统计结果:')
    with open(resfile, 'w', encoding='utf-8') as out:
        # most_common() with no argument yields every entry, most frequent
        # first — equivalent to the original most_common(len(all_words)).
        for word, freq in counts.most_common():
            print("%s:%d" % (word, freq))
            out.write(word + ':' + str(freq) + '\n')
    return strcloud


def Eopenfile(name):
    """Count word frequencies in the English UTF-8 text file *name*, print
    the ``(word, count)`` pairs sorted by descending count, write them to
    ``"res" + name`` and return all words joined by spaces.
    """
    counts = {}
    words_in_order = []
    with open(name, encoding='utf-8') as src:
        for line in src:
            # Lower-case BEFORE splitting — the original called
            # line.lower() and discarded the result, so mixed-case
            # duplicates were counted separately. Also drop the empty
            # strings re.split() yields between adjacent delimiters.
            tokens = [
                w
                for w in re.split('[ ,.?!;:"]', line.strip('\n').lower())
                if w
            ]
            words_in_order.extend(tokens)
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1

    # Word-cloud string: every word followed by a space, in file order.
    strcloud = "".join(word + ' ' for word in words_in_order)

    # Sort by frequency, highest first (stable, so ties keep file order).
    items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

    resfile = "res" + name
    print("-----------------------统计词频中--------------------------")
    print('词频统计结果:')
    with open(resfile, 'w', encoding='utf-8') as out:
        for item in items:
            print(item)
            out.write(str(item) + "\n")
    return strcloud