|
|
# -*- coding: utf-8 -*-
|
|
|
import os
|
|
|
import jieba
|
|
|
|
|
|
def Create_path():
    '''
    Create a ``Result`` directory under the current working directory.

    According to the original notes, the directory is meant to hold
    Jieba.txt, Sort.txt and the word-cloud image.

    :return: path of the directory, ending with the platform path separator,
             or None when the directory could not be created
    '''
    try:
        # The trailing empty component keeps the trailing separator that the
        # original string concatenation produced, so callers doing
        # ``path + name`` keep working, while the path is now built with the
        # platform's own separator instead of a hard-coded backslash.
        newPath = os.path.join(os.getcwd(), 'Result', '')
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(newPath, exist_ok=True)
    except OSError as msg:
        # str() is required here: concatenating the exception object itself
        # to a str raises TypeError and would mask the real failure.
        print("新建目录失败:" + str(msg))
        return None
    return newPath
|
|
|
|
|
|
|
|
|
def Jieba(fileName,filePath):
    '''
    Segment a text file with jieba and save the result to Jieba.txt.

    The file is read as UTF-8, cut into tokens with ``jieba.cut``, the tokens
    are joined with single spaces, and a fixed list of space-delimited
    punctuation tokens is replaced by a single space.

    Side effect: the process working directory is changed to ``filePath``.

    :param fileName: path of the UTF-8 text file to segment
    :param filePath: directory in which Jieba.txt is written
    :return: None; the segmented text is written to ``Jieba.txt``
    '''
    with open(fileName, 'r', encoding='utf-8') as f:
        cutWord = jieba.cut(f.read())
        result = " ".join(cutWord)

    # Space-delimited punctuation tokens to drop, applied in this exact
    # order. Repeated entries are intentional to preserve behavior: a second
    # pass removes a mark left behind when the same mark occurs twice in a
    # row (str.replace does not rescan overlapping matches).
    punctuation = (
        ' : ', ' ; ', ' ! ', ' 、 ', ' ” ', ' “ ', ' — ', ' — ', ' - ',
        ' , ', ' . ', ' , ', ' 。 ', ' 》 ', ' 《 ', ' ) ', ' ( ',
    )
    for mark in punctuation:
        result = result.replace(mark, ' ')

    # NOTE(review): as written this replaces one space with one space, which
    # changes nothing; presumably it was meant to collapse double spaces.
    # Confirm against the original file before altering it.
    result = result.replace(' ', ' ')

    # The working directory is switched so that Jieba.txt lands in filePath;
    # other code that opens files by bare name will also resolve them there.
    os.chdir(filePath)

    # Context manager guarantees the handle is closed even if write() raises.
    with open('Jieba.txt', 'w', encoding='utf-8') as fp:
        fp.write(result)
|
|
|
|
|
|
|
|
|
def Turn():
    '''
    Read Jieba.txt from the current working directory and return its words.

    :return: list of the whitespace-separated words, in file order; blank
             lines and runs of spaces contribute no entries
    '''
    wordList = []
    with open('Jieba.txt', 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # split() with no argument discards surrounding whitespace and
            # never yields '' -- split(' ') produced empty-string "words" for
            # blank lines and consecutive spaces, which then got counted.
            wordList.extend(line.split())
    return wordList
|
|
|
|
|
|
|
|
|
def Account(wordList):
    '''
    Count how many times each word occurs in wordList.

    :param wordList: the words to count (any iterable of hashable items)
    :return: plain dict mapping each distinct word to its number of
             occurrences, with keys in order of first appearance
    '''
    # Imported locally so the block does not depend on a file-level import.
    from collections import Counter

    # One pass over the input; the previous list.count() per element
    # rescanned the whole list for every word (quadratic time).
    return dict(Counter(wordList))
|
|
|
|
|
|
|
|
|
def Sort(accountDict):
    '''
    Order a word-count dict by descending count, print it, and save it.

    The ordered dict is printed through ``Print_sortDict`` and its ``str()``
    form, with every single-quote character removed, is written to
    ``Sort.txt`` in the current working directory (UTF-8).

    :param accountDict: dict mapping word -> count
    :return: a new dict with the same items ordered by descending count;
             items with equal counts keep their original relative order
    '''
    # sorted() is stable, so ties keep the insertion order of accountDict.
    ranked = sorted(accountDict.items(), key=lambda d: d[1], reverse=True)
    sortDict = dict(ranked)

    Print_sortDict(sortDict)

    # Strip the quotes that str(dict) puts around string keys.
    clearStr = str(sortDict).replace('\'', '')

    # Context manager guarantees the handle is closed even if write() raises.
    with open('Sort.txt', 'w', encoding='utf-8') as fp:
        fp.write(clearStr)

    # The docstring always promised the sorted dict, but nothing was returned.
    return sortDict
|
|
|
|
|
|
|
|
|
def Print_sortDict(sortDict):
    '''
    Print every (word, count) pair of sortDict beneath a header line.

    Each pair is printed on its own line as ``('word',count)``, in the
    dict's iteration order.

    :param sortDict: dict mapping word -> count
    '''
    print('\n======统计结果:======')

    # The former counter check ``i < len(sortDict)`` could never be false
    # while iterating the same dict, so the counter and its unreachable
    # ``else: break`` branch were removed; the output is unchanged.
    for x, y in sortDict.items():
        print('(\'%s\',%s)'%(x,y))
|