# -*- coding: utf-8 -*-
"""Word-frequency helpers: segment a text with jieba, count and rank the words.

The functions are meant to be called in sequence.  ``Jieba`` changes the
process working directory to the result directory, and ``Turn`` / ``Sort``
then read and write ``Jieba.txt`` / ``Sort.txt`` relative to it.
"""
import os
import re
from collections import Counter

# Tokens dropped from the segmented text.  The ASCII entries are the ones
# listed by the original replace-chain; the full-width forms are included as
# well because they are what Chinese text normally contains.
_PUNCTUATION = frozenset([
    ':', ';', '!', ',', '.', '(', ')', '-',
    ':', ';', '!', ',', '.', '(', ')',
    '、', '。', '“', '”', '《', '》', '—',
])

# Runs of two or more spaces left behind by removed tokens or by whitespace
# tokens.  Newlines are deliberately not matched so line structure is kept.
_SPACE_RUN = re.compile(r' {2,}')


def Create_path():
    """Create a ``Result`` directory under the current working directory.

    The directory is meant to hold Jieba.txt, Sort.txt and the word-cloud
    image.

    :return: path of the directory, ending with the platform path separator,
        or ``None`` if the directory could not be created (a message is
        printed in that case).
    """
    # The trailing '' keeps the trailing separator the original string had,
    # so callers that append a file name directly keep working.
    newPath = os.path.join(os.getcwd(), 'Result', '')
    try:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(newPath, exist_ok=True)
    except OSError as msg:
        print("新建目录失败:" + str(msg))
        return None
    return newPath


def _join_without_punctuation(tokens):
    """Join *tokens* with single spaces, leaving out punctuation tokens."""
    kept = (token for token in tokens if token not in _PUNCTUATION)
    return _SPACE_RUN.sub(' ', ' '.join(kept))


def Jieba(fileName, filePath):
    """Segment a UTF-8 text file with jieba and save the words to Jieba.txt.

    Side effect: the process working directory is changed to *filePath*;
    ``Turn`` and ``Sort`` rely on this to find their files.

    :param fileName: path of the input text file (resolved before the
        working directory is changed)
    :param filePath: directory in which Jieba.txt is written
    :return: None
    """
    # Imported here so the counting/sorting helpers of this module can be
    # used without the third-party package installed.
    import jieba

    with open(fileName, 'r', encoding='utf-8') as f:
        result = _join_without_punctuation(jieba.cut(f.read()))

    os.chdir(filePath)  # later steps open their files relative to this dir
    with open('Jieba.txt', 'w', encoding='utf-8') as fp:
        fp.write(result)


def Turn():
    """Read Jieba.txt from the working directory into a flat list of words.

    :return: list of words in file order; blank lines and repeated
        whitespace never produce empty entries
    """
    with open('Jieba.txt', 'r', encoding='utf-8') as f:
        # split() with no argument splits on any whitespace run, so no ''
        # items end up being counted as words.
        return f.read().split()


def Account(wordList):
    """Count how often each word occurs in *wordList*.

    :param wordList: list of words to count
    :return: dict mapping word -> count, keys in order of first appearance
    """
    return dict(Counter(wordList))


def Sort(accountDict):
    """Rank the counts in descending order, print them and save to Sort.txt.

    Words with equal counts keep their relative order from *accountDict*.

    :param accountDict: dict mapping word -> count
    :return: new dict with the same items, highest count first
    """
    ranked = sorted(accountDict.items(), key=lambda item: item[1], reverse=True)
    sortDict = dict(ranked)
    Print_sortDict(sortDict)

    # File format: the dict's repr with the quote characters removed.
    clearStr = str(sortDict).replace('\'', '')
    with open('Sort.txt', 'w', encoding='utf-8') as fp:
        fp.write(clearStr)
    return sortDict


def Print_sortDict(sortDict):
    """Print every (word, count) pair of the ranked dict, one per line.

    :param sortDict: dict mapping word -> count, already in display order
    """
    print('\n======统计结果:======')
    for word, count in sortDict.items():
        print('(\'%s\',%s)' % (word, count))