# ciyun/main.py

# -*- coding=utf-8 -*-

import os
import glob
import os
import jieba
import wordcloud
from wordcloud import STOPWORDS
from matplotlib import pyplot as plt

####################################################################################
# Check whether a string consists solely of Chinese characters.
def is_all_chinese(strs):
    """Return True when every character of ``strs`` is a Chinese character.

    A character counts as Chinese when it lies in U+4E00..U+9FFF, the same
    range ``is_chinese`` tests, so the two predicates cannot disagree on
    code points U+9FA6..U+9FFF.

    Args:
        strs: The text to inspect.

    Returns:
        False as soon as one character falls outside the range (whitespace
        and punctuation included); True otherwise, which means an empty
        string also yields True.
    """
    return all('\u4e00' <= _char <= '\u9fff' for _char in strs)
# Check whether a string contains at least one Chinese character.
def is_chinese(strs):
    """Return True if any character of ``strs`` lies in U+4E00..U+9FFF.

    Args:
        strs: The text to inspect.

    Returns:
        True when at least one character is inside the range, False
        otherwise (including for an empty string).
    """
    low, high = '\u4e00', '\u9fff'
    return any(low <= symbol <= high for symbol in strs)
####################################################################################


'''
纯中文词云
'''
def word_cloud_Chinese(file):
    """Build, save and display a word cloud for a Chinese text file.

    The text is segmented with ``jieba.lcut``, the tokens are joined with
    spaces and handed to ``wordcloud.WordCloud``. The image is written to
    ``123.png`` and then shown with matplotlib.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.

    Note:
        Stop words are read from ``cn_stopwords.txt`` (one entry per line)
        and the font from ``STSONG.TTF``; both paths are given without a
        directory component.
    """
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()

    # The context manager closes the stop-word file, which was previously
    # left open after being read.
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as stop_file:
        stopwords = {line.strip() for line in stop_file}

    # WordCloud splits on whitespace, so the segmented tokens are re-joined
    # with single spaces.
    txt = " ".join(jieba.lcut(t))
    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=700,
                            height=700,
                            background_color="white",
                            stopwords=stopwords)
    w.generate(txt)
    w.to_file("123.png")

    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()


'''
纯英文词云
'''


def word_cloud_English(file):
    """Build, save and display a word cloud for an English text file.

    The raw text is passed to ``wordcloud.WordCloud`` with the library's
    built-in ``STOPWORDS`` set. The image is written to ``123.png`` and
    then shown with matplotlib.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    # ``with`` guarantees the handle is released even when decoding fails.
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()

    w = wordcloud.WordCloud(font_path="arial.ttf",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=STOPWORDS)
    w.generate(t)
    w.to_file("123.png")

    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()


'''
中英混合词云
'''


def word_cloud_English_and_Chinese(file):
    """Build, save and display a word cloud for mixed Chinese/English text.

    The text is segmented with ``jieba.lcut`` and re-joined with spaces
    before being given to ``wordcloud.WordCloud`` (``collocations=False``).
    The image is written to ``123.png`` and then shown with matplotlib.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.

    Note:
        Stop words are read from ``cn_stopwords.txt`` (one entry per line)
        and the font from ``STSONG.TTF``; both paths are given without a
        directory component.
    """
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()

    # The context manager closes the stop-word file, which was previously
    # left open after being read.
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as stop_file:
        stopwords = {line.strip() for line in stop_file}

    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=stopwords,
                            collocations=False
                            )
    t = " ".join(jieba.lcut(t))
    w.generate(t)
    w.to_file("123.png")

    plt.imshow(w, interpolation='bilinear')
    # Finish the figure the same way the Chinese-only and English-only
    # variants do; without plt.show() the image was never displayed.
    plt.axis('off')
    plt.tight_layout()
    plt.show()
#################################################################################################

'''
纯中文词频计数
'''


def Chineseword(file):
    """Print the word frequencies of a Chinese text file, most frequent first.

    Separator characters are turned into spaces, the text is segmented with
    ``jieba.lcut`` (precise mode) and every token whose length is not 1 is
    counted. Each ``(word, count)`` pair is printed on its own line, ordered
    by descending count.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    with open(file, "r", encoding='utf-8') as fb:
        txt = fb.read()

    # Same characters the original literal contained (the backslash came
    # from its invalid "\”" escape), written here without that escape.
    separators = " ，。：；,《》！？“\\”'\n"
    # Replace with a space rather than deleting, so that the clauses on both
    # sides of a separator are not fused before segmentation.
    txt = txt.translate(str.maketrans(separators, " " * len(separators)))
    words = jieba.lcut(txt)  # precise-mode segmentation

    counts = {}  # word -> number of occurrences
    for word in words:
        if len(word) == 1:
            # Single characters (including the separator spaces) are skipped.
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    for item in sorted(counts.items(), key=lambda pair: pair[1], reverse=True):
        print(item)


'''
纯英文词频计数
'''


def Englishword(file):
    """Print the word frequencies of an English text file, most frequent first.

    Every line is lower-cased and split on whitespace; punctuation is not
    removed, so ``"word,"`` and ``"word"`` are counted separately. Each
    ``(count, word)`` tuple is printed on its own line in descending tuple
    order, i.e. by count and then by word, both descending.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    wordfile = {}  # word -> number of occurrences
    # ``with`` closes the file even if reading or decoding raises.
    with open(file, 'r', encoding="utf-8") as fb:
        for line in fb:
            for word in line.lower().split():
                wordfile[word] = wordfile.get(word, 0) + 1

    wordfrehigh = sorted(((fy, wd) for wd, fy in wordfile.items()),
                         reverse=True)
    for entry in wordfrehigh:
        print(entry)


'''
中英混合词频计数
'''


def English_and_Chinese(file):
    """Print word frequencies of a mixed Chinese/English text file.

    The text is segmented with ``jieba.lcut``, re-joined with spaces and
    lower-cased; punctuation is then replaced by spaces and the result is
    split on whitespace. Each ``(count, word)`` tuple is printed on its own
    line in descending tuple order, i.e. by count and then by word.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    # ``with`` closes the file even if reading or decoding raises.
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()

    t = " ".join(jieba.lcut(t)).lower()

    # Characters of the original set, de-duplicated, plus the missing
    # partners ‘ and ” of the quotes ’ and “ that were already stripped.
    punctuation = "，。？：；‘’“”！—、~,《》.-?;:'\"! \n"
    t = t.translate(str.maketrans(punctuation, " " * len(punctuation)))

    wordfile = {}  # word -> number of occurrences
    # A single whitespace split also drops the empty pieces left by
    # consecutive separators.
    for word in t.split():
        wordfile[word] = wordfile.get(word, 0) + 1

    wordfrehigh = sorted(((fy, wd) for wd, fy in wordfile.items()),
                         reverse=True)
    for entry in wordfrehigh:
        print(entry)
###########################################################################################################

# Interactive entry point: list the text files of a folder, classify each
# one (Chinese / English / mixed), then render the word cloud and print the
# word frequencies of the file the user picks.
if __name__ == '__main__':
    print("欢迎使用小浣熊词云转换器")
    print('''使用介绍：
    1.将你想要转换成词云图的文本放入一个文件夹
    2.告诉我们这个文件夹的地址
    3.确认是否将文本导入（是/否）
    4.我们会用序号标好您的所有文本，由您用序号选择转换哪一个文本

    开发团队：
    李世健，卢婉梅，李子祥，
    鲁朕家，兰晶晶，闭玉婷''')

    print("请输入目标文件夹：")
    targetfile = input()
    print('是否已将文本导入')
    fa = input()
    while True:

        if fa == '是':
            path = targetfile
            # Keep regular files only: a sub-directory cannot be opened as
            # text and used to abort the listing.
            files = [name for name in os.listdir(path)
                     if os.path.isfile(os.path.join(path, name))]
            # One tag per listed file, in the same order as ``files``:
            # 'z' = Chinese only, 'y' = English only, 'm' = mixed.
            judg = []

            for i, file in enumerate(files, start=1):
                # os.path.join uses the platform's separator instead of a
                # hard-coded backslash.
                position = os.path.join(path, file)
                print(i, '--- ', end='')
                print(file, end='')

                with open(position, "r", encoding='utf-8') as f:
                    data = f.read()
                if is_all_chinese(data):
                    print("  (纯中文)")
                    judg.append('z')
                elif is_chinese(data):
                    print("  (有英文有中文)")
                    judg.append('m')
                else:
                    print("  (纯英文)")
                    judg.append('y')

            if not files:
                # Nothing to choose from; fall through to the exit prompt.
                print("文件夹中没有可用的文本")
            else:
                print("输入你要选择的文本")
                try:
                    atwo = int(input())
                except ValueError:
                    # Non-numeric input is handled like an out-of-range index.
                    atwo = 0

                if 1 <= atwo <= len(files):
                    # The listing is 1-based, the lists are 0-based.
                    resultlj = os.path.join(path, files[atwo - 1])
                    print(resultlj)
                    kind = judg[atwo - 1]
                    print(kind)
                    if kind == 'z':
                        word_cloud_Chinese(resultlj)
                        Chineseword(resultlj)
                    elif kind == 'y':
                        word_cloud_English(resultlj)
                        Englishword(resultlj)
                    else:
                        word_cloud_English_and_Chinese(resultlj)
                        English_and_Chinese(resultlj)
                else:
                    print("输入的序号无效")
        print("是否退出程序")
        ans = input()
        if ans == '是':
            break