# lin/python豆瓣电影数据可视化分析/项目/word_cloud.py

import jieba
from PIL import Image
import numpy as np
from wordcloud import WordCloud
import random
from utils.query import querys
from utils.utils import typeList
import matplotlib
import re

matplotlib.use('Agg')  # 使用非交互式后端
import matplotlib.pyplot as plt


# 评论
def getImageByComments(comments):
    """Render a word cloud from comment texts and return the saved image path.

    Args:
        comments: Iterable of mappings; the text of each comment is read
            from its ``'content'`` key.

    Returns:
        The relative path ``./static/img/<random int>.png`` of the PNG that
        was written.
    """
    # Keywords to strip from the cloud; none are configured at the moment.
    exclude_words_pl = []

    # Join with newlines (instead of direct concatenation) so the last word
    # of one comment can never fuse with the first word of the next one
    # during segmentation. str.join also avoids quadratic string building.
    text = '\n'.join(i['content'] for i in comments)

    # Segment the text and separate the tokens with single spaces.
    string = ' '.join(jieba.cut(text))

    # Delete every literal occurrence of an excluded keyword. str.replace is
    # used rather than re.sub so regex metacharacters inside a keyword are
    # not interpreted as a pattern.
    for word in exclude_words_pl:
        string = string.replace(word, '')

    # Drop single-character tokens.
    filtered_string = ' '.join(word for word in string.split() if len(word) > 1)

    # No shape mask is applied, so no mask image is loaded. To shape the
    # cloud, load an image into a numpy array and pass it as ``mask=``.
    wc = WordCloud(
        background_color='white',
        # Forward slash: '.\\...' written as '.\飞...' relied on an invalid
        # escape sequence and only resolved on Windows.
        font_path='./飞波正点体.otf',
    )
    wc.generate_from_text(filtered_string)

    randomInt = random.randint(1, 100000000)
    image_path = f'./static/img/{randomInt}.png'

    # Draw and save; always close the figure so a failed save cannot leave
    # figure 1 open for the next call to draw on top of.
    plt.figure(1)
    try:
        plt.imshow(wc)
        plt.axis('off')
        plt.savefig(image_path)
    finally:
        plt.close()
    return image_path


# 标题
def getImageByAuthor(field, targetImage, resImage, exclude_words=None):
    """Build a word cloud from one column of the ``movie`` table and save it.

    Args:
        field: Column expression selected from ``movie``. It is formatted
            directly into the SQL text, so only trusted column names must
            be passed.
        targetImage: Path of a mask image. Currently unused because no mask
            is applied; kept so existing callers keep working.
        resImage: Path the rendered PNG is written to.
        exclude_words: Optional iterable of keywords; every literal
            occurrence is removed from the segmented text before rendering.

    Returns:
        None. The image is written to ``resImage``.
    """
    # NOTE(review): identifiers cannot be bound as query parameters, hence
    # the string formatting; never pass user-controlled input as `field`.
    sql = 'select {} from movie'.format(field)
    data = querys(sql, [], 'select')

    # Skip NULL values. Joining with newlines keeps words of adjacent rows
    # from fusing during segmentation and avoids quadratic concatenation.
    text = '\n'.join(row[0] for row in data if row[0] is not None)

    # Segment the text and separate the tokens with single spaces.
    string = ' '.join(jieba.cut(text))

    # Delete excluded keywords literally (str.replace, not re.sub, so regex
    # metacharacters inside a keyword are not interpreted as a pattern).
    for word in exclude_words or ():
        string = string.replace(word, '')

    # Drop single-character tokens.
    filtered_string = ' '.join(word for word in string.split() if len(word) > 1)

    wc = WordCloud(
        background_color='white',
        # Forward slash: the former '.\飞...' literal relied on an invalid
        # escape sequence and only resolved on Windows.
        font_path='./飞波正点体.otf',
    )
    wc.generate_from_text(filtered_string)

    # Draw and save; always close the figure so a failed save cannot leave
    # figure 1 open for the next call to draw on top of.
    plt.figure(1)
    try:
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(resImage)
    finally:
        plt.close()


# Generate the title word cloud while this module is being loaded; the list
# of keywords to exclude is empty.
exclude_words = []
getImageByAuthor(
    field='title',
    targetImage='./static/img/2.png',
    resImage='./static/img/title_cloud.png',
    exclude_words=exclude_words,
)


def getCastsDataTop(limit=100):
    """Return the most frequent ``casts`` values and how often each occurs.

    Args:
        limit: Maximum number of entries to return. Defaults to 100, the
            cut-off that used to be hard-coded.

    Returns:
        A ``(row, columns)`` pair of parallel lists: ``row`` holds the values
        returned by ``typeList('casts')`` (presumably actor names) ordered by
        descending frequency, ``columns`` holds the matching counts. Values
        with equal counts keep their first-seen order.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from collections import Counter

    # most_common(n) yields the same result as a stable descending sort on
    # the count followed by [:n].
    top_entries = Counter(typeList('casts')).most_common(limit)

    row = [value for value, _ in top_entries]
    columns = [count for _, count in top_entries]
    return row, columns


# 演员
# def getImageByCasts(targetImage, resImage):
#     # 假设getCastsDataTop20()返回演员名单和频率列表
#     castsList, castsFrequency = getCastsDataTop()
#     # print(castsList, castsFrequency)
#     # 将演员名单和频率结合起来，生成一个用于生成词云的文本字符串
#     text = ' '.join([name + ' ' * freq for name, freq in zip(castsList, castsFrequency)])
#
#     # 打开目标图片并获取其形状
#     img = Image.open(targetImage)
#     img_arr = np.array(img)
#
#     # 创建词云对象
#     wc = WordCloud(
#         background_color='white',
#         # mask=img_arr,
#         font_path='STHUPO.TTF'
#     )
#
#     # 生成词云
#     wc.generate_from_text(text)
#
#     # 绘制词云图
#     flg = plt.figure(1)
#     plt.imshow(wc)
#     plt.axis('off')
#
#     # 保存词云图
#     randomInt = random.randint(1, 100000000)
#     plt.savefig(resImage)
#
#     # 关闭绘图
#     plt.close()

# 演员
def getImageByCasts(targetImage, resImage):
    """Render a frequency-weighted word cloud of cast entries and save it.

    Args:
        targetImage: Path of a mask image. Currently unused because no mask
            is applied; kept so existing callers keep working.
        resImage: Path the rendered PNG is written to.

    Returns:
        None. The image is written to ``resImage``.
    """
    # getCastsDataTop() returns two parallel lists: the values and their counts.
    castsList, castsFrequency = getCastsDataTop()

    # Map each value to its frequency; the cloud is generated from this dict.
    frequency_dict = dict(zip(castsList, castsFrequency))

    wc = WordCloud(
        background_color='white',
        # Forward slash: the former '.\飞...' literal relied on an invalid
        # escape sequence and only resolved on Windows.
        font_path='./飞波正点体.otf',
        max_font_size=100,  # upper bound for the font size
        font_step=1,  # step between font sizes
        random_state=30,  # fixed seed for the randomised layout/colours
        max_words=200,  # maximum number of words drawn
    )
    wc.generate_from_frequencies(frequency_dict)

    # Draw and save; always close the figure, even if drawing or saving fails.
    try:
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')  # hide the axes
        # bbox_inches=0 is not a valid value and was silently ignored;
        # 'tight' with zero padding is what actually trims the border.
        plt.savefig(resImage, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close()


# Generate the cast and summary word clouds while this module is being
# loaded, then report success on stdout.
getImageByCasts(
    targetImage='./static/img/2.png',
    resImage='./static/img/cloud_cloud.png',
)

# getImageByAuthor('title', './static/img/1.jpg', './static/img/title_cloud.png')
getImageByAuthor(
    field='summary',
    targetImage='./static/img/2.png',
    resImage='./static/img/summary_cloud.png',
)

print('生成词云图成功！')

# def getImageByComments(comments):
#     text = ''
#     for i in comments:
#         text = text + i['content']
#
#     # 分词
#     cut = jieba.cut(text)
#     string = ' '.join(cut)
#
#     img = Image.open('./static/img/2.png')
#     img_arr = np.array(img)
#     wc = WordCloud(
#         background_color='white',
#         mask=img_arr,
#         font_path='STHUPO.TTF'
#     )
#     wc.generate_from_text(string)
#
#     # 绘制图片
#     flg = plt.figure(1)
#     plt.imshow(wc)
#     plt.axis('off')
#
#     randomInt = random.randint(1, 100000000)
#     plt.savefig(f'./static/img/{randomInt}.png')
#     return f'./static/img/{randomInt}.png'
#
#
# def getImageByAuthor(field, targetImage, resImage):
#     sql = 'select {} from movie'.format(field)
#     data = querys(sql, [], 'select')
#     text = ''
#     for i in data:
#         text = text + i[0]
#
#     # 分词
#     cut = jieba.cut(text)
#     string = ' '.join(cut)
#
#     img = Image.open(targetImage)
#     img_arr = np.array(img)
#     wc = WordCloud(
#         background_color='white',
#         mask=img_arr,
#         font_path='STHUPO.TTF'
#     )
#     wc.generate_from_text(string)
#
#     # 绘制图片
#     flg = plt.figure(1)
#     plt.imshow(wc)
#     plt.axis('off')
#
#     randomInt = random.randint(1, 100000000)
#     plt.savefig(resImage)
#
#
# getImageByAuthor('title', './static/img/2.png', './static/img/title_cloud.png')
# getImageByAuthor('summary', './static/img/2.png', './static/img/summary_cloud.png')
#
#
# def getImageByCasts(targetImage, resImage):
#     castsList = typeList('casts')
#     text = ''
#     for i in castsList:
#         text = text + i
#
#     # 分词
#     cut = jieba.cut(text)
#     string = ' '.join(cut)
#
#     img = Image.open(targetImage)
#     img_arr = np.array(img)
#     wc = WordCloud(
#         background_color='white',
#         mask=img_arr,
#         font_path='STHUPO.TTF'
#     )
#     wc.generate_from_text(string)
#
#     # 绘制图片
#     flg = plt.figure(1)
#     plt.imshow(wc)
#     plt.axis('off')
#
#     randomInt = random.randint(1, 100000000)
#     plt.savefig(resImage)
#
#
# getImageByCasts('./static/img/2.png', './static/img/cloud_cloud.png')