import time

import imageio  # reads a local image to shape the word cloud (currently unused here)
import jieba  # Chinese word segmentation
import matplotlib.pyplot as plt
import pandas as pd
import wordcloud  # word-cloud rendering


def break_down_words(file_path, stopwords='', output_txt='stop.txt', output_turly=False):
    """Segment the comments in an Excel file and return the set of valid words.

    Args:
        file_path: path to an Excel file containing a '包含关键词的弹幕' column.
        stopwords: container of words to drop (tested with ``in``).
        output_txt: path of the TXT file the words are dumped to.
        output_turly: when True, also write the words to ``output_txt``.
            NOTE(review): name kept as-is for backward compatibility;
            presumably a typo of "output_truly".

    Returns:
        set[str]: segmented words of length >= 2 that are not stopwords.
    """
    df = pd.read_excel(file_path)
    # Join every comment into one string so jieba can segment it in one pass.
    text = ' '.join(df['包含关键词的弹幕'].astype(str))
    tokens = jieba.lcut(text)
    # Keep only words of at least two characters that are not stopwords.
    filtered_words = {w for w in tokens if w not in stopwords and len(w) >= 2}
    if output_turly:
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.writelines(f"{word}\n" for word in filtered_words)
        print(f"有效词已保存到 {output_txt}")
    return filtered_words


def load_stopwords_from_txt(file_path):
    """Load stopwords from a TXT file: one word per line, blank lines skipped.

    Args:
        file_path: path to a UTF-8 text file.

    Returns:
        set[str]: the non-empty, stripped lines of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}


def generate_weighted_text_from_counts(file_path, stopwords):
    """Build a frequency-weighted text string from a keyword-count Excel file.

    Rows are treated as sorted by importance: the word in row ``i`` receives
    weight ``total_rows - i`` (halved once when greater than 2), so earlier
    rows dominate the resulting word cloud.

    Args:
        file_path: path to an Excel file containing a '关键词' column.
        stopwords: container of words to exclude.

    Returns:
        str: space-separated words, each repeated according to its weight.
    """
    df = pd.read_excel(file_path)
    total_count = df.shape[0]
    weighted_words = []
    # NOTE(review): the weight computation relies on ``index`` being the row
    # position (default RangeIndex from read_excel) — confirm the source file
    # is read without an explicit index column.
    for index, row in df.iterrows():
        word = row['关键词']
        count = total_count - index
        if count > 2:  # same condition as the original ``count / 2 > 1``
            count //= 2
        if word not in stopwords and len(word) >= 2:
            # Repeat the word ``count`` times so WordCloud sizes it by weight.
            weighted_words.extend([word] * count)
    return ' '.join(weighted_words)


def generate_wordcloud(filtered_words, weighted_text, output_path='词云.png'):
    """Render, save, and display a word cloud.

    Args:
        filtered_words: iterable of valid words (each appears at least once).
        weighted_text: space-separated words already repeated by frequency.
        output_path: file the PNG image is written to.
    """
    # Append the plain word set so every valid word shows up at least once.
    combined_text = weighted_text + ' ' + ' '.join(filtered_words)
    wc = wordcloud.WordCloud(
        width=600,
        height=300,
        background_color='white',
        font_path='msyh.ttc',  # font file that supports CJK glyphs
        max_font_size=200,
        min_font_size=10,
        colormap='cool',
        scale=4,
    )
    wc.generate(combined_text)
    wc.to_file(output_path)
    # Also show the image on screen.
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()


def main_word_could():
    """Run the full pipeline: stopwords -> segmentation -> weighting -> cloud."""
    stopwords = load_stopwords_from_txt('stop.txt')
    # Segment the AI-related comments into valid words.
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    print('词语已生成')
    # Build the frequency-weighted text from keyword counts.
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    print('加权文本已生成')
    generate_wordcloud(filtered_words, weighted_text)


def main_word_could2():
    """Same pipeline as ``main_word_could``, printing each step's wall-clock time."""
    start_time = time.time()
    stopwords = load_stopwords_from_txt('stop.txt')
    end_time = time.time()
    print(f"load_stopwords_from_txt函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    filtered_words = break_down_words('ai_sentences.xlsx', stopwords)
    end_time = time.time()
    print(f"break_down_words函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    weighted_text = generate_weighted_text_from_counts('keyword_counts.xlsx', stopwords)
    end_time = time.time()
    print(f"generate_weighted_text_from_counts函数耗时: {end_time - start_time:.4f} 秒")

    start_time = time.time()
    generate_wordcloud(filtered_words, weighted_text)
    end_time = time.time()
    print(f"generate_wordcloud函数耗时: {end_time - start_time:.4f} 秒")


if __name__ == '__main__':
    main_word_could()