import threading
from collections import Counter
from cppy.cp_util import *

"""
 把切分数据片段分给多线程改为分配多个文件给多个线程（有IO操作），就能看到效果了
"""

# Stop words excluded from the counts. Built once at module import and read
# (never modified) by count_words in every worker thread.
# NOTE(review): get_stopwords presumably comes from the cppy.cp_util star import — confirm.
stop_words = get_stopwords()  

def count_words(start, end, text, result_index, results):
    """Count the non-stop-words in ``text[start:end]`` and store the tally.

    Designed to be used as a thread target: nothing is returned; the
    resulting ``Counter`` is written to ``results[result_index]``.

    Args:
        start: Index of the first character of the slice to process.
        end: Index one past the last character of the slice to process.
        text: The full text; only ``text[start:end]`` is tokenised.
        result_index: Slot in ``results`` that receives this call's tally.
        results: Mutable sequence shared with the caller; one slot per worker.
    """
    # Snapshot the module-level stop words into a set so each membership test
    # is a hash lookup rather than a scan of the original container.
    # Assumes stop_words is an iterable of hashable tokens — TODO confirm
    # against get_stopwords.
    excluded = frozenset(stop_words)
    words = re_split(text[start:end])
    results[result_index] = Counter(w for w in words if w not in excluded)

def chunk_bounds(text, num_chunks):
    """Split ``text`` into ``num_chunks`` contiguous ``(start, end)`` ranges.

    Interior cut points start at multiples of ``len(text) // num_chunks`` and
    are then moved forward to the next whitespace character, so a word is
    never divided between two chunks. The last range always ends at
    ``len(text)``. Ranges may be empty when the text is short or contains no
    whitespace after a cut point.

    Args:
        text: The string to partition.
        num_chunks: Number of ranges to produce (expected to be >= 1).

    Returns:
        A list of ``num_chunks`` ``(start, end)`` tuples that together cover
        ``text`` exactly once, in order.
    """
    length = len(text)
    size = length // num_chunks
    bounds = []
    start = 0
    for i in range(num_chunks):
        if i == num_chunks - 1:
            # The final chunk absorbs whatever is left, including the remainder.
            end = length
        else:
            # Never move backwards past the previous (already advanced) cut.
            end = max(start, (i + 1) * size)
            # Slide the cut forward until it sits on whitespace (or the end),
            # so the word currently under the cut stays in this chunk.
            while end < length and not text[end].isspace():
                end += 1
        bounds.append((start, end))
        start = end
    return bounds


if __name__ == '__main__':
    # Read the whole input text.
    text = read_file(testfilepath)

    # Number of worker threads.
    num_threads = 4

    # One result slot per thread; each worker writes only to its own index.
    results = [None] * num_threads
    threads = []

    # Create and start one thread per chunk. Chunk edges are aligned to
    # whitespace so that no word is split across two workers.
    # NOTE(review): assumes re_split treats whitespace as a word separator — confirm.
    for i, (start, end) in enumerate(chunk_bounds(text, num_threads)):
        t = threading.Thread(target=count_words, args=(start, end, text, i, results))
        threads.append(t)
        t.start()

    # Wait for all threads to finish.
    for t in threads:
        t.join()

    # Merge the per-thread tallies.
    total_count = Counter()
    for i, result in enumerate(results):
        if result is None:
            # The worker never stored a tally (it must have raised); fail
            # loudly instead of reporting partial counts.
            raise RuntimeError(f"worker thread {i} produced no result")
        total_count.update(result)

    # Print the 10 most frequent words.
    for w, c in total_count.most_common(10):
        print(w, '--', c)