"""Count word frequencies of one text by splitting it across several threads.

Note from the original author: if, instead of handing slices of a single
text to the threads, each thread were given its own file to process (so
that real I/O is involved), the benefit of multithreading would become
visible.
"""
import threading
from collections import Counter
from cppy.cp_util import *


# Loaded once at import time and shared read-only by all worker threads.
stop_words = get_stopwords()


def count_words(start, end, text, result_index, results):
    """Count the non-stop-words in ``text[start:end]``.

    The resulting ``Counter`` is stored in ``results[result_index]`` rather
    than returned, because the function is used as a thread target.

    Args:
        start: Index of the first character of the slice to process.
        end: Index one past the last character of the slice to process.
        text: The full text being analysed.
        result_index: Slot of ``results`` this call writes to.
        results: Shared list collecting one ``Counter`` per worker.
    """
    words = re_split(text[start:end])
    results[result_index] = Counter(w for w in words if w not in stop_words)


def _chunk_bounds(text, num_chunks):
    """Return ``num_chunks + 1`` cut positions that never fall inside a word.

    Each interior cut starts at the even-split position and is pushed
    forward to the next whitespace character, so a word straddling the
    naive boundary is kept whole in the earlier chunk instead of being
    counted as two fragments.
    """
    length = len(text)
    size = length // num_chunks
    bounds = [0]
    for i in range(1, num_chunks):
        # Never move backwards past the previous cut (a very long token may
        # already have pushed it beyond this chunk's nominal start).
        pos = max(i * size, bounds[-1])
        while pos < length and not text[pos].isspace():
            pos += 1
        bounds.append(pos)
    # The last chunk always runs to the end of the text.
    bounds.append(length)
    return bounds


def main():
    """Read the test file, count words in parallel and print the top 10."""
    # Read the file content.
    text = read_file(testfilepath)

    # Decide how many worker threads to use.
    num_threads = 4
    bounds = _chunk_bounds(text, num_threads)

    # One result slot per thread.
    results = [None] * num_threads
    threads = []

    # Create and start the threads.
    for i in range(num_threads):
        t = threading.Thread(
            target=count_words,
            args=(bounds[i], bounds[i + 1], text, i, results),
        )
        threads.append(t)
        t.start()

    # Wait for every thread to finish.
    for t in threads:
        t.join()

    # Merge the per-thread counters.
    total_count = Counter()
    for result in results:
        total_count += result

    # Print the 10 most frequent words.
    for w, c in total_count.most_common(10):
        print(w, '--', c)


if __name__ == '__main__':
    main()