From 2288c18e8af1167598bff44d8e6bda56326feb15 Mon Sep 17 00:00:00 2001
From: pbr4nzfkh <18879212807@163.com>
Date: Sun, 17 Mar 2024 10:20:58 +0800
Subject: [PATCH]
 =?UTF-8?q?Delete=20'=E8=AE=A1=E7=AE=97=E8=AE=BE=E5=A4=87/?=
 =?UTF-8?q?map-reduce/tf=5F91.py'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 计算设备/map-reduce/tf_91.py | 54 --------------------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 计算设备/map-reduce/tf_91.py

diff --git a/计算设备/map-reduce/tf_91.py b/计算设备/map-reduce/tf_91.py
deleted file mode 100644
index df5add8..0000000
--- a/计算设备/map-reduce/tf_91.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import threading
-from collections import Counter
-from cppy.cp_util import *
-
-#
-# Multithreading
-#
-def process_chunk(start, end, text, result_index, results):
-    # Tokenize the slice and filter out stop words
-    words = extract_str_words( text[start:end] )
-    results[result_index] = Counter(words)
-
-def merge_counts(counts_list):
-    # Merge multiple Counter objects
-    total_counts = Counter()
-    for counts in counts_list:
-        total_counts += counts
-    return total_counts
-
-@timing_decorator
-def main():
-    # Read the file contents
-    text = read_file(testfilepath)
-
-    # Decide on the number of threads
-    num_threads = 4
-    text_length = len(text)
-    chunk_size = text_length // num_threads
-
-    # Hold each thread's result
-    results = [None] * num_threads
-    threads = []
-
-    # Create and start the threads
-    for i in range(num_threads):
-        start = i * chunk_size
-        # Make sure the last thread reads to the end of the file
-        end = text_length if i == num_threads - 1 else (i + 1) * chunk_size
-        t = threading.Thread(target=process_chunk, args=(start, end, text, i, results))
-        threads.append(t)
-        t.start()
-
-    # Wait for all threads to finish
-    for t in threads: t.join()
-
-    # Merge the per-thread counts
-    total_counts = merge_counts(results)
-
-    # Print the n most frequent words
-    print_word_freqs( total_counts.most_common(10) )
-
-
-if __name__ == '__main__':
-    main()
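
Note: the deleted tf_91.py split the text into fixed-size chunks, counted the words
of each chunk in its own thread (map), and merged the per-thread Counters (reduce).
For reference, below is a minimal, self-contained sketch of the same threaded
map-reduce pattern using only the standard library. The regex tokenizer, the
count_words helper, and the sample text are illustrative stand-ins; the original
relied on extract_str_words, read_file, testfilepath, timing_decorator and
print_word_freqs from cppy.cp_util, which are not reproduced here.

import re
import threading
from collections import Counter

def tokenize(s):
    # Stand-in for extract_str_words: lowercase word tokens, no stop-word filtering.
    return re.findall(r"[a-z']+", s.lower())

def process_chunk(start, end, text, index, results):
    # Map step: each thread counts the words in its own slice of the text.
    results[index] = Counter(tokenize(text[start:end]))

def count_words(text, num_threads=4):
    chunk_size = len(text) // num_threads
    results = [None] * num_threads
    threads = []
    for i in range(num_threads):
        start = i * chunk_size
        # The last thread takes the remainder of the text.
        end = len(text) if i == num_threads - 1 else (i + 1) * chunk_size
        t = threading.Thread(target=process_chunk,
                             args=(start, end, text, i, results))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    # Reduce step: merge the per-thread Counters into one.
    return sum(results, Counter())

if __name__ == '__main__':
    sample = "the quick brown fox jumps over the lazy dog " * 100
    print(count_words(sample).most_common(5))

As in the original, splitting on raw character offsets can cut a word in two at a
chunk boundary; a more careful version would align chunk ends to whitespace.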