From 740f5aabff3ab4b2f8990cadab7c3af7f17c0f34 Mon Sep 17 00:00:00 2001 From: pbr4nzfkh <18879212807@163.com> Date: Sun, 17 Mar 2024 10:21:08 +0800 Subject: [PATCH] =?UTF-8?q?Delete=20'=E8=AE=A1=E7=AE=97=E8=AE=BE=E5=A4=87/?= =?UTF-8?q?map-reduce/tf=5F92.py'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 计算设备/map-reduce/tf_92.py | 44 -------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 计算设备/map-reduce/tf_92.py diff --git a/计算设备/map-reduce/tf_92.py b/计算设备/map-reduce/tf_92.py deleted file mode 100644 index 525181e..0000000 --- a/计算设备/map-reduce/tf_92.py +++ /dev/null @@ -1,44 +0,0 @@ -import multiprocessing -from collections import Counter -from cppy.cp_util import * - -# -# 多进程 -# -def process_chunk(chunk): - # 切词并过滤停用词 - words = extract_str_words( chunk.lower() ) - return Counter(words) - -def merge_counts(counts_list): - # 合并多个Counter对象 - total_counts = Counter() - for counts in counts_list: - total_counts += counts - return total_counts - -@timing_decorator -def main(): - # 读取文件内容 - content = read_file(testfilepath) - - # 分割文件内容为多个块,每个块由一个进程处理 - chunk_size = 1000 # 可以根据实际情况调整块大小 - chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] - - # 使用多进程处理每个块 - pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) - counts_list = pool.map(process_chunk, chunks) - pool.close() - pool.join() - - # 合并计数 - total_counts = merge_counts(counts_list) - - # 输出最高频的n个词 - print_word_freqs( total_counts.most_common(10) ) - - -if __name__ == '__main__': - main() - \ No newline at end of file