"""Count word frequencies in a text file using a pool of worker processes.

The text is split into chunks, each chunk is tokenized and counted in a
separate process (map), and the per-chunk counters are merged (reduce).
"""
import multiprocessing
from collections import Counter

from cppy.cp_util import *


#
# Multiprocessing
#
def process_chunk(chunk):
    """Return a Counter of the words found in one chunk of text.

    The chunk is lower-cased and handed to ``extract_str_words`` (from
    ``cppy.cp_util``), which presumably tokenizes the text and drops stop
    words -- confirm against that helper.
    """
    words = extract_str_words(chunk.lower())
    return Counter(words)


def merge_counts(counts_list):
    """Merge an iterable of Counter objects into a single Counter."""
    total_counts = Counter()
    for counts in counts_list:
        total_counts += counts
    return total_counts


def _split_into_chunks(content, chunk_size):
    """Split *content* into pieces of roughly *chunk_size* characters.

    Each cut is pushed forward to the next whitespace character so that a
    word is never divided between two chunks; cutting at a fixed offset
    would make the two halves of a word count as separate tokens.
    Concatenating the returned chunks reproduces *content* exactly.

    Raises:
        ValueError: if *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    chunks = []
    length = len(content)
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        # Extend the chunk until it ends right before whitespace (or at EOF).
        while end < length and not content[end].isspace():
            end += 1
        chunks.append(content[start:end])
        start = end
    return chunks


@timing_decorator
def main():
    """Read the test file, count words in parallel and print the top 10."""
    # Read the file content.
    content = read_file(testfilepath)

    # Split the content into chunks; each chunk is handled by one task.
    chunk_size = 1000  # Approximate chunk length; tune as needed.
    chunks = _split_into_chunks(content, chunk_size)

    # Process the chunks in parallel. The context manager guarantees the
    # worker processes are cleaned up even if a task raises.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Merge the per-chunk counts.
    total_counts = merge_counts(counts_list)

    # Print the n most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()