forked from p46318075/CodePattern
				
			
							parent
							
								
									2288c18e8a
								
							
						
					
					
						commit
						740f5aabff
					
				| @ -1,44 +0,0 @@ | |||||||
| import multiprocessing   |  | ||||||
| from collections import Counter |  | ||||||
| from cppy.cp_util import *   |  | ||||||
| 
 |  | ||||||
#
# Multiprocessing
#
def process_chunk(chunk):
    """Count the words found in one chunk of text.

    The chunk is lower-cased and passed to ``extract_str_words``; the words
    it returns are tallied in a ``Counter``, which is the return value.
    """
    # Per the original comment, extract_str_words tokenizes the text and
    # filters stop words; it is imported from cppy.cp_util and is not
    # visible here -- confirm against that module.
    lowered = chunk.lower()
    tokens = extract_str_words(lowered)
    word_counts = Counter()
    word_counts.update(tokens)
    return word_counts
|    |  | ||||||
def merge_counts(counts_list):
    """Merge several ``Counter`` objects into a single new ``Counter``.

    Args:
        counts_list: iterable of ``Counter`` objects (one per chunk).

    Returns:
        A new ``Counter`` holding the summed counts. The inputs are not
        modified. An empty iterable yields an empty ``Counter``.
    """
    total_counts = Counter()
    for counts in counts_list:
        # update() only touches the keys of ``counts``. The previous
        # ``total_counts += counts`` also rescanned the whole accumulator
        # after every merge to strip non-positive entries, so the cost grew
        # with (number of chunks) x (vocabulary size).
        total_counts.update(counts)
    # A single unary plus at the end drops zero/negative entries, which is
    # what ``+=`` produced for ordinary (positive) word counts.
    return +total_counts
| 
 |  | ||||||
def _split_on_whitespace(text, chunk_size):
    """Cut *text* into pieces of roughly *chunk_size* characters without splitting a word."""
    pieces = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        # Move the cut forward to the next whitespace character so that no
        # word straddles two pieces. Text without any whitespace therefore
        # ends up as one single piece.
        while end < total and not text[end].isspace():
            end += 1
        pieces.append(text[start:end])
        start = end
    return pieces


@timing_decorator
def main():
    """Count word frequencies of the test file using a pool of worker processes.

    Reads the file at ``testfilepath``, splits its content into chunks,
    counts the words of each chunk in a separate process via
    ``process_chunk``, merges the per-chunk counts with ``merge_counts`` and
    prints the 10 most common words.
    """
    # Read the file content.
    content = read_file(testfilepath)

    # Split the content into chunks, each handled by one worker task.
    # The original sliced at fixed 1000-character offsets, which could cut a
    # word in half at a chunk boundary and count the two fragments as
    # separate words. Cutting at whitespace avoids that.
    # NOTE(review): assumes extract_str_words never joins text across
    # whitespace -- confirm against cppy.cp_util.
    chunk_size = 1000  # approximate chunk length; tune as needed
    chunks = _split_on_whitespace(content, chunk_size)

    # Process the chunks in parallel. The ``with`` block releases the pool
    # even when pool.map raises; map() has already collected every result
    # by the time the block exits.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Merge the per-chunk counts.
    total_counts = merge_counts(counts_list)

    # Print the n most frequent words.
    print_word_freqs(total_counts.most_common(10))
| 
 |  | ||||||
| 
 |  | ||||||
# Run main() only when this file is executed as a script, not when it is
# imported. With multiprocessing this guard matters: under the "spawn" start
# method the child processes import the main module again.
if __name__ == "__main__":
    main()
|      |  | ||||||
					Loading…
					
					
				
		Reference in new issue