You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

61 lines
2.3 KiB

8 months ago
'''
使用 multiprocessing.Manager:
Manager 提供了一个可以在不同进程之间共享和修改的数据类型 list, dict, Namespace
它实际上是在背后启动了一个单独的服务器进程其他进程通过代理来访问这些共享对象
8 months ago
使用 multiprocessing.Manager 来完成统计词频
需要注意
- Manager() 必须用函数包起来,不能按脚本随便放外面否则会提示freeze_support
- 工作函数需要放到外面不能做内部函数否则会提示参数错误
- 无法在 Jupyter 类似环境运行
8 months ago
'''
from cppy.cp_util import *
from collections import Counter
from multiprocessing import Manager, Process
stop_words = get_stopwords()
8 months ago
def process_chunk(shared_chunks,word_count):
while True:
try:
chunk = shared_chunks.pop(0) # 从共享列表中取出一个数据块
if chunk is None: break # 如果取出的是None表示所有数据块已处理完毕
words = extract_str_words(chunk)
for word in words:
if word not in stop_words:
word_count[word] = word_count.get(word, 0) + 1
except Exception as e:
print(e)
break
8 months ago
@timing_decorator
8 months ago
def main():
# 创建一个Manager实例
manager = Manager()
shared_chunks = manager.list()
8 months ago
word_count = manager.dict()
8 months ago
# 读取文件并按块大小分割,将块添加到共享列表中
chunk_size = 1024 * 10 # 假设每个块是10KB可以根据需要调整
with open(testfilepath, 'r', encoding='utf-8') as f:
while True:
chunk = f.read(chunk_size)
if not chunk: break
shared_chunks.append(chunk)
shared_chunks.append(None)
print('-------------------',len(shared_chunks))
processes = [ Process( target=process_chunk,
args=(shared_chunks,word_count))
for _ in range( 4 ) ] # 假设启动4个工作进程
for p in processes: p.start()
8 months ago
for p in processes: p.join()
8 months ago
# 将Manager类型的字典转换为普通的字典以便使用Counter
word_count = dict(word_count)
word_freqs = Counter(word_count).most_common(10)
print_word_freqs(word_freqs)
8 months ago
if __name__ == '__main__':
main()