|
|
@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
使用 multiprocessing.Manager:
|
|
|
|
|
|
|
|
Manager 提供了一组可以在不同进程之间共享和修改的数据类型,如 list、dict、Namespace 等。
|
|
|
|
|
|
|
|
它实际上是在背后启动了一个单独的服务器进程,其他进程通过代理来访问这些共享对象。
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
# 使用 multiprocessing.Manager 来完成统计词频
|
|
|
|
|
|
|
|
# 注:改用生产者-消费者模式实现会更好
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from cppy.cp_util import *
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
from multiprocessing import Manager, Process
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stop-word collection, built once at import time and read by count_words().
# Stored as a frozenset so each per-word membership test is a hash lookup
# instead of a linear scan.
# NOTE(review): assumes get_stopwords() returns an iterable of hashable
# strings -- confirm against cppy.cp_util.
stop_words = frozenset(get_stopwords())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _merge_counts(local_counts, word_count):
    """Add every ``word -> n`` pair of *local_counts* onto *word_count*."""
    for word, n in local_counts.items():
        word_count[word] = word_count.get(word, 0) + n


def count_words(chunk, word_count, lock=None):
    """Count the qualifying words of one chunk and add them to *word_count*.

    A word qualifies when it is not in the module-level ``stop_words`` and
    has at least three characters.

    The chunk is tallied in a process-local ``Counter`` first and merged
    afterwards, so the target mapping is touched once per distinct word
    rather than once per occurrence. When *word_count* is a Manager proxy,
    every access is a round-trip to the manager process, and the
    read-then-write increment is not atomic across processes.

    Args:
        chunk: Iterable of words to count.
        word_count: Mutable mapping ``word -> count`` that receives the
            totals (for example a ``multiprocessing.Manager().dict()``
            proxy). It must support ``get`` and item assignment.
        lock: Optional context manager (for example ``Manager().Lock()``)
            held for the duration of the merge. Without it, concurrent
            writers to the same mapping can still overwrite each other's
            increments.

    Returns:
        None. *word_count* is updated in place.
    """
    local_counts = Counter(
        w for w in chunk if (not w in stop_words) and len(w) >= 3
    )
    if not local_counts:
        return

    # NOTE: word_count.update(Counter(words)) cannot be used for the merge:
    # on a dict (and on the dict proxy) update() overwrites existing values
    # instead of adding to them.
    if lock is None:
        _merge_counts(local_counts, word_count)
    else:
        with lock:
            _merge_counts(local_counts, word_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@timing_decorator
def main():
    """Count word frequencies of the test file in parallel and print the top 10.

    One worker process is started per chunk returned by
    ``get_chunks(testfilepath)``. Each worker receives its own Manager dict,
    so no two processes ever perform a read-modify-write on the same key.
    With a single shared dict, the unsynchronised
    ``d[k] = d.get(k, 0) + 1`` pattern can lose increments. The per-worker
    results are summed in the parent once all workers have finished.

    Returns:
        None. The ten most frequent words are printed to stdout.
    """
    # The context manager shuts the manager's server process down on exit.
    with Manager() as manager:
        workers = []
        partial_counts = []
        for chunk in get_chunks(testfilepath):
            partial = manager.dict()
            worker = Process(target=count_words, args=(chunk, partial))
            worker.start()
            workers.append(worker)
            partial_counts.append(partial)

        for worker in workers:
            worker.join()

        word_count = Counter()
        for partial in partial_counts:
            # copy() fetches a plain dict from the manager in one call.
            # Counter.update() adds counts for a real mapping, whereas it
            # would treat the proxy object as an iterable of keys.
            word_count.update(partial.copy())

    print("频率最高的10个词:")
    for word, count in word_count.most_common(10):
        print(f"{word}: {count}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the word count only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|