You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

49 lines
1.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import threading
from collections import Counter
from cppy.cp_util import *
"""
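Change this from distributing slices of a single text across threads to distributing multiple files across threads; the benefit only shows up once I/O is involved.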
"""
# Stop words that count_words filters out of every chunk. get_stopwords is not
# defined in this file; presumably it comes from the cppy.cp_util star import
# (verify).
stop_words = get_stopwords()
# Worker that computes the word frequencies for one slice of the text.
def count_words(start, end, text, result_index, results):
    """Count the non-stop-words found in ``text[start:end]``.

    The slice is passed through ``re_split``; every item that is not in the
    module-level ``stop_words`` is tallied in a ``Counter``, which is then
    stored in ``results[result_index]``. Nothing is returned: the caller
    reads the outcome from ``results``.
    """
    tokens = re_split(text[start:end])
    results[result_index] = Counter(
        token for token in tokens if token not in stop_words
    )
def chunk_bounds(text, num_chunks):
    """Split ``text`` into ``num_chunks`` contiguous ``(start, end)`` ranges.

    The ranges cover the whole text without gaps or overlap. Every interior
    boundary is moved forward to the next whitespace character (or to the end
    of the text), so no word is cut in half between two chunks. Cutting at
    fixed character offsets would turn a word that straddles a boundary into
    two bogus fragments, each counted separately.

    Args:
        text: The string to partition.
        num_chunks: Number of ranges to produce; must be at least 1.

    Returns:
        A list of ``num_chunks`` ``(start, end)`` tuples. Trailing ranges may
        be empty when the text holds fewer whitespace-separated runs than
        ``num_chunks``.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    text_length = len(text)
    chunk_size = text_length // num_chunks
    bounds = []
    start = 0
    for i in range(num_chunks):
        if i == num_chunks - 1:
            # The last chunk always runs to the end of the text.
            end = text_length
        else:
            # Never move backwards: an earlier boundary may already have been
            # pushed past this chunk's nominal end by a long word.
            end = max(start, (i + 1) * chunk_size)
            while end < text_length and not text[end].isspace():
                end += 1
        bounds.append((start, end))
        start = end
    return bounds


if __name__ == '__main__':
    # Read the whole input; read_file and testfilepath are not defined in this
    # file and presumably come from the cppy.cp_util star import (verify).
    text = read_file(testfilepath)

    # Number of worker threads, and therefore of text chunks.
    num_threads = 4

    # One result slot per thread; each worker writes only its own index.
    results = [None] * num_threads
    threads = []

    # Create and start one thread per word-aligned chunk.
    for i, (start, end) in enumerate(chunk_bounds(text, num_threads)):
        t = threading.Thread(target=count_words, args=(start, end, text, i, results))
        threads.append(t)
        t.start()

    # Wait for every worker to finish before reading the results.
    for t in threads:
        t.join()

    # Merge the per-thread counters.
    total_count = Counter()
    for result in results:
        total_count += result

    # Print the 10 most frequent words.
    for w, c in total_count.most_common(10):
        print(w, '--', c)