dev
zj3D 8 months ago
parent f2ff5c8d4e
commit a3bc46dae3

@@ -3,14 +3,14 @@ from collections import Counter
from cppy.cp_util import *
from functools import reduce
stop_words = get_stopwords()
# map - reduce
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
def merge_counts(count1,count2):
sum_counts = count1 + count2
return sum_counts
@@ -18,16 +18,11 @@ def merge_counts(count1,count2):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
# split the content into chunks, each chunk handled by one process
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the data and split it into chunks of 1000 words each
chunks = get_chunks(testfilepath,1000)
# use map with process_chunk to handle each partition
counts_list = list(map(process_chunk, chunks))
# use reduce with merge_counts to total the word frequencies of all partitions
total_counts = reduce(merge_counts, counts_list)
@@ -38,5 +33,3 @@ def main():
if __name__ == '__main__':
main()
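
Side note: the map/reduce step above works because Counter objects support +, so two partition counts merge into a single one. A minimal self-contained sketch of that idea, with a toy word list standing in for the cppy.cp_util helpers:

from collections import Counter
from functools import reduce

# toy chunks standing in for the output of get_chunks()
chunks = [["apple", "banana", "apple"], ["banana", "cherry"]]
# map: count each chunk independently
counts_list = list(map(Counter, chunks))
# reduce: Counter.__add__ merges two frequency tables into one
total = reduce(lambda a, b: a + b, counts_list)
print(total.most_common(2))  # [('apple', 2), ('banana', 2)]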

@@ -7,9 +7,10 @@ from multiprocessing.pool import ThreadPool
#
# multithreading
#
stop_words = get_stopwords()
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
@@ -29,13 +30,12 @@ def thread_function(chunk, counts_list):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the data and split it into chunks of 1000 words each
chunks = get_chunks(testfilepath,1000)
# thread pool
pool = ThreadPool(len(chunks)) # thread count chosen arbitrarily
# use a thread pool; each thread processes one chunk
pool = ThreadPool(len(content)//chunk_size+1)
counts_list = pool.map(process_chunk, chunks)
pool.close()
pool.join()
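
The new pool size is derived from the chunk count instead of being picked arbitrarily. A hedged sketch of a further variant, not part of this commit, that also caps the number of threads so a very large file cannot spawn hundreds of them:

import os
from multiprocessing.pool import ThreadPool

def run_with_capped_pool(worker, chunks, max_threads=None):
    # cap the pool at the smaller of the chunk count and a CPU-based limit
    limit = max_threads or min(len(chunks), (os.cpu_count() or 1) * 2)
    pool = ThreadPool(limit)
    try:
        return pool.map(worker, chunks)
    finally:
        pool.close()
        pool.join()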

@@ -7,13 +7,13 @@ from cppy.cp_util import *
#
# multiprocessing
#
stop_words = get_stopwords()
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
def merge_counts(counts_list):
# merge multiple Counter objects
total_counts = Counter()
@@ -24,12 +24,8 @@ def merge_counts(counts_list):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
# split the content into chunks, each chunk handled by one process
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the file and split its contents into chunks, each chunk handled by one process
chunks = get_chunks(testfilepath,1000)
# process each chunk in its own process
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
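
One caveat with the multiprocessing variant: on spawn-based platforms the Pool must be created under the if __name__ == '__main__' guard, and the worker function must be importable at module level. A standalone sketch of the same Pool-plus-merge pattern (illustrative only, not the elided remainder of this file):

import multiprocessing
from collections import Counter

def count_chunk(chunk):
    # hypothetical worker: count the words of one chunk
    return Counter(chunk)

if __name__ == '__main__':
    chunks = [["apple", "banana"], ["banana", "cherry"]]
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(count_chunk, chunks)
    total = Counter()
    for counts in counts_list:
        total.update(counts)
    print(total.most_common(3))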

@@ -7,24 +7,23 @@ The concurrent.futures module provides a unified interface for concurrent programming in Python.
It hides the low-level details of thread and process creation, synchronization and cleanup, and offers a higher-level API for handling concurrent tasks.
Current versions recommend combining it with the asyncio module for the various asynchronous programming tasks in Python.
'''
stop_words = util.get_stopwords()
class WordFrequencyAgent:
def __init__(self, words):
self.words = words
def compute_word_frequency(self):
self.word_freq = Counter(self.words)
words = [ w for w in self.words if w not in stop_words and len(w) >= 3 ]
self.word_freq = Counter(words)
def get_word_frequency(self):
return self.word_freq
# split the text into several parts and create an Agent for each part
def create_agents(words, num_agents = 4 ):
text_chunks = [ words[i::num_agents] for i in range(num_agents) ]
agents = [ WordFrequencyAgent(chunk) for chunk in text_chunks ]
return agents
def create_agents( words ):
return [ WordFrequencyAgent(chunk) for chunk in words ]
def compute_all_word_frequencies(agents):
with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -34,7 +33,6 @@ def compute_all_word_frequencies(agents):
agent = future_to_agent[future]
data = future.result() # the word frequencies are stored in the agent
# after all Agents have finished, merge their word-frequency results
def merge_word_frequencies(agents):
merged_freq = Counter()
@@ -42,10 +40,13 @@ def merge_word_frequencies(agents):
merged_freq.update(agent.get_word_frequency())
return merged_freq
if __name__ == '__main__':
words = util.extract_file_words(util.testfilepath) # extract the words from the text
@util.timing_decorator
def main():
words = util.get_chunks(util.testfilepath)
agents = create_agents(words) # create the agents
compute_all_word_frequencies(agents) # compute
merged_word_freq = merge_word_frequencies(agents) # merge the results
util.print_word_freqs(merged_word_freq.most_common(10)) # sort and print
if __name__ == '__main__':
main()
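
The executor call elided from the hunk above follows the standard submit / as_completed idiom of concurrent.futures. A self-contained sketch of that idiom, with a trivial stand-in for WordFrequencyAgent.compute_word_frequency:

import concurrent.futures

def square(n):
    # stand-in for agent.compute_word_frequency
    return n * n

with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_input = {executor.submit(square, n): n for n in range(5)}
    for future in concurrent.futures.as_completed(future_to_input):
        print(future_to_input[future], '->', future.result())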

@@ -45,7 +45,7 @@ def main(testfilepath, top_n = 10 ):
wordlist = re_split( read_file(testfilepath) )
for word in wordlist:
if word not in stopwords:
subject.notify(word)
subject.notify(word) # trigger notification
# print the top N word frequencies
top_words = observer.get_top_n(top_n)
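
subject.notify(word) only makes sense against a subject that forwards each word to its registered observers; the real classes live elsewhere in this file. A minimal hypothetical pair consistent with the notify and get_top_n calls shown here:

from collections import Counter

class WordFrequencyObserver:
    # hypothetical observer: tallies every word it is notified about
    def __init__(self):
        self.counts = Counter()
    def update(self, word):
        self.counts[word] += 1
    def get_top_n(self, n):
        return self.counts.most_common(n)

class Subject:
    # hypothetical subject: fans each notification out to its observers
    def __init__(self):
        self.observers = []
    def attach(self, observer):
        self.observers.append(observer)
    def notify(self, word):
        for observer in self.observers:
            observer.update(word)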

@@ -84,9 +84,11 @@ if __name__ == "__main__":
'''
In this example, IBook is an interface that defines the behavior a book should have, such as getting its title and author; NovelBook is a concrete book class that implements the IBook interface; BookCategory is a book-category class that can hold multiple book instances.
DisplayPlatform is an abstract display-platform class that defines how books are shown; WebDisplayPlatform and MobileDisplayPlatform are concrete display-platform classes that each implement the DisplayPlatform interface to provide a different way of displaying.
In this example:
IBook is an interface that defines the behavior a book should have, such as getting its title and author.
NovelBook is a concrete book class that implements the IBook interface.
BookCategory is a book-category class; it can hold multiple book instances.
DisplayPlatform is an abstract display-platform class that defines how books are shown.
WebDisplayPlatform and MobileDisplayPlatform are concrete display-platform classes that each implement the DisplayPlatform interface to provide a different way of displaying.
BookShop is a bridge class that connects a book category with a display platform; its show_books method displays every book in the category.
'''
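
The structure described in the docstring condenses to roughly the following skeleton, a sketch reconstructed from the description rather than the file's exact code:

from abc import ABC, abstractmethod

class IBook(ABC):                            # interface: what a book exposes
    @abstractmethod
    def get_title(self): ...
    @abstractmethod
    def get_author(self): ...

class NovelBook(IBook):                      # concrete book
    def __init__(self, title, author):
        self.title, self.author = title, author
    def get_title(self):
        return self.title
    def get_author(self):
        return self.author

class BookCategory:                          # holds multiple book instances
    def __init__(self, name):
        self.name, self.books = name, []
    def add_book(self, book):
        self.books.append(book)

class DisplayPlatform(ABC):                  # abstraction: how a book is shown
    @abstractmethod
    def display(self, book): ...

class WebDisplayPlatform(DisplayPlatform):
    def display(self, book):
        return f"<web>{book.get_title()} by {book.get_author()}</web>"

class MobileDisplayPlatform(DisplayPlatform):
    def display(self, book):
        return f"[mobile] {book.get_title()} - {book.get_author()}"

class BookShop:                              # bridge: connects a category to a platform
    def __init__(self, category, platform):
        self.category, self.platform = category, platform
    def show_books(self):
        for book in self.category.books:
            print(self.platform.display(book))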

@@ -39,6 +39,13 @@ def get_stopwords( path_to_file = stopwordfilepath ):
data.extend(list(string.ascii_lowercase))
return data
def get_chunks( file_path = testfilepath, chunk_size = 1000):
# read the file and split its contents into chunks, each chunk handled by one process
# chunk size can be tuned as needed
content = re_split(read_file(file_path))
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
return chunks
def extract_file_words(path_to_file):
word_list = re_split( read_file(path_to_file) )
stop_words = get_stopwords()
