dev
zj3D 8 months ago
parent f2ff5c8d4e
commit a3bc46dae3

@@ -3,14 +3,14 @@ from collections import Counter
from cppy.cp_util import *
from functools import reduce
stop_words = get_stopwords()
# map - reduce
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
def merge_counts(count1,count2):
sum_counts = count1 + count2
return sum_counts
@@ -18,16 +18,11 @@ def merge_counts(count1,count2):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
# split the content into chunks, each chunk handled by one process
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the data and split it into chunks of 1000 words each
chunks = get_chunks(testfilepath,1000)
# use map with process_chunk to handle each partition
counts_list = list(map(process_chunk, chunks))
# use reduce with merge_counts to total the word frequencies of all partitions
total_counts = reduce(merge_counts, counts_list)
@@ -38,5 +33,3 @@ def main():
if __name__ == '__main__':
main()
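
Side note: the map/reduce step above works because Counter objects support +, so two partition counts merge into a single one. A minimal self-contained sketch of that idea, with a toy word list standing in for the cppy.cp_util helpers:

from collections import Counter
from functools import reduce

# toy chunks standing in for the output of get_chunks()
chunks = [["apple", "banana", "apple"], ["banana", "cherry"]]
# map: count each chunk independently
counts_list = list(map(Counter, chunks))
# reduce: Counter.__add__ merges two frequency tables into one
total = reduce(lambda a, b: a + b, counts_list)
print(total.most_common(2))  # [('apple', 2), ('banana', 2)]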

@@ -7,9 +7,10 @@ from multiprocessing.pool import ThreadPool
#
# multithreading
#
stop_words = get_stopwords()
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
@@ -29,13 +30,12 @@ def thread_function(chunk, counts_list):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the data and split it into chunks of 1000 words each
chunks = get_chunks(testfilepath,1000)
# thread pool
pool = ThreadPool(len(chunks)) # thread count chosen arbitrarily
# use a thread pool; each thread processes one chunk
pool = ThreadPool(len(content)//chunk_size+1)
counts_list = pool.map(process_chunk, chunks)
pool.close()
pool.join()
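
The new pool size is derived from the chunk count instead of being picked arbitrarily. A hedged sketch of a further variant, not part of this commit, that also caps the number of threads so a very large file cannot spawn hundreds of them:

import os
from multiprocessing.pool import ThreadPool

def run_with_capped_pool(worker, chunks, max_threads=None):
    # cap the pool at the smaller of the chunk count and a CPU-based limit
    limit = max_threads or min(len(chunks), (os.cpu_count() or 1) * 2)
    pool = ThreadPool(limit)
    try:
        return pool.map(worker, chunks)
    finally:
        pool.close()
        pool.join()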

@@ -7,13 +7,13 @@ from cppy.cp_util import *
#
# multiprocessing
#
stop_words = get_stopwords()
def process_chunk(chunk):
# filter out the stop words
stop_words = get_stopwords()
words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
return Counter(words)
def merge_counts(counts_list):
# merge multiple Counter objects
total_counts = Counter()
@@ -24,12 +24,8 @@ def merge_counts(counts_list):
@timing_decorator
def main():
# read the file contents
content = re_split(read_file(testfilepath))
# split the content into chunks, each chunk handled by one process
chunk_size = 1000 # chunk size can be tuned as needed
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
# read the file and split its contents into chunks, each chunk handled by one process
chunks = get_chunks(testfilepath,1000)
# process each chunk in its own process
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
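
One caveat with the multiprocessing variant: on spawn-based platforms the Pool must be created under the if __name__ == '__main__' guard, and the worker function must be importable at module level. A standalone sketch of the same Pool-plus-merge pattern (illustrative only, not the elided remainder of this file):

import multiprocessing
from collections import Counter

def count_chunk(chunk):
    # hypothetical worker: count the words of one chunk
    return Counter(chunk)

if __name__ == '__main__':
    chunks = [["apple", "banana"], ["banana", "cherry"]]
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(count_chunk, chunks)
    total = Counter()
    for counts in counts_list:
        total.update(counts)
    print(total.most_common(3))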

@@ -7,24 +7,23 @@ The concurrent.futures module provides a unified interface for concurrent programming in Python.
It hides the low-level details of thread and process creation, synchronization and cleanup, and offers a higher-level API for handling concurrent tasks.
Current versions recommend combining it with the asyncio module for the various asynchronous programming tasks in Python.
'''
stop_words = util.get_stopwords()
class WordFrequencyAgent:
def __init__(self, words):
self.words = words
def compute_word_frequency(self):
self.word_freq = Counter(self.words)
words = [ w for w in self.words if w not in stop_words and len(w) >= 3 ]
self.word_freq = Counter(words)
def get_word_frequency(self):
return self.word_freq
# split the text into several parts and create an Agent for each part
def create_agents(words, num_agents = 4 ):
text_chunks = [ words[i::num_agents] for i in range(num_agents) ]
agents = [ WordFrequencyAgent(chunk) for chunk in text_chunks ]
return agents
def create_agents( words ):
return [ WordFrequencyAgent(chunk) for chunk in words ]
def compute_all_word_frequencies(agents):
with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -34,7 +33,6 @@ def compute_all_word_frequencies(agents):
agent = future_to_agent[future]
data = future.result() # the word frequencies are stored in the agent
# after all Agents have finished, merge their word-frequency results
def merge_word_frequencies(agents):
merged_freq = Counter()
@@ -42,10 +40,13 @@ def merge_word_frequencies(agents):
merged_freq.update(agent.get_word_frequency())
return merged_freq
if __name__ == '__main__':
words = util.extract_file_words(util.testfilepath) # extract the words from the text
@util.timing_decorator
def main():
words = util.get_chunks(util.testfilepath)
agents = create_agents(words) # create the agents
compute_all_word_frequencies(agents) # compute
merged_word_freq = merge_word_frequencies(agents) # merge the results
util.print_word_freqs(merged_word_freq.most_common(10)) # sort and print
if __name__ == '__main__':
main()
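
The executor call elided from the hunk above follows the standard submit / as_completed idiom of concurrent.futures. A self-contained sketch of that idiom, with a trivial stand-in for WordFrequencyAgent.compute_word_frequency:

import concurrent.futures

def square(n):
    # stand-in for agent.compute_word_frequency
    return n * n

with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_input = {executor.submit(square, n): n for n in range(5)}
    for future in concurrent.futures.as_completed(future_to_input):
        print(future_to_input[future], '->', future.result())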

@@ -45,7 +45,7 @@ def main(testfilepath, top_n = 10 ):
wordlist = re_split( read_file(testfilepath) )
for word in wordlist:
if word not in stopwords:
subject.notify(word)
subject.notify(word) # trigger notification
# print the top N word frequencies
top_words = observer.get_top_n(top_n)
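
subject.notify(word) only makes sense against a subject that forwards each word to its registered observers; the real classes live elsewhere in this file. A minimal hypothetical pair consistent with the notify and get_top_n calls shown here:

from collections import Counter

class WordFrequencyObserver:
    # hypothetical observer: tallies every word it is notified about
    def __init__(self):
        self.counts = Counter()
    def update(self, word):
        self.counts[word] += 1
    def get_top_n(self, n):
        return self.counts.most_common(n)

class Subject:
    # hypothetical subject: fans each notification out to its observers
    def __init__(self):
        self.observers = []
    def attach(self, observer):
        self.observers.append(observer)
    def notify(self, word):
        for observer in self.observers:
            observer.update(word)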

@@ -84,9 +84,11 @@ if __name__ == "__main__":
'''
In this example, IBook is an interface that defines the behavior a book should have, such as getting its title and author; NovelBook is a concrete book class that implements the IBook interface; BookCategory is a book-category class that can hold multiple book instances.
DisplayPlatform is an abstract display-platform class that defines how books are shown; WebDisplayPlatform and MobileDisplayPlatform are concrete display-platform classes that each implement the DisplayPlatform interface to provide a different way of displaying.
In this example:
IBook is an interface that defines the behavior a book should have, such as getting its title and author.
NovelBook is a concrete book class that implements the IBook interface.
BookCategory is a book-category class; it can hold multiple book instances.
DisplayPlatform is an abstract display-platform class that defines how books are shown.
WebDisplayPlatform and MobileDisplayPlatform are concrete display-platform classes that each implement the DisplayPlatform interface to provide a different way of displaying.
BookShop is a bridge class that connects a book category with a display platform; its show_books method displays every book in the category.
'''
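
The structure described in the docstring condenses to roughly the following skeleton, a sketch reconstructed from the description rather than the file's exact code:

from abc import ABC, abstractmethod

class IBook(ABC):                            # interface: what a book exposes
    @abstractmethod
    def get_title(self): ...
    @abstractmethod
    def get_author(self): ...

class NovelBook(IBook):                      # concrete book
    def __init__(self, title, author):
        self.title, self.author = title, author
    def get_title(self):
        return self.title
    def get_author(self):
        return self.author

class BookCategory:                          # holds multiple book instances
    def __init__(self, name):
        self.name, self.books = name, []
    def add_book(self, book):
        self.books.append(book)

class DisplayPlatform(ABC):                  # abstraction: how a book is shown
    @abstractmethod
    def display(self, book): ...

class WebDisplayPlatform(DisplayPlatform):
    def display(self, book):
        return f"<web>{book.get_title()} by {book.get_author()}</web>"

class MobileDisplayPlatform(DisplayPlatform):
    def display(self, book):
        return f"[mobile] {book.get_title()} - {book.get_author()}"

class BookShop:                              # bridge: connects a category to a platform
    def __init__(self, category, platform):
        self.category, self.platform = category, platform
    def show_books(self):
        for book in self.category.books:
            print(self.platform.display(book))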

@@ -39,6 +39,13 @@ def get_stopwords( path_to_file = stopwordfilepath ):
data.extend(list(string.ascii_lowercase))
return data
def get_chunks( file_path = testfilepath, chunk_size = 1000):
# read the file and split its contents into chunks, each chunk handled by one process
# chunk size can be tuned as needed
content = re_split(read_file(file_path))
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
return chunks
def extract_file_words(path_to_file):
word_list = re_split( read_file(path_to_file) )
stop_words = get_stopwords()
