0524_1

2 months ago · f2729b414f
parent 2398743ab9
commit f2729b414f
35 changed files with 129 additions and 312 deletions
--- a/观察者/readme.MD
+++ b/观察者/readme.MD
@ -1,9 +0,0 @@
-
-注册
- 解耦合：通过回调函数，可以将不同部分的代码逻辑分离，降低模块之间的耦合度。
- 主动通信：注册回调模式实现了下层模块与上层模块之间的主动通信。当下层模块发生特定事件或满足特定条件时，可以主动调用上层模块注册的回调函数，而不需要上层模块不停地轮询下层模块的状态。
-
- 异步处理：回调函数常用于异步操作的响应处理，可以在主线程之外执行耗时操作，提升程序的效率和响应速度。
- 简化设计：在某些情况下，使用回调函数可以避免复杂的控制流设计，使代码更加简洁明了。
-
- 适应变化：随着项目的发展，需求可能会发生变化。注册回调模式使得在不影响现有代码的基础上，容易添加新功能或修改现有逻辑。
--- a/解耦模式/注册回调
+++ b/解耦模式/注册回调
@ -1,6 +1,6 @@
-################ 待整理 
 '''
 注册者 = 观察者
+你也可以把它看作订阅模式

 每个组件提供注册消息接口和注册消息动作

--- a/解耦模式/注册回调
+++ b/解耦模式/注册回调
@ -1,5 +1,3 @@
-################ 待整理 
-
 from cppy.cp_util import *
 '''
 订阅者 = 注册者 = 观察者
--- a/解耦模式/观察者
+++ b/解耦模式/观察者
@ -2,10 +2,7 @@
 入门级示例，是用来帮助理解其他例子

 把观察者挂到自己的处理队列上
-适当时机调用所有队列上的约定的观察者的 update 方法
-如果观察者有多个职能参与不同的任务链，不一定要统一命名update方法
-
-这是一个示例性质的原型，具体环境下需要调整
+WordSubject 调用所有观察者的 update 方法
 '''

 import collections  
--- a/解耦模式/观察者
+++ b/解耦模式/观察者
@ -1,8 +1,7 @@
-
 '''
-本例的基本模式还是观察者
 基类 Subject 提供注册和提醒注册上的对象提醒机制
-
+每个派生组件做完自己的事情，调用基类的 notify 方法
+这样就可以实现一个简单的观察者模式，也可以说是订阅模式
 因为函数和参数混杂在一起传递，使得各个模块的处理结构其实是 case by case
 '''

--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/插件/plugins-src/buildingPyc.py
+++ b/插件/plugins-src/buildingPyc.py
--- a/插件/plugins-src/f1.py
+++ b/插件/plugins-src/f1.py
--- a/插件/plugins-src/f2.py
+++ b/插件/plugins-src/f2.py
--- a/插件/plugins/f1.pyc
+++ b/插件/plugins/f1.pyc
--- a/插件/plugins/f2.pyc
+++ b/插件/plugins/f2.pyc
--- a/动机与模式/13
+++ b/动机与模式/13
@ -1,102 +0,0 @@
-'''
-后续组件挂载到前序组件后续链上
-仅提供 self.next_observer 的抽象关系
-后续组件接到指令和数据，自己决定动作
-
-理论上每个组件可以参与到多个生产队列
-
-本例使用了类来封装消息，相对于字符串理论上提供了更丰富的扩展可能
-这是一个示例性质的原型，具体环境下需要调整
-'''
-
-from collections import Counter  
-from typing import List,  Dict  
-from cppy.cp_util import *
-
-# 定义消息类型  
-class Message:  
-    def __init__(self, data):  
-        self.data = data  
-  
-class TokenizedText(Message):  
-    pass  
-  
-class FilteredText(Message):  
-    pass  
-  
-class WordFrequency(Message):  
-    pass  
-  
-# 定义观察者接口  
-class Observer:  
-    def notify(self, message: Message):  
-        pass  
-  
-# 切词订阅者  
-class TokenizerSubscriber(Observer):  
-    def __init__(self, next_observer: Observer):  
-        self.next_observer = next_observer  
-      
-    def notify(self, message: Message):  
-        if not isinstance(message.data, str):  
-            return  
-        tokenized_text = re_split(message.data)  
-        self.next_observer.notify(TokenizedText(tokenized_text))  
-  
-# 停用词订阅者  
-class StopWordsRemoverSubscriber(Observer):  
-    def __init__(self, next_observer: Observer, stop_words: List[str]):  
-        self.next_observer = next_observer  
-        self.stop_words = set(stop_words)  
-      
-    def notify(self, message: Message):  
-        if not isinstance(message, TokenizedText):  
-            return  
-        filtered_text = [word for word in message.data if word not in self.stop_words and len(word)>2 ]  
-        self.next_observer.notify(FilteredText(filtered_text))  
-  
-# 词频统计订阅者  
-class WordFrequencyCalculatorSubscriber(Observer):  
-    def __init__(self, next_observer: Observer):  
-        self.next_observer = next_observer          
-
-    def notify(self, message: Message):          
-        if not isinstance(message, FilteredText):  
-            return  
-        word_freq = Counter(message.data)          
-        self.next_observer.notify( WordFrequency(word_freq) )
-        
-  
-# 输出前N个词订阅者  
-class TopNWordsDisplaySubscriber(Observer):  
-    def __init__(self, n: int):  
-        self.n = n  
-      
-    def notify(self, message: Message):  
-        if not isinstance(message, WordFrequency):  
-            return          
-        print_word_freqs( message.data.most_common(self.n) )
-        
-  
-# 模拟发布者  
-def publish_text(text: str, observers: List[Observer]):  
-    for observer in observers:  
-        observer.notify(Message(text))  
-  
-# 主函数  
-def main():  
-    text = read_file()
-    
-    stop_words = get_stopwords()
-  
-    # 创建订阅者链  
-    display_subscriber = TopNWordsDisplaySubscriber( n=10 )  
-    freq_subscriber = WordFrequencyCalculatorSubscriber(display_subscriber)  
-    stop_words_subscriber = StopWordsRemoverSubscriber(freq_subscriber, stop_words)  
-    tokenizer_subscriber = TokenizerSubscriber(stop_words_subscriber)  
-  
-    # 发布文本  
-    publish_text(text, [tokenizer_subscriber])  
-  
-if __name__ == "__main__":  
-    main()
--- a/进程独立/消息驱动/1
+++ b/进程独立/消息驱动/1
--- a/进程独立/消息驱动/2
+++ b/进程独立/消息驱动/2
--- a/进程独立/消息驱动/readme.md
+++ b/进程独立/消息驱动/readme.md
@ -0,0 +1,2 @@
+
+改造下适合跨进程系统的后台响应对象设计
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/多计算单元/并发/2
+++ b/多计算单元/并发/2
@ -1,48 +0,0 @@
-# -*- coding: utf-8 -*-
-from collections import Counter
-from cppy.cp_util import *
-from multiprocessing.pool import ThreadPool
-
-
-#
-# 多线程
-#
-stop_words = get_stopwords()
-
-def process_chunk(chunk):
-    # 过滤停用词    
-    words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]
-    return Counter(words)
-
-
-def merge_counts(counts_list):
-    """合并多个Counter对象的总和"""
-    return sum(counts_list, Counter())
-
-
-def thread_function(chunk, counts_list):
-    word_count = process_chunk(chunk)
-    counts_list.append(word_count)
-
-
-@timing_decorator
-def main():    
-    # 读数据，按1000个词一组分片
-    chunks = get_chunks(testfilepath,1000)
-
-    # 线程池    
-    pool = ThreadPool(len(chunks))  # 随意指定的线程数       
-
-    counts_list = pool.map(process_chunk, chunks)
-    pool.close()
-    pool.join()
-
-    # 合并计数
-    total_counts = merge_counts(counts_list)
-
-    # 输出最高频的n个词
-    print_word_freqs(total_counts.most_common(10))
-
-
-if __name__ == '__main__':
-    main()
--- a/多计算单元/并发/3
+++ b/多计算单元/并发/3
@ -1,42 +0,0 @@
-# -*- coding: utf-8 -*-
-import multiprocessing
-from collections import Counter
-from cppy.cp_util import *
-
-
-#
-# 多进程: 因为创建进程相比计算过程开销太大，结果最慢
-#
-stop_words = get_stopwords()
-
-def process_chunk(chunk):
-    # 过滤停用词    
-    words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]
-    return Counter(words)
-
-def merge_counts(counts_list):
-    """合并多个Counter对象的总和"""
-    return sum(counts_list, Counter())
-
-
-@timing_decorator
-def main():
-    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理    
-    chunks = get_chunks(testfilepath,1000)
-
-    # 使用多进程处理每个块
-    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
-    counts_list = pool.map(process_chunk, chunks)
-    pool.close()
-    pool.join()
-
-    # 合并计数
-    total_counts = merge_counts(counts_list)
-
-    # 输出最高频的n个词
-    print_word_freqs(total_counts.most_common(10))
-
-
-if __name__ == '__main__':
-    main()   
-
--- a/多计算单元/数据共享/1
+++ b/多计算单元/数据共享/1
@ -1,52 +0,0 @@
-import  threading, queue
-from cppy.cp_util import *
-from collections import Counter
-
-stop_words = get_stopwords()
-
-# 待处理数据放一个队列，多个线程轮流计数，最后合并统一计数
-
-class WordFrequencyCounter:
-    def __init__(self, input_file):
-        self.word_space = queue.Queue()
-        self.freq_space = queue.Queue()                
-        for chunk in get_chunks(input_file,3000):                        
-            self.word_space.put(chunk)        
-
-    def process_words(self):
-        while not self.word_space.empty():
-            try:
-                chunk = self.word_space.get_nowait()  # 不使用超时，持续获取数据
-            except queue.Empty:
-                break  # 队列为空，退出循环
-            # print(f"Worker thread ID: {threading.get_ident()}",len(chunk))  
-            words = [ w for w in chunk if w not in stop_words and len(w) >= 3 ]
-            word_freqs = Counter(words)
-            self.freq_space.put(dict(word_freqs))  # 将Counter对象转换为字典
-
-    def run(self):
-        workers = [ threading.Thread(target=self.process_words) for _ in range(5)]
-        for worker in workers:  worker.start()
-        for worker in workers:  worker.join()
-
-        word_freqs = Counter()  # 初始化一个空的Counter对象
-        while not self.freq_space.empty():
-            freqs = self.freq_space.get()            
-            if freqs:  # 确保freqs非空
-                word_freqs.update(freqs)  
-
-        print_word_freqs ( sort_dict (word_freqs) )
-        
-
-@timing_decorator
-def main(): 
-    counter = WordFrequencyCounter( testfilepath )
-    counter.run()
-
-if __name__ == '__main__':
-    main()
-
-'''
-在多线程之间传递数据，建议使用线程安全的队列，如queue.Queue或multiprocessing.Queue（后者也适用于多进程环境）。
-这些队列提供了线程安全的数据传输机制，可以避免竞态条件和数据损坏。
-'''    
--- a/多计算单元/数据共享/2
+++ b/多计算单元/数据共享/2
@ -1,49 +0,0 @@
-'''
-使用 multiprocessing.Manager:
-Manager 提供了一个可以在不同进程之间共享和修改的数据类型，如 list, dict, Namespace 等。
-它实际上是在背后启动了一个单独的服务器进程，其他进程通过代理来访问这些共享对象。
-
-怎么得到最快的一个结果，是一个试错过程：X程创建数目多少、分片的大小 ...
-
-使用 multiprocessing.Manager 来完成统计词频
-需要注意：
- - Manager() 必须用函数包起来,不能按脚本随便放外面，否则会提示freeze_support
- - 工作函数需要放到外面，不能做内部函数。否则会提示参数错误
- - 无法在 Jupyter 类似环境运行 
-'''
-
-from cppy.cp_util import *
-from collections import Counter
-from multiprocessing import Manager, Process
-
-stop_words = get_stopwords()
-
-def process_chunk(chunk,word_count):    
-    words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]    
-    for word in words: # 非常化时间
-        word_count[word] = word_count.get(word, 0) + 1
-    # word_count.update( Counter(words) ) # 类型不起作用
-
-@timing_decorator
-def main():   
-    manager = Manager()
-    word_count = manager.dict()
-
-    chunks = get_chunks(testfilepath,2800)    
-    print('-------------------',len(chunks))
-    processes = []
-    for chunk in chunks:
-        p = Process(target=process_chunk, 
-                    args=(chunk,word_count) )
-        processes.append(p)
-        p.start()
-
-    for p in processes:  p.join()
-
-    word_count = dict(word_count)  
-    word_freqs = Counter(word_count).most_common(10)      
-    print_word_freqs(word_freqs)    
-
-
-if __name__ == '__main__':
-    main()
--- a/其它/数据库做中介/ORM/DataQuery.py
+++ b/其它/数据库做中介/ORM/DataQuery.py
--- a/其它/数据库做中介/ORM/createDb.py
+++ b/其它/数据库做中介/ORM/createDb.py
--- a/其它/数据库做中介/ORM/processData.py
+++ b/其它/数据库做中介/ORM/processData.py
--- a/其它/数据库做中介/tf.db
+++ b/其它/数据库做中介/tf.db
--- a/其它/数据库做中介/数据库.py
+++ b/其它/数据库做中介/数据库.py
--- a/高性能编程/11
+++ b/高性能编程/11
@ -0,0 +1,18 @@
+
+队列应用,涉及简单算法到多线程、多进程，以及分布式。
+
+应用场景匹配的相应队列：
+- 简单算法或小型任务：list（列表）作为队列,append() 入队，pop(0) 出队,简单易用。
+- 单线程高性能：用 collections.deque，效率最高。
+- 异步编程: 用 asyncio.Queue 
+- 多线程安全：用 queue.Queue 
+- 多线程优先级任务安全：用 queue.PriorityQueue ，确保高优先级的元素优先被取出。
+- 多进程通信：用 multiprocessing.Queue，如果进程内部存在线程竞争，就需要加锁。
+
+注意事项：
+- 非线程安全的实现（如 deque,list）在多线程场景中需加锁 。
+- multiprocessing.Queue 利用了操作系统提供的进程间通信（IPC, Inter-Process Communication）机制，具体实现取决于不同操作系统的支持。
+
+其它可能用到的相关工具：
+Redis，作为中间件提供共享存储和跨进程协调机制，处理缓存、消息队列、分布式锁、分布式状态共享。
+Celery，用于管理异步任务或后台任务，一般用来做周期性任务或者长时间运行的任务。可用 Redis 做存储。
--- a/高性能编程/词频统计/1
+++ b/高性能编程/词频统计/1
--- a/高性能编程/词频统计/2
+++ b/高性能编程/词频统计/2
@ -0,0 +1,60 @@
+import os
+import threading
+from queue import Queue
+from collections import Counter
+import re
+
+# 共享队列和词频统计器
+file_queue = Queue()
+word_counter = Counter()
+lock = threading.Lock()  # 确保线程安全更新 Counter
+
+# 读取文件并分词的函数
+def process_file():
+    while True:
+        try:
+            # 从队列获取文件名，非阻塞
+            file_path = file_queue.get_nowait()
+        except:
+            break  # 队列为空，退出
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read().lower()
+                # 简单分词，移除标点
+                words = re.findall(r'\b\w+\b', text)
+                # 线程安全更新词频
+                with lock:
+                    word_counter.update(words)
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+        finally:
+            file_queue.task_done()
+
+def main():
+    # 获取 data 目录下所有 .txt 文件
+    data_dir = 'data'
+    files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.txt')]
+    
+    # 将文件路径放入队列
+    for file_path in files:
+        file_queue.put(file_path)
+    
+    # 创建并启动多个线程
+    num_threads = 4  # 可根据需要调整线程数
+    threads = []
+    for _ in range(num_threads):
+        t = threading.Thread(target=process_file)
+        t.start()
+        threads.append(t)
+    
+    # 等待所有线程完成
+    for t in threads:
+        t.join()
+    
+    # 输出前 10 高频词
+    print("Top 10 高频词:")
+    for word, count in word_counter.most_common(10):
+        print(f"{word}: {count}")
+
+if __name__ == '__main__':
+    main()
--- a/高性能编程/词频统计/3
+++ b/高性能编程/词频统计/3
@ -0,0 +1,45 @@
+import os
+import re
+from collections import Counter
+from multiprocessing import Pool, Manager
+
+def process_file(file_path, shared_counter):
+    """处理单个文件，统计词频"""
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read().lower()
+            # 简单分词，移除标点
+            words = re.findall(r'\b\w+\b', text)
+            # 更新共享 Counter
+            shared_counter.update(words)
+    except Exception as e:
+        print(f"Error processing {file_path}: {e}")
+
+def main():
+    # 获取 data 目录下所有 .txt 文件
+    data_dir = 'data'
+    files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.txt')]
+    
+    # 使用 Manager 创建共享 Counter
+    with Manager() as manager:
+        shared_counter = manager.dict(Counter())
+        
+        # 创建进程池
+        with Pool(processes=4) as pool:  # 可调整进程数
+            # 分发任务给进程池
+            for file_path in files:
+                pool.apply_async(process_file, args=(file_path, shared_counter))
+            # 关闭池并等待所有进程完成
+            pool.close()
+            pool.join()
+        
+        # 转换为普通 Counter 以获取结果
+        final_counter = Counter(dict(shared_counter))
+    
+    # 输出前 10 高频词
+    print("Top 10 高频词:")
+    for word, count in final_counter.most_common(10):
+        print(f"{word}: {count}")
+
+if __name__ == '__main__':
+    main()
				`@ -0,0 +1,2 @@`

				`改造下适合跨进程系统的后台响应对象设计`