From 44b0c005671d10ccc0795286a1a811e14db439dc Mon Sep 17 00:00:00 2001
From: zj3D <flysmart.ww@qq.com>
Date: Wed, 20 Mar 2024 17:30:10 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=A7=E4=BF=AE=209?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 12 语言特性/反射/反射_类.py          | 16 +++++-----
 12 语言特性/异步.py                     |  2 +-
 12 语言特性/递归.py                     | 30 +++++++++++++++++++
 12 语言特性/递归/test.txt               |  2 --
 12 语言特性/递归/tf-08.py               | 29 ------------------
 .../数据库/{tf-26.py => 数据库.py}      | 28 ++++-------------
 6 files changed, 44 insertions(+), 63 deletions(-)
 create mode 100644 12 语言特性/递归.py
 delete mode 100644 12 语言特性/递归/test.txt
 delete mode 100644 12 语言特性/递归/tf-08.py
 rename 13 计算设备/存储/数据库/{tf-26.py => 数据库.py} (57%)

diff --git a/12 语言特性/反射/反射_类.py b/12 语言特性/反射/反射_类.py
index 81c93f4..c189dac 100644
--- a/12 语言特性/反射/反射_类.py	
+++ b/12 语言特性/反射/反射_类.py	
@@ -19,19 +19,17 @@ class sortTaskHandler:
 #  应用框架
 ##########################################
 def handle_task(task_type,*args):    
-    handler_class_name = f"{task_type}TaskHandler" # 构建处理器类名
-    
-    # 使用globals()获取当前全局符号表
+    handler_class_name = f"{task_type}TaskHandler" # build the handler class name
+    # look the class up by name in this module's global symbol table (None if absent)
     handler_class = globals().get(handler_class_name)    
     if handler_class:        
         handler = handler_class() # 实例化处理器类        
         return handler.handle(*args) # 调用处理方法
     else:
-        print(f"No handler found for task type: {task_type}")       
+        print(f"No handler found for task type: {task_type}")  # unknown task type: report it; the function then returns None
 
 
-if __name__ == '__main__':   
-    word_list = handle_task("words",util.testfilepath)
-    word_freq = handle_task("frequencies",word_list)
-    word_sort = handle_task("sort",word_freq)        
-    util.print_word_freqs(word_sort)
\ No newline at end of file
+word_list = handle_task("words",util.testfilepath) # looked up as class "wordsTaskHandler"
+word_freq = handle_task("frequencies",word_list) # looked up as class "frequenciesTaskHandler"
+word_sort = handle_task("sort",word_freq) # looked up as class "sortTaskHandler"
+util.print_word_freqs(word_sort)
\ No newline at end of file
diff --git a/12 语言特性/异步.py b/12 语言特性/异步.py
index f23060f..0d04460 100644
--- a/12 语言特性/异步.py	
+++ b/12 语言特性/异步.py	
@@ -5,7 +5,7 @@ from cppy.cp_util import *
 
 
 #
-# 协程: 有点复杂
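+# Coroutines: somewhat complex. NOTE(review): local file I/O finishes too fast to show the benefit here; a web crawler would presumably be a better demo - confirm intent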
 #
 async def read_file(file_path):
     async with aiofiles.open(file_path, 'r', encoding='utf-8') as file:
diff --git a/12 语言特性/递归.py b/12 语言特性/递归.py
new file mode 100644
index 0000000..d38efa5
--- /dev/null
+++ b/12 语言特性/递归.py	
@@ -0,0 +1,30 @@
+from cppy.cp_util import *
+from collections import Counter
+
+stop_words = get_stopwords()
+
+def process_chunk(chunk):
+    """Count the words of chunk that are not stop words and have at least 3 characters."""
+    kept = (word for word in chunk if word not in stop_words and len(word) >= 3)
+    return Counter(kept)
+
+def process_chunks( chunks,word_freqs,x,max ):
+    """Recursively merge the word counts of chunks[x] .. chunks[max-1] into word_freqs, in place."""
+    if x >= max: return  # empty range (e.g. no chunks at all): nothing to count
+    if x + 1 < max:
+        process_chunks(chunks,word_freqs,x + 1,max)  # recurse first, so later chunks are merged before chunks[x]
+    word_freqs += process_chunk(chunks[x])  # Counter += updates the caller's Counter object
+
+# def process_chunks( chunks,word_freqs,x,max ):
+#     word_list = process_chunk(chunks[x])    
+#     word_freqs += Counter(word_list)
+#     next  = x + 1
+#     if next < max:
+#         process_chunks(chunks,word_freqs,next,max)   
+  
+  
+# Read the data and split it into chunks; the chunk size passed to get_chunks is 2000 (words)
+chunks = get_chunks(testfilepath,2000)
+word_freqs = Counter()
+process_chunks( chunks,word_freqs,0,len(chunks) )
+print_word_freqs( word_freqs.most_common(10) )
\ No newline at end of file
diff --git a/12 语言特性/递归/test.txt b/12 语言特性/递归/test.txt
deleted file mode 100644
index 088a3d7..0000000
--- a/12 语言特性/递归/test.txt	
+++ /dev/null
@@ -1,2 +0,0 @@
-" my Some sure acquaintance or other, my dear, sure,other  I suppose; I am sure I do not
-know. sure "
diff --git a/12 语言特性/递归/tf-08.py b/12 语言特性/递归/tf-08.py
deleted file mode 100644
index 552b341..0000000
--- a/12 语言特性/递归/tf-08.py	
+++ /dev/null
@@ -1,29 +0,0 @@
-import sys
-from cppy.cp_util import *
-
-## 切分任务这个工作，可以统一为一个通用函数。做成一个生成器！！
-
-script_dir = os.path.dirname(os.path.abspath(__file__))  
-testfile = os.path.join(script_dir, 'test.txt')
-stop_words = get_stopwords()
-
-# 如果崩溃，把 5000 改下
-RECURSION_LIMIT = 5000
-sys.setrecursionlimit( RECURSION_LIMIT )
-
-def count( i,chunks, stopwords, wordfreqs):        
-    if i < 0 : return       
-    for word in  chunks[i]:                     
-        if word not in stopwords:  
-            wordfreqs[word] = wordfreqs.get(word, 0) + 1    
-    count( i-1, chunks,stopwords, wordfreqs )
-
-word_list = re_split( open(testfile,encoding='utf-8').read() )
-filesize = len( word_list )    
-chunk_size = ( filesize // RECURSION_LIMIT ) + 1
-chunks = [  word_list[ x*chunk_size:(x+1)*RECURSION_LIMIT ] 
-                for x in range(chunk_size)  ]
-word_freqs = {}
-count( chunk_size -1 ,chunks, stop_words, word_freqs )
-
-print_word_freqs(sort_dict(word_freqs))
\ No newline at end of file
diff --git a/13 计算设备/存储/数据库/tf-26.py b/13 计算设备/存储/数据库/数据库.py
similarity index 57%
rename from 13 计算设备/存储/数据库/tf-26.py
rename to 13 计算设备/存储/数据库/数据库.py
index 6ef18e4..34b2411 100644
--- a/13 计算设备/存储/数据库/tf-26.py	
+++ b/13 计算设备/存储/数据库/数据库.py	
@@ -4,20 +4,10 @@ from cppy.cp_util import testfilepath,db_filename,extract_file_words
 
 # 数据库表结构
 TABLES = {
-    'documents': '''CREATE TABLE IF NOT EXISTS documents (
-                        id INTEGER PRIMARY KEY AUTOINCREMENT,
-                        name TEXT NOT NULL
-                    )''',
     'words': '''CREATE TABLE IF NOT EXISTS words (
-                    doc_id INTEGER NOT NULL,
-                    value TEXT NOT NULL,
-                    FOREIGN KEY (doc_id) REFERENCES documents (id)
-                )''',
-    'characters': '''CREATE TABLE IF NOT EXISTS characters (
-                        word_id INTEGER NOT NULL,
-                        value TEXT NOT NULL,
-                        FOREIGN KEY (word_id) REFERENCES words (id)
-                )'''
+                    doc_name TEXT NOT NULL,
+                    value TEXT NOT NULL
+                )''',
 }
 
 
@@ -33,15 +23,10 @@ def create_db_schema(connection):
 def load_file_into_database(path_to_file, connection):        
     words = extract_file_words( path_to_file )
 
-    c = connection.cursor()
-    c.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
-    doc_id = c.lastrowid
-
+    doc_name = os.path.basename(path_to_file).split('.')[0]  # document name = base name of path_to_file up to its first dot
+    c = connection.cursor()
     for w in words:
-        c.execute("INSERT INTO words (doc_id, value) VALUES (?, ?)", (doc_id, w))
-        word_id = c.lastrowid
-        for char in w:
-            c.execute("INSERT INTO characters (word_id, value) VALUES (?, ?)", (word_id, char))
+        c.execute("INSERT INTO words (doc_name, value) VALUES (?, ?)", (doc_name, w))        
     connection.commit()
     c.close()
 
@@ -52,7 +37,6 @@ def load_file_into_database(path_to_file, connection):
 # 构造数据库文件的完整路径  
 current_dir = os.path.dirname(os.path.abspath(__file__))   
 db_file_path = os.path.join(current_dir, db_filename)    
-
 if os.path.exists(db_file_path):          
     os.remove(db_file_path)