dev
zj3D 8 months ago
parent 83c156a3d5
commit 44b0c00567

@@ -21,17 +21,15 @@ class sortTaskHandler:
def handle_task(task_type, *args):
    handler_class_name = f"{task_type}TaskHandler"  # build the handler class name
    # look the class up in the current global symbol table via globals()
    handler_class = globals().get(handler_class_name)
    if handler_class:
        handler = handler_class()         # instantiate the handler class
        return handler.handle(*args)      # call its handle method
    else:
print(f"No handler found for task type: {task_type}")
print(f"No found for task type: {task_type}")
if __name__ == '__main__':
    word_list = handle_task("words", util.testfilepath)
    word_freq = handle_task("frequencies", word_list)
    word_sort = handle_task("sort", word_freq)
    util.print_word_freqs(word_sort)
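For readers following the dispatch trick above: handle_task only works if a class named <task_type>TaskHandler with a handle method exists in the module's global scope. A minimal sketch of what such handlers could look like; wordsTaskHandler and frequenciesTaskHandler here are illustrative stand-ins, not the repository's actual classes:

from collections import Counter

class wordsTaskHandler:
    # hypothetical handler: split a file into lower-cased words
    def handle(self, file_path):
        with open(file_path, encoding='utf-8') as f:
            return f.read().lower().split()

class frequenciesTaskHandler:
    # hypothetical handler: count occurrences of each word
    def handle(self, word_list):
        return Counter(word_list)

# handle_task("words", path) builds the name "wordsTaskHandler",
# looks it up via globals(), instantiates it and calls handle(path).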

@@ -5,7 +5,7 @@ from cppy.cp_util import *
#
# Coroutines: a bit involved; reading a file is too fast for the I/O overlap to pay off, a crawler is the better fit
#
async def read_file(file_path):
    async with aiofiles.open(file_path, 'r', encoding='utf-8') as file:
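For context, a self-contained sketch of how a coroutine like read_file is typically driven end to end; it assumes aiofiles is installed and uses a placeholder file name rather than the repository's test file:

import asyncio
import aiofiles

async def read_file(file_path):
    # the event loop can schedule other coroutines while this read is pending
    async with aiofiles.open(file_path, 'r', encoding='utf-8') as file:
        return await file.read()

async def main():
    text = await read_file('some_text_file.txt')   # placeholder path
    print(f"read {len(text.split())} whitespace-separated tokens")

if __name__ == '__main__':
    asyncio.run(main())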

@@ -0,0 +1,30 @@
from cppy.cp_util import *
from collections import Counter
stop_words = get_stopwords()
def process_chunk(chunk):
    # filter out stop words and words shorter than three characters
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)
def process_chunks(chunks, word_freqs, x, max):
    next = x + 1
    if next < max:
        process_chunks(chunks, word_freqs, next, max)
    chunk_freqs = process_chunk(chunks[x])  # already a Counter
    word_freqs += chunk_freqs               # in-place update, visible to the caller
# def process_chunks( chunks,word_freqs,x,max ):
# word_list = process_chunk(chunks[x])
# word_freqs += Counter(word_list)
# next = x + 1
# if next < max:
# process_chunks(chunks,word_freqs,next,max)
# read the data and split it into chunks of 2000 words each
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
process_chunks( chunks,word_freqs,0,len(chunks) )
print_word_freqs( word_freqs.most_common(10) )
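Since the recursion above only stands in for a loop over the chunks, an equivalent iterative version, reusing process_chunk and chunks from this snippet, is just a fold over Counters (a sketch, not part of the file):

# iterative equivalent of process_chunks: fold each chunk's Counter into the total
total_freqs = Counter()
for chunk in chunks:
    total_freqs += process_chunk(chunk)
print_word_freqs(total_freqs.most_common(10))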

@@ -1,2 +0,0 @@
" my Some sure acquaintance or other, my dear, sure,other I suppose; I am sure I do not
know. sure "

@@ -1,29 +0,0 @@
import sys
from cppy.cp_util import *
## The chunk-splitting work could be unified into one generic helper. Make it a generator!! (see the sketch after this listing)
script_dir = os.path.dirname(os.path.abspath(__file__))
testfile = os.path.join(script_dir, 'test.txt')
stop_words = get_stopwords()
# if this crashes, adjust the 5000 below
RECURSION_LIMIT = 5000
sys.setrecursionlimit( RECURSION_LIMIT )
def count(i, chunks, stopwords, wordfreqs):
    if i < 0: return
    for word in chunks[i]:
        if word not in stopwords:
            wordfreqs[word] = wordfreqs.get(word, 0) + 1
    count(i - 1, chunks, stopwords, wordfreqs)
word_list = re_split(open(testfile, encoding='utf-8').read())
filesize = len(word_list)
chunk_size = (filesize // RECURSION_LIMIT) + 1           # words per chunk
num_chunks = (filesize + chunk_size - 1) // chunk_size   # at most about RECURSION_LIMIT chunks
chunks = [word_list[x * chunk_size:(x + 1) * chunk_size]
          for x in range(num_chunks)]
word_freqs = {}
count(num_chunks - 1, chunks, stop_words, word_freqs)
print_word_freqs(sort_dict(word_freqs))
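Following the note at the top of this (now removed) file, the chunk-splitting step could become one reusable generator; a minimal sketch under that assumption, with chunk_words as a made-up name rather than an existing cp_util helper:

def chunk_words(words, size):
    # yield successive slices of `size` words; a generator avoids
    # building the full list of chunks up front
    for start in range(0, len(words), size):
        yield words[start:start + size]

# usage sketch: chunks = list(chunk_words(word_list, chunk_size))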

@@ -4,20 +4,10 @@ from cppy.cp_util import testfilepath,db_filename,extract_file_words
# database table schema
TABLES = {
    'documents': '''CREATE TABLE IF NOT EXISTS documents (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        name TEXT NOT NULL
                    )''',
    'words': '''CREATE TABLE IF NOT EXISTS words (
                        doc_id INTEGER NOT NULL,
                        value TEXT NOT NULL,
                        FOREIGN KEY (doc_id) REFERENCES documents (id)
                        doc_name TEXT NOT NULL,
                        value TEXT NOT NULL
                    )''',
    'characters': '''CREATE TABLE IF NOT EXISTS characters (
                        word_id INTEGER NOT NULL,
                        value TEXT NOT NULL,
                        FOREIGN KEY (word_id) REFERENCES words (id)
                    )'''
}
@@ -33,15 +23,10 @@ def create_db_schema(connection):
def load_file_into_database(path_to_file, connection):
    words = extract_file_words(path_to_file)
    doc_name = os.path.basename(path_to_file).split('.')[0]
    c = connection.cursor()
    c.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
    doc_id = c.lastrowid
    for w in words:
        c.execute("INSERT INTO words (doc_id, value) VALUES (?, ?)", (doc_id, w))
        word_id = c.lastrowid
        for char in w:
            c.execute("INSERT INTO characters (word_id, value) VALUES (?, ?)", (word_id, char))
        c.execute("INSERT INTO words (doc_name, value) VALUES (?, ?)", (doc_name, w))
    connection.commit()
    c.close()
@@ -52,7 +37,6 @@ def load_file_into_database(path_to_file, connection):
# build the full path to the database file
current_dir = os.path.dirname(os.path.abspath(__file__))
db_file_path = os.path.join(current_dir, db_filename)
if os.path.exists(db_file_path):
    os.remove(db_file_path)
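With the schema reduced to a single words table, the top frequencies can be computed directly in SQL; a sketch that reuses db_file_path from above and assumes the table has already been populated:

import sqlite3

connection = sqlite3.connect(db_file_path)
cursor = connection.cursor()
# aggregate counts per word and keep the ten most frequent
cursor.execute('''SELECT value, COUNT(*) AS freq
                  FROM words
                  GROUP BY value
                  ORDER BY freq DESC
                  LIMIT 10''')
for word, freq in cursor.fetchall():
    print(word, '-', freq)
connection.close()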