diff --git a/12 语言特性/反射/反射_类.py b/12 语言特性/反射/反射_类.py
index 81c93f4..c189dac 100644
--- a/12 语言特性/反射/反射_类.py
+++ b/12 语言特性/反射/反射_类.py
@@ -19,19 +19,17 @@ class sortTaskHandler:
 # Application framework
 ##########################################
 
 def handle_task(task_type,*args):
-    handler_class_name = f"{task_type}TaskHandler"  # Build the handler class name
-
-    # Use globals() to look up the current global symbol table
+    handler_class_name = f"{task_type}TaskHandler"  # Build the handler class name
+
     handler_class = globals().get(handler_class_name)
     if handler_class:
         handler = handler_class()  # Instantiate the handler class
         return handler.handle(*args)  # Call its handle method
     else:
-        print(f"No handler found for task type: {task_type}")
+        print(f"No handler for task type: {task_type}")
 
-if __name__ == '__main__':
-    word_list = handle_task("words",util.testfilepath)
-    word_freq = handle_task("frequencies",word_list)
-    word_sort = handle_task("sort",word_freq)
-    util.print_word_freqs(word_sort)
\ No newline at end of file
+word_list = handle_task("words",util.testfilepath)
+word_freq = handle_task("frequencies",word_list)
+word_sort = handle_task("sort",word_freq)
+util.print_word_freqs(word_sort)
\ No newline at end of file
diff --git a/12 语言特性/异步.py b/12 语言特性/异步.py
index f23060f..0d04460 100644
--- a/12 语言特性/异步.py
+++ b/12 语言特性/异步.py
@@ -5,7 +5,7 @@ from cppy.cp_util import *
 
 
 #
-# Coroutines: a bit involved
+# Coroutines: a bit involved; file reads here finish too quickly to show the benefit, so this fits crawlers doing network I/O better
 #
 async def read_file(file_path):
     async with aiofiles.open(file_path, 'r', encoding='utf-8') as file:
diff --git a/12 语言特性/递归.py b/12 语言特性/递归.py
new file mode 100644
index 0000000..d38efa5
--- /dev/null
+++ b/12 语言特性/递归.py
@@ -0,0 +1,30 @@
+from cppy.cp_util import *
+from collections import Counter
+
+stop_words = get_stopwords()
+
+def process_chunk(chunk):
+    # Filter out stop words and words shorter than 3 characters
+    words = [ w for w in chunk if ( w not in stop_words ) and len(w) >= 3 ]
+    return Counter(words)
+
+def process_chunks( chunks,word_freqs,x,max ):
+    next = x + 1
+    if next < max:
+        process_chunks(chunks,word_freqs,next,max)
+    chunk_freqs = process_chunk(chunks[x])
+    word_freqs += chunk_freqs  # Counter's in-place += updates the caller's object
+
+# Alternative: tally the current chunk first, then recurse into the rest
+# def process_chunks( chunks,word_freqs,x,max ):
+#     chunk_freqs = process_chunk(chunks[x])
+#     word_freqs += chunk_freqs
+#     next = x + 1
+#     if next < max:
+#         process_chunks(chunks,word_freqs,next,max)
+
+# Read the data and split it into chunks of 2000 words each
+chunks = get_chunks(testfilepath,2000)
+word_freqs = Counter()
+process_chunks( chunks,word_freqs,0,len(chunks) )
+print_word_freqs( word_freqs.most_common(10) )
\ No newline at end of file
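Reviewer note on the new 递归.py: the active process_chunks recurses all the way to the last chunk and tallies on the way back, while the commented-out variant tallies the current chunk before recursing; the resulting counts are identical, but in both versions the recursion depth equals the number of chunks. A minimal self-contained sketch of the same pattern, with hypothetical sample data (count_chunks and the fruit words are illustrative, not part of cppy.cp_util):

import sys
from collections import Counter

def count_chunks(chunks, freqs, i=0):
    # One stack frame per chunk: the depth equals len(chunks),
    # so it must stay below sys.getrecursionlimit().
    if i >= len(chunks):
        return
    freqs.update(chunks[i])              # Tally the current chunk
    count_chunks(chunks, freqs, i + 1)   # Recurse into the remaining chunks

chunks = [["apple", "pear"], ["pear", "plum"]]   # Hypothetical sample data
assert len(chunks) < sys.getrecursionlimit()
freqs = Counter()
count_chunks(chunks, freqs)
print(freqs.most_common(2))   # [('pear', 2), ('apple', 1)]

With get_chunks(testfilepath, 2000) the depth is one frame per 2000-word chunk, which for a typical test file stays well below CPython's default limit of 1000.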
diff --git a/12 语言特性/递归/test.txt b/12 语言特性/递归/test.txt
deleted file mode 100644
index 088a3d7..0000000
--- a/12 语言特性/递归/test.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-" my Some sure acquaintance or other, my dear, sure,other I suppose; I am sure I do not
-know. sure "
diff --git a/12 语言特性/递归/tf-08.py b/12 语言特性/递归/tf-08.py
deleted file mode 100644
index 552b341..0000000
--- a/12 语言特性/递归/tf-08.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import sys
-from cppy.cp_util import *
-
-## The chunking work can be unified into one generic function. Make it a generator!!
-
-script_dir = os.path.dirname(os.path.abspath(__file__))
-testfile = os.path.join(script_dir, 'test.txt')
-stop_words = get_stopwords()
-
-# If this crashes, lower the 5000
-RECURSION_LIMIT = 5000
-sys.setrecursionlimit( RECURSION_LIMIT )
-
-def count( i,chunks, stopwords, wordfreqs):
-    if i < 0 : return
-    for word in chunks[i]:
-        if word not in stopwords:
-            wordfreqs[word] = wordfreqs.get(word, 0) + 1
-    count( i-1, chunks,stopwords, wordfreqs )
-
-word_list = re_split( open(testfile,encoding='utf-8').read() )
-filesize = len( word_list )
-chunk_size = ( filesize // RECURSION_LIMIT ) + 1
-chunks = [ word_list[ x*chunk_size:(x+1)*RECURSION_LIMIT ]
-           for x in range(chunk_size) ]
-word_freqs = {}
-count( chunk_size -1 ,chunks, stop_words, word_freqs )
-
-print_word_freqs(sort_dict(word_freqs))
\ No newline at end of file
diff --git a/13 计算设备/存储/数据库/tf-26.py b/13 计算设备/存储/数据库/数据库.py
similarity index 57%
rename from 13 计算设备/存储/数据库/tf-26.py
rename to 13 计算设备/存储/数据库/数据库.py
index 6ef18e4..34b2411 100644
--- a/13 计算设备/存储/数据库/tf-26.py
+++ b/13 计算设备/存储/数据库/数据库.py
@@ -4,20 +4,10 @@ from cppy.cp_util import testfilepath,db_filename,extract_file_words
 
 # Database table schema
 TABLES = {
-    'documents': '''CREATE TABLE IF NOT EXISTS documents (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        name TEXT NOT NULL
-    )''',
     'words': '''CREATE TABLE IF NOT EXISTS words (
-        doc_id INTEGER NOT NULL,
-        value TEXT NOT NULL,
-        FOREIGN KEY (doc_id) REFERENCES documents (id)
-    )''',
-    'characters': '''CREATE TABLE IF NOT EXISTS characters (
-        word_id INTEGER NOT NULL,
-        value TEXT NOT NULL,
-        FOREIGN KEY (word_id) REFERENCES words (id)
-    )'''
+        doc_name TEXT NOT NULL,
+        value TEXT NOT NULL
+    )''',
 }
 
 
@@ -33,15 +23,10 @@ def create_db_schema(connection):
 
 def load_file_into_database(path_to_file, connection):
     words = extract_file_words( path_to_file )
-    c = connection.cursor()
-    c.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
-    doc_id = c.lastrowid
-
+    doc_name = os.path.basename(path_to_file).split('.')[0]
+    c = connection.cursor()
     for w in words:
-        c.execute("INSERT INTO words (doc_id, value) VALUES (?, ?)", (doc_id, w))
-        word_id = c.lastrowid
-        for char in w:
-            c.execute("INSERT INTO characters (word_id, value) VALUES (?, ?)", (word_id, char))
+        c.execute("INSERT INTO words (doc_name, value) VALUES (?, ?)", (doc_name, w))
 
     connection.commit()
     c.close()
@@ -52,7 +37,6 @@
 # Build the full path to the database file
 current_dir = os.path.dirname(os.path.abspath(__file__))
 db_file_path = os.path.join(current_dir, db_filename)
-
 if os.path.exists(db_file_path):
     os.remove(db_file_path)
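Reviewer note on the schema change in 数据库.py: dropping the documents and characters tables leaves a single words table keyed by document name, so word frequencies can now be derived entirely in SQL rather than in Python. A minimal sketch against an in-memory SQLite database (the table mirrors the new schema; the sample rows are hypothetical):

import sqlite3

conn = sqlite3.connect(':memory:')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS words (
                 doc_name TEXT NOT NULL,
                 value TEXT NOT NULL
             )''')

# One row per word occurrence; frequencies are derived at query time
for w in ['apple', 'pear', 'pear']:
    c.execute("INSERT INTO words (doc_name, value) VALUES (?, ?)", ('test', w))
conn.commit()

c.execute('''SELECT value, COUNT(*) AS freq FROM words
             GROUP BY value ORDER BY freq DESC''')
print(c.fetchall())   # [('pear', 2), ('apple', 1)]
conn.close()

The GROUP BY replaces the Python-side counting that the old three-table layout required, which is the main payoff of keeping one row per occurrence.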