dev
zj3D 8 months ago
parent 83c156a3d5
commit 44b0c00567

@ -21,17 +21,15 @@ class sortTaskHandler:
def handle_task(task_type,*args): def handle_task(task_type,*args):
handler_class_name = f"{task_type}TaskHandler" # 构建处理器类名 handler_class_name = f"{task_type}TaskHandler" # 构建处理器类名
# 使用globals()获取当前全局符号表
handler_class = globals().get(handler_class_name) handler_class = globals().get(handler_class_name)
if handler_class: if handler_class:
handler = handler_class() # 实例化处理器类 handler = handler_class() # 实例化处理器类
return handler.handle(*args) # 调用处理方法 return handler.handle(*args) # 调用处理方法
else: else:
print(f"No handler found for task type: {task_type}") print(f"No found for task type: {task_type}")
if __name__ == '__main__': word_list = handle_task("words",util.testfilepath)
word_list = handle_task("words",util.testfilepath) word_freq = handle_task("frequencies",word_list)
word_freq = handle_task("frequencies",word_list) word_sort = handle_task("sort",word_freq)
word_sort = handle_task("sort",word_freq) util.print_word_freqs(word_sort)
util.print_word_freqs(word_sort)

@ -5,7 +5,7 @@ from cppy.cp_util import *
# #
# 协程: 有点复杂 # 协程: 有点复杂; 读文件的Io还是太快的爬虫
# #
async def read_file(file_path): async def read_file(file_path):
async with aiofiles.open(file_path, 'r', encoding='utf-8') as file: async with aiofiles.open(file_path, 'r', encoding='utf-8') as file:

@ -0,0 +1,30 @@
from cppy.cp_util import *
from collections import Counter
stop_words = get_stopwords()
def process_chunk(chunk):
    """Count the words of one chunk.

    Words found in the module-level ``stop_words`` and words shorter than
    three characters are left out of the result.

    Args:
        chunk: Iterable of words.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences.
    """
    tally = Counter()
    for token in chunk:
        # Skip stop words and anything shorter than three characters.
        if token in stop_words or len(token) < 3:
            continue
        tally[token] += 1
    return tally
def process_chunks(chunks, word_freqs, x, max):
    """Recursively fold the word counts of ``chunks[x:max]`` into ``word_freqs``.

    The recursion descends to the last chunk before any counting happens, so
    chunks are merged from index ``max - 1`` back down to ``x``.

    Args:
        chunks: Indexable sequence of word chunks.
        word_freqs: ``Counter`` that is updated in place.
        x: Index of the first chunk to process.
        max: Exclusive upper bound on chunk indices. The name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        None. The counts are accumulated in ``word_freqs``.
    """
    # An empty range has nothing to count; without this guard an empty chunk
    # list would raise IndexError on chunks[0].
    if x >= max:
        return
    following = x + 1
    if following < max:
        process_chunks(chunks, word_freqs, following, max)
    # In-place addition mutates the caller's Counter.
    word_freqs += Counter(process_chunk(chunks[x]))
# def process_chunks( chunks,word_freqs,x,max ):
# word_list = process_chunk(chunks[x])
# word_freqs += Counter(word_list)
# next = x + 1
# if next < max:
# process_chunks(chunks,word_freqs,next,max)
# Read the data and split it into chunks. The original note said 1000 words
# per chunk while the call passes 2000 — presumably 2000 is the chunk size;
# TODO confirm against get_chunks.
chunks = get_chunks(testfilepath,2000)
word_freqs = Counter()
# Accumulate the counts of every chunk into word_freqs, starting at index 0.
process_chunks( chunks,word_freqs,0,len(chunks) )
# Hand the ten most common (word, count) pairs to print_word_freqs.
print_word_freqs( word_freqs.most_common(10) )

@ -1,2 +0,0 @@
" my Some sure acquaintance or other, my dear, sure,other I suppose; I am sure I do not
know. sure "

@ -1,29 +0,0 @@
import sys
from cppy.cp_util import *
## The chunk-splitting work could be unified into one general-purpose function. Make it a generator!!
# Build the path of test.txt located in the same directory as this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
testfile = os.path.join(script_dir, 'test.txt')
stop_words = get_stopwords()
# If the script crashes, change the 5000 below.
RECURSION_LIMIT = 5000
# Set the interpreter's maximum recursion depth to RECURSION_LIMIT.
sys.setrecursionlimit( RECURSION_LIMIT )
def count(i, chunks, stopwords, wordfreqs):
    """Recursively tally the words of ``chunks[i]`` down to ``chunks[0]``.

    Args:
        i: Index of the chunk to process first; a negative value ends the
            recursion without touching anything.
        chunks: Indexable sequence of word lists.
        stopwords: Container of words that must not be counted.
        wordfreqs: Dict mapping word to count, updated in place.

    Returns:
        None. The counts are accumulated in ``wordfreqs``.
    """
    if i >= 0:
        kept = (token for token in chunks[i] if token not in stopwords)
        for token in kept:
            wordfreqs[token] = 1 + wordfreqs.get(token, 0)
        # Continue with the preceding chunk.
        count(i - 1, chunks, stopwords, wordfreqs)
# Read the whole test file and split it into words; the context manager
# closes the file handle, which the bare open(...).read() never did.
with open(testfile, encoding='utf-8') as source:
    word_list = re_split(source.read())
filesize = len(word_list)
# Words per chunk, sized so that the number of chunks (one recursive call
# of count() each) does not exceed RECURSION_LIMIT.
chunk_size = (filesize // RECURSION_LIMIT) + 1
# Consecutive, non-overlapping slices of chunk_size words. The previous slice
# mixed chunk_size and RECURSION_LIMIT as strides, which produced overlapping
# chunks and counted words more than once.
chunks = [word_list[start:start + chunk_size]
          for start in range(0, filesize, chunk_size)]
word_freqs = {}
# Start from the last chunk; count() walks back down to index 0.
count(len(chunks) - 1, chunks, stop_words, word_freqs)
print_word_freqs(sort_dict(word_freqs))

@ -4,20 +4,10 @@ from cppy.cp_util import testfilepath,db_filename,extract_file_words
# 数据库表结构 # 数据库表结构
TABLES = { TABLES = {
'documents': '''CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL
)''',
'words': '''CREATE TABLE IF NOT EXISTS words ( 'words': '''CREATE TABLE IF NOT EXISTS words (
doc_id INTEGER NOT NULL, doc_name INTEGER NOT NULL,
value TEXT NOT NULL, value TEXT NOT NULL
FOREIGN KEY (doc_id) REFERENCES documents (id)
)''', )''',
'characters': '''CREATE TABLE IF NOT EXISTS characters (
word_id INTEGER NOT NULL,
value TEXT NOT NULL,
FOREIGN KEY (word_id) REFERENCES words (id)
)'''
} }
@ -33,15 +23,10 @@ def create_db_schema(connection):
def load_file_into_database(path_to_file, connection): def load_file_into_database(path_to_file, connection):
words = extract_file_words( path_to_file ) words = extract_file_words( path_to_file )
doc_name = os.path.basename(testfilepath).split('.')[0]
c = connection.cursor() c = connection.cursor()
c.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
doc_id = c.lastrowid
for w in words: for w in words:
c.execute("INSERT INTO words (doc_id, value) VALUES (?, ?)", (doc_id, w)) c.execute("INSERT INTO words (doc_name, value) VALUES (?, ?)", (doc_name, w))
word_id = c.lastrowid
for char in w:
c.execute("INSERT INTO characters (word_id, value) VALUES (?, ?)", (word_id, char))
connection.commit() connection.commit()
c.close() c.close()
@ -52,7 +37,6 @@ def load_file_into_database(path_to_file, connection):
# 构造数据库文件的完整路径 # 构造数据库文件的完整路径
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
db_file_path = os.path.join(current_dir, db_filename) db_file_path = os.path.join(current_dir, db_filename)
if os.path.exists(db_file_path): if os.path.exists(db_file_path):
os.remove(db_file_path) os.remove(db_file_path)
Loading…
Cancel
Save