|
|
|
import sqlite3, os.path
|
|
|
|
from cppy.cp_util import testfilepath,db_filename,extract_file_words
|
|
|
|
|
|
|
|
|
|
|
|
# Database table schema.
#
# Maps each table name to the DDL statement that creates it.  The entries are
# listed parent-first (documents -> words -> characters) so that creating them
# in dict order never references a table that does not exist yet.
TABLES = {
    'documents': '''CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL
    )''',
    # ``words`` needs its own ``id`` column: ``characters.word_id`` declares a
    # foreign key on ``words (id)``, and without the column every insert into
    # ``characters`` fails with "foreign key mismatch" as soon as foreign-key
    # enforcement is switched on.  An INTEGER PRIMARY KEY is an alias for the
    # rowid, so ``cursor.lastrowid`` after an insert is exactly this id.
    'words': '''CREATE TABLE IF NOT EXISTS words (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        doc_id INTEGER NOT NULL,
        value TEXT NOT NULL,
        FOREIGN KEY (doc_id) REFERENCES documents (id)
    )''',
    'characters': '''CREATE TABLE IF NOT EXISTS characters (
        word_id INTEGER NOT NULL,
        value TEXT NOT NULL,
        FOREIGN KEY (word_id) REFERENCES words (id)
    )''',
}
|
|
|
|
|
|
|
|
|
|
|
|
# Create the database tables
def create_db_schema(connection):
    """Create every table declared in ``TABLES`` on *connection*.

    Args:
        connection: An open DB-API connection (``sqlite3`` in this script)
            on which the DDL statements are executed and committed.
    """
    cursor = connection.cursor()
    try:
        # Only the SQL text is needed; the dict keys are just labels.
        for ddl in TABLES.values():
            cursor.execute(ddl)
        connection.commit()
    finally:
        # Release the cursor even when one of the statements fails.
        cursor.close()
|
|
|
|
|
|
|
|
|
|
|
|
def load_file_into_database(path_to_file, connection):
    """Store a file's words, and each word's characters, in the database.

    Inserts one ``documents`` row for the file, one ``words`` row per word
    returned by ``extract_file_words``, and one ``characters`` row per
    character of each word.  The work is committed once, after all rows
    have been inserted.

    Args:
        path_to_file: Path handed to ``extract_file_words``; also recorded
            as the document's ``name``.
        connection: Open ``sqlite3`` connection whose database already
            contains the ``documents``, ``words`` and ``characters`` tables.
    """
    words = extract_file_words(path_to_file)

    cursor = connection.cursor()
    try:
        cursor.execute(
            "INSERT INTO documents (name) VALUES (?)", (path_to_file,)
        )
        doc_id = cursor.lastrowid

        for word in words:
            cursor.execute(
                "INSERT INTO words (doc_id, value) VALUES (?, ?)",
                (doc_id, word),
            )
            # Must be read before the next statement runs on this cursor.
            word_id = cursor.lastrowid
            # One batched call per word instead of one execute per character.
            cursor.executemany(
                "INSERT INTO characters (word_id, value) VALUES (?, ?)",
                [(word_id, char) for char in word],
            )
        connection.commit()
    finally:
        # Release the cursor even when an insert fails.
        cursor.close()
|
|
|
|
|
|
|
|
#######################################################
# Build the database and load the data into it
#######################################################

# Directory that contains this file
current_dir = os.path.dirname(os.path.abspath(__file__))
# Full path of the database file
db_file_path = os.path.join(current_dir, db_filename)

# Always start from a fresh database: drop the file left by a previous run.
if os.path.exists(db_file_path):
    os.remove(db_file_path)

if not os.path.isfile(db_file_path):
    # ``with connection`` only commits (or rolls back on error); it does not
    # close the connection, so close it explicitly once loading is done.
    connection = sqlite3.connect(db_file_path)
    try:
        with connection:
            create_db_schema(connection)
            load_file_into_database(testfilepath, connection)
    finally:
        connection.close()
|
|
|
|
|
|
|
|
|
|
|
|
# Query the database and print the ten most frequent words with their counts
connection = sqlite3.connect(db_file_path)
try:
    # ``with connection`` handles commit/rollback only; the ``finally`` below
    # is what actually closes the connection.
    with connection:
        c = connection.cursor()
        c.execute("SELECT value, COUNT(*) as C FROM words GROUP BY value ORDER BY C DESC LIMIT 10")
        for word, count in c:
            print(word, '-', count)
finally:
    connection.close()
|