You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

62 lines
1.9 KiB

9 months ago
import sqlite3, os.path
from cppy.cp_util import *
# Database table structure: maps each table name to its CREATE TABLE statement.
# Parents are listed before children. Every table that is the target of a
# FOREIGN KEY clause declares the `id` column that the clause references;
# without it SQLite reports "foreign key mismatch" on inserts into the child
# table as soon as foreign-key enforcement is switched on.
TABLES = {
    'documents': '''CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL
    )''',
    # `id` is an INTEGER PRIMARY KEY, i.e. an alias for the rowid, so the
    # value reported by cursor.lastrowid after an insert is this column.
    'words': '''CREATE TABLE IF NOT EXISTS words (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        doc_id INTEGER NOT NULL,
        value TEXT NOT NULL,
        FOREIGN KEY (doc_id) REFERENCES documents (id)
    )''',
    'characters': '''CREATE TABLE IF NOT EXISTS characters (
        word_id INTEGER NOT NULL,
        value TEXT NOT NULL,
        FOREIGN KEY (word_id) REFERENCES words (id)
    )''',
}
# Create the database tables
def create_db_schema(connection):
    """Execute every CREATE TABLE statement in ``TABLES`` on *connection*.

    Args:
        connection: Open ``sqlite3`` connection to create the tables on.

    The statements use ``IF NOT EXISTS``, so calling this on a database that
    already has the tables leaves them untouched. The cursor is closed even
    when a statement fails.
    """
    cursor = connection.cursor()
    try:
        # Only the SQL text is needed; the dict keys are just labels.
        for sql in TABLES.values():
            cursor.execute(sql)
        connection.commit()
    finally:
        cursor.close()
def load_file_into_database(path_to_file, connection):
    """Store one document, its words and each word's characters.

    Args:
        path_to_file: Path handed to ``extract_file_words``; it is also
            stored as the ``name`` of the new ``documents`` row.
        connection: Open ``sqlite3`` connection that already contains the
            ``documents``, ``words`` and ``characters`` tables.

    All rows are committed together once every insert has succeeded. If an
    insert raises, this function commits nothing and still closes its cursor.
    """
    # NOTE(review): extract_file_words is not defined in this file; it is
    # presumably supplied by the `cppy.cp_util` star import and yields the
    # document's words as strings -- confirm against that module.
    words = extract_file_words(path_to_file)

    cursor = connection.cursor()
    try:
        cursor.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
        doc_id = cursor.lastrowid

        for word in words:
            cursor.execute(
                "INSERT INTO words (doc_id, value) VALUES (?, ?)", (doc_id, word)
            )
            # Read the new word's row id before the batch insert below.
            word_id = cursor.lastrowid
            # One batched call per word instead of one statement per character.
            cursor.executemany(
                "INSERT INTO characters (word_id, value) VALUES (?, ?)",
                [(word_id, char) for char in word],
            )
        connection.commit()
    finally:
        cursor.close()
# Build the database and load the data into it (first run only)
db_path = 'tfdb'
if not os.path.isfile(db_path):
    # sqlite3's `with connection:` only commits or rolls back; it does not
    # close the connection, so it is closed explicitly here.
    build_succeeded = False
    connection = sqlite3.connect(db_path)
    try:
        with connection:
            create_db_schema(connection)
            # NOTE(review): testfilepath is not defined in this file; it is
            # presumably supplied by the `cppy.cp_util` star import -- confirm.
            load_file_into_database(testfilepath, connection)
        build_succeeded = True
    finally:
        connection.close()
        # A half-built file would make every later run skip loading (the
        # isfile check above) and print nothing, so remove it on failure.
        if not build_succeeded and os.path.isfile(db_path):
            os.remove(db_path)

# Query and print the ten most frequent words
connection = sqlite3.connect(db_path)
try:
    rows = connection.execute(
        "SELECT value, COUNT(*) as C FROM words GROUP BY value ORDER BY C DESC LIMIT 10"
    ).fetchall()
finally:
    connection.close()

for value, count in rows:
    print(value, '-', count)