parent
a647f06f32
commit
66ecf28d15
Binary file not shown.
@ -0,0 +1,43 @@
|
||||
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import relationship
|
||||
import os
|
||||
from cppy.cp_util import db_filename
|
||||
|
||||
|
||||
# Define the data models' declarative base and the database connection.
Base = declarative_base()

# Directory that contains this source file.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Full path of the SQLite database file, placed next to this module.
db_file_path = os.path.join(current_dir, db_filename)
DATABASE_URI = f"sqlite:///{db_file_path}"

# Create the database engine. echo=True asks SQLAlchemy to log the SQL
# statements it emits.
engine = create_engine(
    DATABASE_URI,
    echo=True,
)
|
||||
|
||||
|
||||
class TextFile(Base):
    """ORM model for one stored text file (table ``text_files``)."""

    __tablename__ = 'text_files'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Path of the file; declared unique, so the same path cannot be
    # inserted twice.
    filepath = Column(String, unique=True)
    # Text content of the file.
    content = Column(String)

    # WordFrequency rows belonging to this file; paired with
    # WordFrequency.textfile through back_populates.
    words = relationship("WordFrequency", back_populates="textfile")
|
||||
|
||||
class WordFrequency(Base):
    """ORM model for a word count tied to a text file (table ``word_frequencies``)."""

    __tablename__ = 'word_frequencies'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # The word and its count.
    word = Column(String)
    frequency = Column(Integer)
    # Foreign key to the owning row in text_files.
    textfile_id = Column(Integer, ForeignKey('text_files.id'))

    # Owning TextFile; paired with TextFile.words through back_populates.
    textfile = relationship("TextFile", back_populates="words")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # When run as a script, rebuild the database from scratch: delete the
    # database file if one already exists, then create every table that
    # is registered on Base.
    stale_db_present = os.path.exists(db_file_path)
    if stale_db_present:
        os.remove(db_file_path)

    Base.metadata.create_all(engine)
|
@ -0,0 +1,49 @@
|
||||
from collections import Counter
|
||||
from cppy.cp_util import re_split,read_file,testfilepath,get_stopwords
|
||||
from createDb import TextFile, WordFrequency,engine
|
||||
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
# Session factory bound to the engine imported from createDb; calling
# Session() yields a new session.
Session = sessionmaker(bind=engine)
|
||||
|
||||
|
||||
def store_textfile(session, filepath):
    """Read the file at *filepath* and insert it as a TextFile row.

    Args:
        session: SQLAlchemy session used for the insert and commit.
        filepath: Path of the file to read and store.

    Returns:
        The committed TextFile instance, or None when the commit raised
        IntegrityError (reported as the file already being in the
        database; TextFile.filepath is declared unique).
    """
    record = TextFile(filepath=filepath, content=read_file(filepath))
    session.add(record)

    try:
        session.commit()
    except IntegrityError:
        # Undo the failed insert so the session remains usable.
        session.rollback()
        print(f"File {filepath} already exists in the database.")
        return None
    else:
        return record
|
||||
|
||||
def update_word_frequencies(session, textfile):
    """Count the words of *textfile* and store one WordFrequency row per word.

    The content is lower-cased, split with re_split, and filtered against
    get_stopwords() before counting.

    Args:
        session: SQLAlchemy session used to add and commit the rows.
        textfile: TextFile instance whose ``content`` is analysed and to
            which each new row is attached.

    Returns:
        None. Any exception raised by the commit is rolled back and
        reported on stdout rather than propagated.
    """
    tokens = re_split(textfile.content.lower())
    stopwords = get_stopwords()
    tally = Counter(token for token in tokens if token not in stopwords)

    # One row per distinct word, all linked to the same text file.
    session.add_all(
        WordFrequency(word=token, frequency=occurrences, textfile=textfile)
        for token, occurrences in tally.items()
    )

    try:
        session.commit()
    except Exception as e:
        session.rollback()
        print(f"Error updating word frequencies: {e}")
|
||||
|
||||
def process_textfile(filepath):
    """Store the file at *filepath* and its word frequencies in the database.

    Opens a new session, stores the file via store_textfile and, if that
    returned a TextFile, records its word counts via
    update_word_frequencies.

    Args:
        filepath: Path of the text file to process.

    Returns:
        None.

    Raises:
        Whatever store_textfile or update_word_frequencies propagate
        (for example an error while reading the file). The session is
        closed in every case.
    """
    session = Session()
    try:
        textfile = store_textfile(session, filepath)
        # store_textfile returns None when the file was not stored.
        if textfile is not None:
            update_word_frequencies(session, textfile)
    finally:
        # Always release the session, even when a step above raises;
        # previously an exception skipped close() and leaked it.
        session.close()
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # When run as a script, process the test file path provided by
    # cppy.cp_util.
    process_textfile(testfilepath)
|
||||
|
Loading…
Reference in new issue