parent
a647f06f32
commit
66ecf28d15
Binary file not shown.
@ -0,0 +1,43 @@
|
|||||||
|
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
import os
|
||||||
|
from cppy.cp_util import db_filename
|
||||||
|
|
||||||
|
|
||||||
|
# Data-model base class and database connection setup.
# Declarative base that the ORM model classes in this module inherit from.
Base = declarative_base()

# Directory containing this module, so the database location does not depend
# on the process's current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Full path of the SQLite database file and the matching SQLAlchemy URL.
db_file_path = os.path.join(current_dir, db_filename)
DATABASE_URI = f"sqlite:///{db_file_path}"

# Engine for that database; echo=True enables SQLAlchemy's statement logging.
engine = create_engine(
    DATABASE_URI,
    echo=True,
)
|
||||||
|
|
||||||
|
|
||||||
|
class TextFile(Base):
    """ORM model for one stored text file (table ``text_files``).

    ``words`` holds the ``WordFrequency`` rows linked to this file; it is the
    reverse side of ``WordFrequency.textfile``.
    """

    __tablename__ = 'text_files'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Path of the text file; the unique constraint rejects a second row with
    # the same path.
    filepath = Column(String, unique=True)
    # Text stored for the file.
    content = Column(String)
    # One-to-many link to the per-word counts of this file.
    words = relationship("WordFrequency", back_populates="textfile")

    def __repr__(self):
        """Return a short debug representation.

        ``content`` is deliberately left out because it can be arbitrarily
        long.
        """
        return (
            f"{type(self).__name__}"
            f"(id={self.id!r}, filepath={self.filepath!r})"
        )
|
||||||
|
|
||||||
|
class WordFrequency(Base):
    """ORM model for one word count of one text file (table ``word_frequencies``).

    Each row references its owning ``TextFile`` through ``textfile_id``;
    ``textfile`` is the reverse side of ``TextFile.words``.
    """

    __tablename__ = 'word_frequencies'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # The word being counted.
    word = Column(String)
    # Count stored for ``word``.
    frequency = Column(Integer)
    # Foreign key to the owning row in ``text_files``.
    textfile_id = Column(Integer, ForeignKey('text_files.id'))
    # Many-to-one link back to the owning TextFile.
    textfile = relationship("TextFile", back_populates="words")

    def __repr__(self):
        """Return a short debug representation of this word count."""
        return (
            f"{type(self).__name__}"
            f"(id={self.id!r}, word={self.word!r}, "
            f"frequency={self.frequency!r}, textfile_id={self.textfile_id!r})"
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Rebuild the database from scratch: delete any existing database file,
    # then create all tables declared on Base.
    # Removing directly and tolerating a missing file avoids the gap between
    # an os.path.exists() check and the removal itself.
    try:
        os.remove(db_file_path)
    except FileNotFoundError:
        pass  # No previous database file to delete.

    Base.metadata.create_all(engine)
|
@ -0,0 +1,49 @@
|
|||||||
|
from collections import Counter
|
||||||
|
from cppy.cp_util import re_split,read_file,testfilepath,get_stopwords
|
||||||
|
from createDb import TextFile, WordFrequency,engine
|
||||||
|
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
|
|
||||||
|
# Session factory bound to the engine imported from createDb.
Session = sessionmaker(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
|
def store_textfile(session, filepath):
    """Read ``filepath`` and persist it as a new ``TextFile`` row.

    Args:
        session: SQLAlchemy session used to add and commit the row.
        filepath: Path passed to ``read_file`` and stored on the row.

    Returns:
        The committed ``TextFile``, or ``None`` when the commit raises
        ``IntegrityError`` (in which case the session is rolled back and a
        message is printed).
    """
    record = TextFile(filepath=filepath, content=read_file(filepath))
    session.add(record)
    try:
        session.commit()
    except IntegrityError:
        # Discard the failed insert before reporting the duplicate.
        session.rollback()
        print(f"File {filepath} already exists in the database.")
        return None
    else:
        return record
|
||||||
|
|
||||||
|
def update_word_frequencies(session, textfile):
    """Count the non-stopword tokens of ``textfile.content`` and store them.

    The content is lower-cased, split with ``re_split``, filtered against
    ``get_stopwords()``, and one ``WordFrequency`` row per distinct word is
    added to ``session`` and committed.

    Args:
        session: SQLAlchemy session used to add and commit the rows.
        textfile: ``TextFile`` whose ``content`` is analysed and to which the
            new rows are linked.

    Any exception raised by the commit is caught: the session is rolled back
    and the error is printed instead of propagating.
    """
    tokens = re_split(textfile.content.lower())
    stopwords = get_stopwords()
    tally = Counter(token for token in tokens if token not in stopwords)

    # One row per distinct word, each linked to the given text file.
    session.add_all(
        WordFrequency(word=token, frequency=occurrences, textfile=textfile)
        for token, occurrences in tally.items()
    )

    try:
        session.commit()
    except Exception as exc:
        session.rollback()
        print(f"Error updating word frequencies: {exc}")
|
||||||
|
|
||||||
|
def process_textfile(filepath):
    """Store the file at ``filepath`` and, if it was stored, its word counts.

    Opens a new session from ``Session``, calls ``store_textfile`` and, when
    that returns a row, ``update_word_frequencies``.

    Args:
        filepath: Path of the text file to process.

    Exceptions raised by the two helpers propagate to the caller, but the
    session is always closed first.
    """
    session = Session()
    try:
        textfile = store_textfile(session, filepath)
        # store_textfile returns None when the file was not stored.
        if textfile is not None:
            update_word_frequencies(session, textfile)
    finally:
        # Release the session even if reading or storing the file raised.
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: process the test file path exported by cppy.cp_util.
    process_textfile(testfilepath)
|
||||||
|
|
Loading…
Reference in new issue