diff --git a/计算设备/数据库/ORM/DataQuery.py b/计算设备/数据库/ORM/DataQuery.py
new file mode 100644
index 0000000..73830b8
--- /dev/null
+++ b/计算设备/数据库/ORM/DataQuery.py
@@ -0,0 +1,37 @@
+from cppy.cp_util import testfilepath
+
+from sqlalchemy.orm import sessionmaker
+from createDb import TextFile, WordFrequency,engine
+
+
+def get_top_n_word_frequencies(filepath, n=10):
+    """Print the n most frequent words recorded for a stored text file.
+
+    Looks up the TextFile row whose filepath equals ``filepath`` and prints
+    its WordFrequency rows as "word: frequency", highest frequency first,
+    at most ``n`` of them. Prints a "not found" message when no TextFile
+    row matches.
+    """
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    # try/finally so the session is closed even when a query raises.
+    try:
+        textfile = session.query(TextFile).filter_by(filepath=filepath).first()
+        if textfile:
+            # Sort by frequency in descending order, then keep the first n.
+            word_freqs = (session.query(WordFrequency)
+                          .filter_by(textfile=textfile)
+                          .order_by(WordFrequency.frequency.desc())
+                          .limit(n)
+                          .all())
+            # Print the n most frequent words.
+            for wf in word_freqs:
+                print(f"{wf.word}: {wf.frequency}")
+        else:
+            print(f"File {filepath} not found in the database.")
+    finally:
+        session.close()
+
+
+if __name__ == '__main__':
+    get_top_n_word_frequencies(testfilepath)
diff --git a/计算设备/数据库/ORM/__pycache__/createDb.cpython-38.pyc b/计算设备/数据库/ORM/__pycache__/createDb.cpython-38.pyc
new file mode 100644
index 0000000..9de75d7
Binary files /dev/null and b/计算设备/数据库/ORM/__pycache__/createDb.cpython-38.pyc differ
diff --git a/计算设备/数据库/ORM/createDb.py b/计算设备/数据库/ORM/createDb.py
new file mode 100644
index 0000000..d5e7451
--- /dev/null
+++ b/计算设备/数据库/ORM/createDb.py
@@ -0,0 +1,46 @@
+from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship
+import os
+from cppy.cp_util import db_filename
+
+
+# Declarative base shared by the data models defined below.
+Base = declarative_base()
+
+# Directory that contains this file.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Full path of the database file, placed next to this module.
+db_file_path = os.path.join(current_dir, db_filename)
+DATABASE_URI = f"sqlite:///{db_file_path}"
+
+# Create the database engine.
+engine = create_engine(DATABASE_URI, echo=True)
+
+
+class TextFile(Base):
+    """A stored text file: its unique path and its content."""
+    __tablename__ = 'text_files'
+    id = Column(Integer, primary_key=True)
+    filepath = Column(String, unique=True)
+    content = Column(String)
+    words = relationship("WordFrequency", back_populates="textfile")
+
+
+class WordFrequency(Base):
+    """The frequency of one word within one TextFile."""
+    __tablename__ = 'word_frequencies'
+    id = Column(Integer, primary_key=True)
+    word = Column(String)
+    frequency = Column(Integer)
+    textfile_id = Column(Integer, ForeignKey('text_files.id'))
+    textfile = relationship("TextFile", back_populates="words")
+
+
+if __name__ == '__main__':
+    # Remove an existing database file before the tables are created.
+    if os.path.exists(db_file_path):
+        os.remove(db_file_path)
+
+    Base.metadata.create_all(engine)
diff --git a/计算设备/数据库/ORM/processData.py b/计算设备/数据库/ORM/processData.py
new file mode 100644
index 0000000..dc5fae6
--- /dev/null
+++ b/计算设备/数据库/ORM/processData.py
@@ -0,0 +1,63 @@
+from collections import Counter
+from cppy.cp_util import re_split,read_file,testfilepath,get_stopwords
+from createDb import TextFile, WordFrequency,engine
+
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import sessionmaker
+
+
+Session = sessionmaker(bind=engine)
+
+
+def store_textfile(session, filepath):
+    """Read ``filepath`` and store it as a TextFile row.
+
+    Returns the committed TextFile, or None (after rolling back and
+    printing a message) when the commit raises IntegrityError.
+    """
+    content = read_file(filepath)
+    textfile = TextFile(filepath=filepath, content=content)
+    session.add(textfile)
+    try:
+        session.commit()
+    except IntegrityError:
+        session.rollback()
+        print(f"File {filepath} already exists in the database.")
+        return None
+    return textfile
+
+
+def update_word_frequencies(session, textfile):
+    """Count the non-stopword words of ``textfile`` and store the counts.
+
+    The content is lower-cased and split with re_split; one WordFrequency
+    row per distinct remaining word is added and committed. If the commit
+    raises, the transaction is rolled back and the error is printed.
+    """
+    words = re_split(textfile.content.lower())
+    stopwords = get_stopwords()
+    word_counts = Counter(word for word in words if word not in stopwords)
+    for word, count in word_counts.items():
+        word_freq = WordFrequency(word=word, frequency=count, textfile=textfile)
+        session.add(word_freq)
+    try:
+        session.commit()
+    except Exception as e:
+        session.rollback()
+        print(f"Error updating word frequencies: {e}")
+
+
+def process_textfile(filepath):
+    """Store ``filepath`` and, if it was newly stored, its word frequencies."""
+    session = Session()
+    # try/finally so the session is closed even when a step raises.
+    try:
+        textfile = store_textfile(session, filepath)
+        if textfile:
+            update_word_frequencies(session, textfile)
+    finally:
+        session.close()
+
+
+if __name__ == '__main__':
+    process_textfile(testfilepath)