李兵提议

pull/1/head
zj3D 9 months ago
parent a647f06f32
commit 66ecf28d15

@ -0,0 +1,28 @@
from cppy.cp_util import testfilepath
from sqlalchemy.orm import sessionmaker
from createDb import TextFile, WordFrequency,engine
def get_top_n_word_frequencies(filepath, n=10):
    """Print the ``n`` most frequent words stored for ``filepath``.

    Looks up the ``TextFile`` row whose ``filepath`` matches, then prints
    its ``WordFrequency`` rows as ``word: frequency`` lines, ordered by
    frequency descending. If no matching file row exists, a "not found"
    message is printed instead.

    Args:
        filepath: Value matched against ``TextFile.filepath``.
        n: Maximum number of rows to print (default 10).
    """
    Session = sessionmaker(bind=engine)
    session = Session()
    # try/finally so the session is closed even when a query raises.
    try:
        textfile = session.query(TextFile).filter_by(filepath=filepath).first()
        if not textfile:
            print(f"File {filepath} not found in the database.")
            return

        # Fetch the word frequencies, sorted by frequency descending,
        # and keep only the first n rows.
        word_freqs = (
            session.query(WordFrequency)
            .filter_by(textfile=textfile)
            .order_by(WordFrequency.frequency.desc())
            .limit(n)
            .all()
        )
        # Print the n most frequent words.
        for wf in word_freqs:
            print(f"{wf.word}: {wf.frequency}")
    finally:
        session.close()
if __name__ == "__main__":
    # Script entry point: report the top words for testfilepath
    # (imported from cppy.cp_util) using the default n.
    get_top_n_word_frequencies(testfilepath)

@ -0,0 +1,43 @@
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
import os
from cppy.cp_util import db_filename
# Define the data-model base class and the database connection.
Base = declarative_base()
# Directory that contains this file.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Full path of the database file, located next to this file.
db_file_path = os.path.join(current_dir, db_filename)
DATABASE_URI = f"sqlite:///{db_file_path}"
# Create the database engine; echo=True makes SQLAlchemy log the SQL it emits.
engine = create_engine(DATABASE_URI, echo=True)
class TextFile(Base):
    """ORM model for the ``text_files`` table: one row per stored file."""

    __tablename__ = 'text_files'

    id = Column(Integer, primary_key=True)
    # Declared unique, so inserting the same path twice violates the constraint.
    filepath = Column(String, unique=True)
    content = Column(String)
    # WordFrequency rows linked to this file; mirrors WordFrequency.textfile.
    words = relationship("WordFrequency", back_populates="textfile")

    def __repr__(self):
        # content is deliberately left out so the repr stays short.
        return f"TextFile(id={self.id!r}, filepath={self.filepath!r})"
class WordFrequency(Base):
    """ORM model for the ``word_frequencies`` table: one word count per row."""

    __tablename__ = 'word_frequencies'

    id = Column(Integer, primary_key=True)
    word = Column(String)
    frequency = Column(Integer)
    # Foreign key to the owning text_files row.
    textfile_id = Column(Integer, ForeignKey('text_files.id'))
    # Owning TextFile object; mirrors TextFile.words.
    textfile = relationship("TextFile", back_populates="words")

    def __repr__(self):
        return (
            f"WordFrequency(id={self.id!r}, word={self.word!r}, "
            f"frequency={self.frequency!r}, textfile_id={self.textfile_id!r})"
        )
if __name__ == '__main__':
    # Start from a clean database: delete any existing database file, then
    # create all tables declared on Base.
    # EAFP: attempt the removal directly instead of checking existence first,
    # which avoids a race between the check and the delete.
    try:
        os.remove(db_file_path)
    except FileNotFoundError:
        pass
    Base.metadata.create_all(engine)

@ -0,0 +1,49 @@
from collections import Counter
from cppy.cp_util import re_split,read_file,testfilepath,get_stopwords
from createDb import TextFile, WordFrequency,engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
# Session factory bound to the engine imported from createDb.
Session = sessionmaker(bind=engine)
def store_textfile(session, filepath):
    """Read ``filepath`` and persist it as a new ``TextFile`` row.

    Args:
        session: SQLAlchemy session used to add and commit the row.
        filepath: Path passed to ``read_file`` and stored on the row.

    Returns:
        The committed ``TextFile`` object, or ``None`` when the commit raises
        ``IntegrityError`` (the session is rolled back and an "already
        exists" message is printed).
    """
    record = TextFile(filepath=filepath, content=read_file(filepath))
    session.add(record)
    try:
        session.commit()
    except IntegrityError:
        # Undo the failed insert so the session stays usable.
        session.rollback()
        print(f"File {filepath} already exists in the database.")
        return None
    else:
        return record
def update_word_frequencies(session, textfile):
    """Count the non-stopword words of ``textfile`` and store the counts.

    ``textfile.content`` is lower-cased and split with ``re_split``; words
    found in ``get_stopwords()`` are dropped; one ``WordFrequency`` row per
    distinct remaining word is added to ``session`` and committed. If the
    commit raises, the session is rolled back and the error is printed
    rather than propagated.

    Args:
        session: SQLAlchemy session used to add and commit the rows.
        textfile: ``TextFile`` whose ``content`` is analysed and to which
            the new rows are linked.
    """
    words = re_split(textfile.content.lower())
    # Build a set once so every membership test below is O(1), whatever
    # container get_stopwords() returns.
    # NOTE(review): assumes get_stopwords() yields hashable words — confirm.
    stopwords = set(get_stopwords())
    word_counts = Counter(word for word in words if word not in stopwords)

    session.add_all(
        [
            WordFrequency(word=word, frequency=count, textfile=textfile)
            for word, count in word_counts.items()
        ]
    )
    try:
        session.commit()
    except Exception as e:
        # Best-effort: roll back so the session stays usable and report.
        session.rollback()
        print(f"Error updating word frequencies: {e}")
def process_textfile(filepath):
    """Store ``filepath`` in the database and record its word frequencies.

    Opens a new session, stores the file via ``store_textfile`` and, when
    that returns a row, computes its word counts via
    ``update_word_frequencies``.

    Args:
        filepath: Path of the text file to process.
    """
    session = Session()
    # try/finally so the session is closed even if a helper raises
    # (for example, read_file failing inside store_textfile).
    try:
        textfile = store_textfile(session, filepath)
        if textfile:
            update_word_frequencies(session, textfile)
    finally:
        session.close()
if __name__ == "__main__":
    # Script entry point: load testfilepath (imported from cppy.cp_util)
    # into the database.
    process_textfile(testfilepath)
Loading…
Cancel
Save