You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.5 KiB
49 lines
1.5 KiB
9 months ago
|
from collections import Counter
|
||
|
from cppy.cp_util import re_split,read_file,testfilepath,get_stopwords
|
||
|
from createDb import TextFile, WordFrequency,engine
|
||
|
|
||
|
from sqlalchemy.exc import IntegrityError
|
||
|
from sqlalchemy.orm import sessionmaker
|
||
|
|
||
|
|
||
|
# Session factory bound to the `engine` imported from createDb; calling
# Session() yields a new session object.
Session = sessionmaker(bind=engine)
|
||
|
|
||
|
|
||
|
def store_textfile(session, filepath):
    """Read the file at ``filepath`` and persist it as a ``TextFile`` row.

    Args:
        session: Session used to add and commit the new row.
        filepath: Path handed to ``read_file`` and stored on the row.

    Returns:
        The committed ``TextFile`` instance, or ``None`` when the commit
        raised ``IntegrityError`` (the session is rolled back and a
        message is printed in that case).
    """
    record = TextFile(filepath=filepath, content=read_file(filepath))
    session.add(record)
    try:
        session.commit()
    except IntegrityError:
        # Undo the failed insert so the session remains usable.
        session.rollback()
        print(f"File {filepath} already exists in the database.")
        return None
    else:
        return record
|
||
|
|
||
|
def update_word_frequencies(session, textfile):
    """Count the non-stopword words of ``textfile`` and persist the counts.

    ``textfile.content`` is lower-cased and split with ``re_split``; words
    present in ``get_stopwords()`` are discarded. One ``WordFrequency`` row
    is added per distinct remaining word and the session is committed.

    Args:
        session: Session used to add and commit the rows.
        textfile: Object exposing ``content``; it is also passed as the
            ``textfile`` argument of every ``WordFrequency`` created.

    Returns:
        None. If the commit raises, the session is rolled back and the
        error is printed rather than re-raised.
    """
    words = re_split(textfile.content.lower())
    # Build the lookup set once so each membership test is O(1), whatever
    # container type get_stopwords() returns.
    stopwords = frozenset(get_stopwords())
    word_counts = Counter(word for word in words if word not in stopwords)

    session.add_all([
        WordFrequency(word=word, frequency=count, textfile=textfile)
        for word, count in word_counts.items()
    ])
    try:
        session.commit()
    except Exception as e:
        # Best-effort: keep the session usable and report the failure.
        session.rollback()
        print(f"Error updating word frequencies: {e}")
|
||
|
|
||
|
def process_textfile(filepath):
    """Store the file at ``filepath`` and record its word frequencies.

    Opens a session from the module-level ``Session`` factory, stores the
    file with ``store_textfile`` and, when that returns a record, calls
    ``update_word_frequencies`` for it.

    Args:
        filepath: Path of the text file to process.

    Returns:
        None.
    """
    session = Session()
    try:
        textfile = store_textfile(session, filepath)
        # store_textfile returns None when the file was not stored.
        if textfile is not None:
            update_word_frequencies(session, textfile)
    finally:
        # Close the session even if reading or storing the file raised.
        session.close()
|
||
|
|
||
|
|
||
|
|
||
|
# Script entry point: process the sample file exported by cppy.cp_util.
if __name__ == "__main__":
    process_textfile(testfilepath)
|
||
|
|