You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

65 lines
2.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
import cppy.cp_util as util
from collections import Counter
'''
状态机是计算机程序运行的基础理论。
使用状态机的风格来处理文件并计算词频,我们可以将整个过程分解为一系列状态转移。
每个状态代表处理过程中的一个阶段,比如“读取文件”、“分割单词”和“计算词频”等。
这种方法在Python中并不常见,但它展示了如何使用状态机来管理程序的状态和流程。
'''
class WordFrequencyStateMachine:
    """Count word frequencies in a text file with an explicit state machine.

    The ``state`` attribute names the step that ``run`` performs next:

    * ``'IDLE'``           -- nothing done yet; the file is read next.
    * ``'WORDS_SPLIT'``    -- file content is loaded; words are split next.
    * ``'CALCULATE_FREQ'`` -- words are available; they are counted next.
    * ``'DONE'``           -- ``word_freq`` holds the result.
    * ``'ERROR'``          -- a step failed; the failure was already printed.

    Every failing step moves the machine to ``'ERROR'`` so that ``run``
    always terminates instead of retrying the same step forever.
    """

    def __init__(self, file_path):
        """Store the path of the file to analyse and reset all results.

        Args:
            file_path: Path of the UTF-8 text file to read.
        """
        self.file_path = file_path
        self.content = None    # raw file text, set by transition_to_read_file
        self.words = None      # extracted words, set by transition_to_split_words
        self.word_freq = None  # Counter of words, set by transition_to_calculate_freq
        self.state = 'IDLE'

    def transition_to_read_file(self):
        """Read the whole file into ``content`` and advance to 'WORDS_SPLIT'.

        On any failure a message is printed and the state becomes 'ERROR'.
        """
        try:
            with open(self.file_path, 'r', encoding='utf-8') as file:
                self.content = file.read()
        except FileNotFoundError:
            print(f"文件 {self.file_path} 未找到。")
            self.state = 'ERROR'
        except Exception as e:
            # Best-effort reporting is kept from the original design; the
            # ERROR state is what stops run() from looping on the failure.
            print(f"读取文件时发生错误: {e}")
            self.state = 'ERROR'
        else:
            self.state = 'WORDS_SPLIT'

    def transition_to_split_words(self):
        """Split ``content`` into ``words`` and advance to 'CALCULATE_FREQ'.

        If no content has been loaded, a message is printed and the state
        becomes 'ERROR'.
        """
        if self.content is None:
            print("文件内容为空,无法分割单词。")
            self.state = 'ERROR'
            return
        self.words = util.extract_str_words(self.content)
        self.state = 'CALCULATE_FREQ'

    def transition_to_calculate_freq(self):
        """Count ``words`` into ``word_freq`` and advance to 'DONE'.

        If no word list is available, a message is printed and the state
        becomes 'ERROR'.
        """
        if self.words is None:
            print("单词列表为空,无法计算词频。")
            self.state = 'ERROR'
            return
        self.word_freq = Counter(self.words)
        self.state = 'DONE'

    def run(self):
        """Drive the machine until it reaches 'DONE' or cannot continue.

        Returns:
            The ``Counter`` of word frequencies, or ``None`` when a step
            failed before the frequencies could be computed.
        """
        handlers = {
            'IDLE': self.transition_to_read_file,
            'WORDS_SPLIT': self.transition_to_split_words,
            'CALCULATE_FREQ': self.transition_to_calculate_freq,
        }
        while self.state != 'DONE':
            if self.state == 'ERROR':
                # The failing transition has already reported the problem.
                break
            handler = handlers.get(self.state)
            if handler is None:
                print(f"未知状态: {self.state}")
                break
            handler()
        return self.word_freq
# Compute word frequencies with the state machine.
state_machine = WordFrequencyStateMachine(util.testfilepath)
word_frequencies = state_machine.run()
# Print the ten most common words. run() returns the machine's word_freq,
# which is still None if no frequencies were produced, so guard the call to
# most_common to avoid an AttributeError.
if word_frequencies is not None:
    util.print_word_freqs(word_frequencies.most_common(10))