from cppy.cp_util import *
from dataclasses import dataclass
from collections import Counter
from typing import Optional
import re


@dataclass
class WordFrequency:
    """Count word frequencies in a text, skipping stop words and short tokens.

    Attributes:
        text: The raw text to analyse.
        stop_words: Words to exclude from the count. When left as ``None``,
            the value returned by ``get_stopwords()`` (from ``cppy.cp_util``)
            is used instead.
    """

    text: str
    stop_words: Optional[set] = None

    def __post_init__(self):
        # No stop-word collection supplied: fall back to the shared default.
        if self.stop_words is None:
            self.stop_words = get_stopwords()

    def tokenize(self):
        """Return the filtered, lower-cased words of ``text`` in order.

        The text is lower-cased and split into ``\\w+`` runs. A word is kept
        only when it is longer than two characters and is not a stop word.

        Returns:
            list[str]: The surviving words, duplicates included.
        """
        excluded = self.stop_words
        # A list/tuple makes every ``in`` test a linear scan; build a set once
        # per call so each lookup is O(1). Other container types are used
        # as-is so their own membership semantics are preserved.
        if isinstance(excluded, (list, tuple)):
            excluded = set(excluded)

        words = re.findall(r'\b\w+\b', self.text.lower())
        # Cheap length test first, then the stop-word lookup.
        return [word for word in words if len(word) > 2 and word not in excluded]

    def get_top_n(self, n=10):
        """Return the ``n`` most frequent filtered words.

        Args:
            n: Maximum number of entries to return (default 10).

        Returns:
            list[tuple[str, int]]: ``(word, count)`` pairs as produced by
            ``Counter.most_common``, most frequent first.
        """
        return Counter(self.tokenize()).most_common(n)


# Usage example
if __name__ == '__main__':
    # Build a WordFrequency instance from the text supplied by read_file()
    # (helper from cppy.cp_util, called with its default arguments).
    text = read_file()
    word_freq = WordFrequency(text)

    # Compute the top words and hand them to print_word_freqs() for output.
    top_words = word_freq.get_top_n()
    print_word_freqs(top_words)