from cppy.cp_util import *
from dataclasses import dataclass
from collections import Counter
import re

@dataclass
class WordFrequency:
    """Count how often each word occurs in a piece of text.

    Words are extracted from the lower-cased text; stop words and words of
    one or two characters are left out of the counts.
    """

    # Raw text to analyse.
    text: str
    # Words to ignore. When left as None, __post_init__ replaces it with the
    # result of get_stopwords() (presumably a set of words -- confirm against
    # the helper's definition).
    stop_words: set = None

    def __post_init__(self):
        """Fall back to the default stop-word list when none was supplied."""
        if self.stop_words is not None:
            return
        self.stop_words = get_stopwords()

    def tokenize(self):
        """Return the words of ``text`` that survive filtering, in order.

        The text is lower-cased and split with a ``\\w+`` regex; a word is
        dropped if it is a stop word or has two characters or fewer.
        """
        kept = []
        for token in re.findall(r'\b\w+\b', self.text.lower()):
            # Skip stop words first, then anything too short to be useful.
            if token in self.stop_words or len(token) <= 2:
                continue
            kept.append(token)
        return kept

    def get_top_n(self, n=10):
        """Return the ``n`` most frequent words as ``(word, count)`` pairs.

        Pairs are ordered from most to least frequent, as produced by
        ``Counter.most_common``.
        """
        counts = Counter()
        counts.update(self.tokenize())
        return counts.most_common(n)


# Usage example
if __name__ == '__main__':
    # Load the input text via read_file() and wrap it in an analyzer; no
    # stop-word set is passed, so WordFrequency falls back to get_stopwords().
    analyzer = WordFrequency(read_file())

    # Fetch the most frequent words (get_top_n defaults to n=10) and print them.
    print_word_freqs(analyzer.get_top_n())