CodePattern/15 工程化/4 类属性 .py

from cppy.cp_util import *
from dataclasses import dataclass
from collections import Counter
import re

@dataclass
class WordFrequency:
    """Count word frequencies in a piece of text.

    Attributes:
        text: The raw text to analyse.
        stop_words: Words to exclude from the counts. When left as None it is
            replaced by the result of ``get_stopwords()`` right after
            initialisation.
    """
    text: str
    stop_words: set = None

    def __post_init__(self):
        """Fill in the stop-word collection when the caller did not pass one."""
        # Only None triggers the fallback; an explicitly empty collection is kept.
        if self.stop_words is None:
            self.stop_words = get_stopwords()

    def tokenize(self):
        """Return the lowercased words of ``text`` that survive filtering.

        A word is a run of ``\\w`` characters. Stop words and words of two
        characters or fewer are dropped; the remaining words keep their
        original order, duplicates included.
        """
        kept = []
        for token in re.findall(r'\b\w+\b', self.text.lower()):
            # Skip stop words and very short tokens.
            if token in self.stop_words or len(token) <= 2:
                continue
            kept.append(token)
        return kept

    def get_top_n(self, n=10):
        """Return the ``n`` most frequent words as ``(word, count)`` pairs.

        Pairs are ordered from the highest count to the lowest.
        """
        return Counter(self.tokenize()).most_common(n)


# Usage example
if __name__ == '__main__':
    # Read the input text, build a WordFrequency for it, then fetch and
    # print its most frequent words (default top 10).
    print_word_freqs(WordFrequency(read_file()).get_top_n())
修 12 8 months ago			`from cppy.cp_util import *`
			`from dataclasses import dataclass`
			`from collections import Counter`
			`import re`

			`@dataclass`
			`class WordFrequency:`
			`text: str`
			`stop_words: set = None`

			`def __post_init__(self):`
			`# 如果未提供停用词表`
			`if self.stop_words is None:`
			`self.stop_words = get_stopwords()`

			`def tokenize(self):`
			`# 分词并去除停用词`
			`words = re.findall(r'\b\w+\b', self.text.lower())`
			`filtered_words = [word for word in words if word not in self.stop_words and len(word)>2]`
			`return filtered_words`

			`def get_top_n(self, n=10):`
			`# 计算词频`
			`word_freqs = Counter(self.tokenize())`
			`return word_freqs.most_common(n)`


			`# 使用示例`
			`if __name__ == '__main__':`
			`# 创建WordFrequency实例`
			`text = read_file()`
			`word_freq = WordFrequency( text )`

			`# 获取并打印词频`
			`top_words = word_freq.get_top_n()`
			`print_word_freqs(top_words)`