forked from p46318075/CodePattern
parent
c8946209bf
commit
ebe28f7670
@ -0,0 +1,36 @@
|
||||
# The wildcard import is kept first so that the explicit imports below take
# precedence over any same-named symbols it may export.
from cppy.cp_util import *

import re
from collections import Counter
from dataclasses import dataclass
from typing import Optional
|
||||
|
||||
@dataclass
class WordFrequency:
    """Compute the most frequent words of a text, skipping stop words.

    The text is lower-cased and split into runs of word characters; tokens
    that appear in ``stop_words`` or are shorter than three characters are
    dropped before counting.
    """

    # Raw text to analyse.
    text: str
    # Words to exclude from counting. When left as None, __post_init__
    # replaces it with the result of get_stopwords().
    stop_words: Optional[set] = None

    # Deliberately un-annotated so that @dataclass does not turn it into a
    # field; compiled once and shared by every instance.
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    def __post_init__(self):
        """Fill in the default stop-word list and normalise its container."""
        # No stop-word list supplied: fall back to the project default.
        if self.stop_words is None:
            self.stop_words = get_stopwords()
        # A list/tuple would make every membership test in tokenize() linear;
        # convert it once. Existing set objects are kept as-is (no copy) so
        # callers holding a reference to them still see the same object.
        if isinstance(self.stop_words, (list, tuple)):
            self.stop_words = set(self.stop_words)

    def tokenize(self):
        """Return the lower-cased words of ``text`` that survive filtering.

        Returns:
            A list of tokens, in order of appearance, excluding stop words
            and any token of fewer than three characters.
        """
        stop_words = self.stop_words
        return [
            word
            for word in self._WORD_PATTERN.findall(self.text.lower())
            if word not in stop_words and len(word) > 2
        ]

    def get_top_n(self, n=10):
        """Return the ``n`` most common tokens with their counts.

        Args:
            n: Maximum number of entries to return (default 10).

        Returns:
            A list of ``(word, count)`` tuples as produced by
            ``Counter.most_common(n)``, most frequent first.
        """
        return Counter(self.tokenize()).most_common(n)
|
||||
|
||||
|
||||
# Usage example
if __name__ == '__main__':
    # Load the input text and wrap it in a WordFrequency instance; the
    # stop-word list is left to its default.
    counter = WordFrequency(read_file())

    # Fetch the most frequent words (default top 10) and print them.
    print_word_freqs(counter.get_top_n())
|
Loading…
Reference in new issue