You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
36 lines
958 B
36 lines
958 B
8 months ago
|
# The star import stays first so the explicit imports below take precedence
# over any same-named export it might bring in.
from cppy.cp_util import *

import re
from collections import Counter
from dataclasses import dataclass
from typing import Optional
|
||
|
|
||
|
@dataclass
class WordFrequency:
    """Count how often each word occurs in a piece of text.

    Attributes:
        text: Raw input text to analyse.
        stop_words: Words to leave out of the counts. When left as ``None``
            the collection returned by ``get_stopwords()`` is used. Any
            non-set iterable is converted to a ``set`` on construction.
    """

    text: str
    stop_words: Optional[set] = None

    def __post_init__(self):
        """Fill in the default stop words and normalise them to a set."""
        # No stop-word collection was supplied: fall back to the shared one.
        if self.stop_words is None:
            self.stop_words = get_stopwords()
        # tokenize() performs one membership test per word, so make sure the
        # lookup is hash-based. The type returned by get_stopwords() is not
        # visible from this file, and callers may pass a list. An existing
        # set/frozenset is kept as-is (no copy, identity preserved).
        if not isinstance(self.stop_words, (set, frozenset)):
            self.stop_words = set(self.stop_words)

    def tokenize(self):
        """Return the lower-cased words of ``text`` that pass the filter.

        Words are runs of regex word characters. Words of one or two
        characters and words found in ``stop_words`` are dropped. The
        original order and duplicates are preserved.
        """
        words = re.findall(r'\b\w+\b', self.text.lower())
        # Cheap length check first, then the stop-word lookup.
        return [word for word in words
                if len(word) > 2 and word not in self.stop_words]

    def get_top_n(self, n=10):
        """Return the ``n`` most frequent words as ``(word, count)`` pairs.

        Pairs are ordered from most to least frequent; fewer than ``n``
        pairs are returned when the text has fewer distinct words.
        """
        return Counter(self.tokenize()).most_common(n)
|
||
|
|
||
|
|
||
|
# Usage example
if __name__ == '__main__':
    # Read the input text, build the analyser, then print the top words
    # (get_top_n() is called with its default n).
    analyzer = WordFrequency(read_file())
    print_word_freqs(analyzer.get_top_n())
|