from cppy.cp_util import stopwordfilepath, testfilepath
import string
from collections import Counter

# Prepare the stop word list
with open(stopwordfilepath) as f:
    stop_words = set(f.read().split(','))
stop_words.update(string.ascii_lowercase)

# Read the file and count word frequencies
word_freqs = Counter()
with open(testfilepath, encoding='utf8') as f:
    for line_num, line in enumerate(f, 1):
        start_char = None
        for i, c in enumerate(line):
            if start_char is None and c.isalnum():
                # Start of a new word
                start_char = i
            elif start_char is not None and not c.isalnum():
                # End of the current word: normalize and count it
                word = line[start_char:i].lower()
                if word not in stop_words:
                    word_freqs[word] += 1
                start_char = None

# Print the 10 most common words
for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")

'''
Compared with A01:
Using collections.Counter to count word frequencies simplifies the code and improves efficiency.
Using enumerate to get the line number and line content, and using a set to store the stop words, both improve the performance and readability of the code.
Using the most_common method to retrieve the most common words makes the output more concise.
'''
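
# A minimal alternative sketch (not part of the original exercise): the manual
# character scan above can be replaced by a regular-expression tokenizer plus
# Counter.update, which is the simplification the note about Counter points at.
# The helper name count_words_regex is hypothetical; it is defined here purely
# for illustration and is never called by this script.
import re

def count_words_regex(path, stop_words):
    # Tokenize each line into lowercase alphanumeric runs and count them,
    # skipping stop words; Counter.update condenses the whole inner loop.
    freqs = Counter()
    with open(path, encoding='utf8') as f:
        for line in f:
            tokens = re.findall(r'[a-z0-9]+', line.lower())
            freqs.update(t for t in tokens if t not in stop_words)
    return freqs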