# CodePattern/A 代码模式/10 一盘大棋/1 最基础的写法.py

# 引入停用词表和测试文件的路径
from cppy.cp_util import stopwordfilepath, testfilepath

# Build the stop-word list: the comma-separated entries of the stop-word file
# plus every single lowercase ASCII letter.
# NOTE(review): entries are not stripped, so any whitespace or newline in the
# file stays attached to its entry -- confirm the file format.
with open(stopwordfilepath, encoding='utf-8') as stop_file:
    stop_words = stop_file.read().split(',')
# Extending a list with a string appends its characters one by one.
stop_words.extend('abcdefghijklmnopqrstuvwxyz')

# Scan the text line by line, extract every run of alphanumeric characters as
# a word, lower-case it, skip stop words, and count the remaining words.
stop_word_set = set(stop_words)  # built once so each membership test is O(1)
word_counts = {}  # word -> occurrences; a dict keeps first-seen order
with open(testfilepath, encoding='utf-8') as text_file:
    for line in text_file:
        start_char = None
        # The appended newline is a sentinel: it guarantees that a word
        # touching the end of the line (e.g. a last line without a trailing
        # newline) is still closed and counted.
        for i, c in enumerate(line + '\n'):
            if start_char is None:
                if c.isalnum():
                    # A word starts at this position.
                    start_char = i
            elif not c.isalnum():
                # The word ended just before this character.
                word = line[start_char:i].lower()
                if word not in stop_word_set:
                    word_counts[word] = word_counts.get(word, 0) + 1
                # Reset the start marker for the next word.
                start_char = None

# Expose the result as mutable [word, count] pairs in first-seen order.
word_freqs = [[word, count] for word, count in word_counts.items()]

# Sort the [word, count] pairs in place by count, highest first.
# list.sort is stable, also with reverse=True, so pairs with equal counts keep
# their existing relative order.
word_freqs.sort(key=lambda pair: pair[1], reverse=True)

# Print the ten most frequent words, one "word - count" line each.
for term, freq in word_freqs[:10]:
    print(term, '-', freq)