import string
from cppy.cp_util import stopwordfilepath,testfilepath

# Term-frequency script: count the non-stop-words of the test file and print
# the ten most frequent ones as "word - count".


def _iter_words(line):
    """Yield every maximal run of alphanumeric characters in *line*, lowercased.

    A run that reaches the very end of the line is yielded too, so the last
    word of a file without a trailing newline is not lost.
    """
    start = None
    for i, c in enumerate(line):
        if c.isalnum():
            if start is None:
                # A word begins here.
                start = i
        elif start is not None:
            # A word ends just before this character.
            yield line[start:i].lower()
            start = None
    if start is not None:
        yield line[start:].lower()


# Prepare the stop-word list. The file is comma-separated; entries are
# stripped because words contain only alphanumeric characters, so an entry
# carrying stray whitespace (e.g. a trailing newline) could never match.
with open(stopwordfilepath, encoding='utf-8') as f:
    stop_words = [entry.strip() for entry in f.read().split(',')]
# Single letters are treated as stop words as well.
stop_words.extend(string.ascii_lowercase)
# Set copy for O(1) membership tests inside the counting loop.
_stop_lookup = set(stop_words)

# word -> [occurrence count, sequence number of its latest occurrence]
_word_stats = {}
_sequence = 0
with open(testfilepath, encoding='utf-8') as f:
    for line in f:
        for word in _iter_words(line):
            # Skip stop words.
            if word in _stop_lookup:
                continue
            stats = _word_stats.setdefault(word, [0, 0])
            stats[0] += 1
            stats[1] = _sequence
            _sequence += 1

# Rank by descending count. Among equal counts, the word that reached that
# count earliest comes first; the sequence number of a word's latest
# occurrence is exactly the moment it reached its final count.
word_freqs = [
    [word, count]
    for word, (count, _) in sorted(
        _word_stats.items(),
        key=lambda item: (-item[1][0], item[1][1]),
    )
]

for tf in word_freqs[:10]:
    print(tf[0], '-', tf[1])