You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import string
from collections import Counter
from cppy . cp_util import *
################################
# data
################################
# Shared module-level state, written by the procedures below.

# Raw text of the input file; replaced by read_file().
data = ' '
# Tokens kept after stop-word filtering; replaced by extractwords().
words = list()
# (word, count) tuples; populated by frequencies() and replaced by sort().
word_freqs = list()
################################
# procedures
################################
def read_file(path_to_file):
    """Read a whole text file into the module-level ``data`` string.

    Args:
        path_to_file: Path of the file to read; it is decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    global data
    # Canonical codec name with no surrounding whitespace, so the lookup does
    # not depend on encoding-name normalisation.
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
def extractwords():
    """Tokenise ``data`` and store the surviving tokens in the global ``words``.

    ``data`` is lower-cased and split on whitespace. A token is dropped when
    it is listed in the stop-word file at ``stopwordfilepath`` or is a single
    ASCII lowercase letter.

    ``stopwordfilepath`` is not defined in this file; presumably it is
    exported by ``cppy.cp_util`` -- confirm against that module.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    global words
    with open(stopwordfilepath, encoding='utf-8') as f:
        # Treat commas like whitespace, then split on whitespace: "a,b",
        # "a, b", "a , b" and one-entry-per-line files all parse to the same
        # set, and a trailing newline never sticks to the last entry.
        stop_words = set(f.read().replace(',', ' ').split())
    stop_words.update(string.ascii_lowercase)
    words = [word for word in data.lower().split() if word not in stop_words]
def frequencies():
    """Count each distinct word of ``words`` into the global ``word_freqs``.

    Afterwards ``word_freqs`` is a list of ``(word, count)`` tuples, one per
    distinct word, in order of first occurrence in ``words``.
    """
    global word_freqs
    # Counter aggregates repeated words; rebinding instead of extending keeps
    # a second call from duplicating entries.
    word_freqs = list(Counter(words).items())
def sort():
    """Rebuild the global ``word_freqs`` ordered by descending count.

    The counts are taken afresh from the global ``words``; whatever
    ``word_freqs`` held before is discarded. Words with equal counts keep
    the order in which they first appear in ``words``.
    """
    global word_freqs
    tally = Counter(words)
    # Stable sort on the count, highest first, so ties stay in first-seen order.
    word_freqs = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
# Script entry point. The guard compares against the exact string "__main__"
# (no padding), which is the value __name__ has when the file is run directly.
if __name__ == "__main__":
    # Pipeline: load the text, tokenise and filter it, count, then order.
    # testfilepath is not defined in this file; presumably it comes from
    # cppy.cp_util -- confirm against that module.
    read_file(testfilepath)
    extractwords()
    frequencies()
    sort()

    # Print the ten most frequent words with their counts.
    for word, count in word_freqs[:10]:
        print(word, ' - ', count)