forked from p46318075/CodePattern
parent
b86f626e94
commit
44c1f9eb1e
@ -0,0 +1,30 @@
|
||||
import re
|
||||
from cppy.cp_util import *
|
||||
|
||||
|
||||
def extractwords(str_data):
    """Split text into lowercase words and drop the stop words.

    Every run of non-word characters or underscores is replaced by a single
    space, the result is lowercased and split on whitespace, and any word
    contained in the collection returned by ``get_stopwords()`` is removed.

    Args:
        str_data: The text to tokenize.

    Returns:
        A list of the remaining lowercase words, in their original order.
    """
    # Raw string: in a plain literal '\W' is an invalid string escape, which
    # emits a SyntaxWarning on current Python versions. The compiled pattern
    # is unchanged.
    pattern = re.compile(r'[\W_]+')
    word_list = pattern.sub(' ', str_data).lower().split()
    stop_words = get_stopwords()
    return [w for w in word_list if w not in stop_words]
|
||||
|
||||
def frequencies(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: An iterable of hashable items (words).

    Returns:
        A dict mapping each distinct word to its number of occurrences;
        keys appear in order of first occurrence.
    """
    counts = {}
    for token in word_list:
        if token in counts:
            counts[token] += 1
        else:
            # First time this word is seen.
            counts[token] = 1
    return counts
|
||||
|
||||
def sort(word_freq):
    """Return the (word, count) pairs of ``word_freq`` ordered by count.

    Args:
        word_freq: A mapping from word to count.

    Returns:
        A list of ``(word, count)`` tuples, highest count first. Pairs with
        equal counts keep the order they had in the mapping.
    """
    ranked = list(word_freq.items())
    # The sort is stable even with reverse=True, so ties stay in mapping order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
|
||||
|
||||
def printall(word_freqs, n=10):
    """Print the first ``n`` (word, count) pairs, one per line.

    Each line is written as ``word - count``.

    Args:
        word_freqs: A sliceable sequence of ``(word, count)`` pairs.
        n: Upper bound of the slice taken from the front of ``word_freqs``
            (default 10).
    """
    leading = word_freqs[:n]
    for entry in leading:
        word, freq = entry
        print(word, '-', freq)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the test file, tokenize it, count the words,
    # rank them by count and print the leading entries.
    printall(
        sort(
            frequencies(
                extractwords(
                    read_file(testfilepath)
                )
            )
        )
    )
|
Loading…
Reference in new issue