forked from p46318075/CodePattern
				
			
							parent
							
								
									b86f626e94
								
							
						
					
					
						commit
						44c1f9eb1e
					
				@ -0,0 +1,30 @@
 | 
				
			|||||||
 | 
					import re
 | 
				
			||||||
 | 
					from cppy.cp_util import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def extractwords(str_data):
					    """Split text into lowercase words and drop stop words.

					    Every run of non-word characters and underscores is replaced by a
					    single space, the result is lowercased and split on whitespace, and
					    words found in the stop-word collection are removed.

					    Args:
					        str_data: The raw text to tokenize.

					    Returns:
					        A list of the remaining lowercase words, in their original order.
					    """
					    # Raw string: '\W' inside a plain literal is an invalid escape
					    # sequence and triggers a warning on current Python versions.
					    separators = re.compile(r'[\W_]+')
					    word_list = separators.sub(' ', str_data).lower().split()
					    # get_stopwords is not defined in this file; presumably it is
					    # provided by the star import from cppy.cp_util -- TODO confirm.
					    stop_words = get_stopwords()
					    return [w for w in word_list if w not in stop_words]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def frequencies(word_list):
					    """Count how many times each word occurs.

					    Args:
					        word_list: An iterable of hashable items (words).

					    Returns:
					        A plain dict mapping each distinct word to its number of
					        occurrences, keyed in first-seen order.
					    """
					    from collections import Counter
					    # iter() makes Counter count the elements one by one even if a
					    # mapping is passed in, matching a plain loop over word_list.
					    return dict(Counter(iter(word_list)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def sort(word_freq):
					    """Order word counts from most to least frequent.

					    Args:
					        word_freq: A mapping of word -> count.

					    Returns:
					        A list of (word, count) tuples sorted by descending count.
					        Entries with equal counts keep the mapping's iteration order,
					        because sorted() is stable even with reverse=True.
					    """
					    return sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def printall(word_freqs, n=10):
					    """Print the first n (word, count) pairs, one per line.

					    Each line is written as ``word - count``.

					    Args:
					        word_freqs: A sliceable sequence of (word, count) pairs.
					        n: Maximum number of pairs to print. Defaults to 10.
					    """
					    for word, freq in word_freqs[:n]:
					        print(word, '-', freq)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Script entry point: read the text, tokenize and filter it, count
					    # the words, rank them by frequency, and print the top entries.
					    # read_file and testfilepath are not defined in this file;
					    # presumably they come from the cppy.cp_util star import -- TODO confirm.
					    txtcontent = read_file(testfilepath)
					    word_list = extractwords(txtcontent)
					    word_freqs = frequencies(word_list)
					    word_sort = sort(word_freqs)
					    printall(word_sort)
 | 
				
			||||||
					Loading…
					
					
				
		Reference in new issue