pull/1/head
			
			
		
		
							parent
							
								
									2bfeabe429
								
							
						
					
					
						commit
						6edef230ac
					
				| @ -0,0 +1,70 @@ | ||||
| 
 | ||||
| import site | ||||
| import os,re | ||||
| import string,operator | ||||
| 
 | ||||
| ################################################################################ | ||||
| #  Variables | ||||
| ################################################################################ | ||||
# Sample text file used for testing. Earlier candidates were 'test.txt' and
# 'pride-and-prejudice.txt'; those assignments were immediately overwritten,
# so only this value ever took effect.
testfilename = 'Prey.txt'

site_packages = site.getsitepackages()
# Best-effort fallback: without an initial value, importing this module raised
# NameError whenever no site-packages entry matched the check below.
basePath = os.getcwd()
for package in site_packages:
    # No break on purpose: the last matching entry wins, as before.
    if 'package' in package:
        basePath = package
# Data files are looked up under <basePath>/cppy/data.
stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
| 
 | ||||
| 
 | ||||
| ################################################################################ | ||||
| #  Functions | ||||
| ################################################################################ | ||||
def read_file(path_to_file):
    """Return the whole content of a UTF-8 encoded text file as one string.

    Args:
        path_to_file: Path of the file to read.

    Returns:
        The decoded file content.
    """
    with open(path_to_file, encoding='utf-8') as handle:
        return handle.read()
| 
 | ||||
def re_split(data):
    """Split text into lowercase words.

    Every run of non-word characters and underscores is replaced by a single
    space, the result is lowercased, and the text is split on whitespace.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``data`` has no word characters.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    pattern = re.compile(r'[\W_]+')
    # Substitute first, then lowercase, to keep the original tokenization.
    data = pattern.sub(' ', data).lower()
    return data.split()
| 
 | ||||
def get_stopwords(path_to_file=stopwordfilepath):
    """Load the stop words from a comma-separated text file.

    Args:
        path_to_file: Path of the stop-word file; defaults to the module-level
            ``stopwordfilepath``.

    Returns:
        A list with the stop words from the file followed by every single
        lowercase ASCII letter.
    """
    with open(path_to_file, encoding='utf-8') as f:
        raw_words = f.read().split(',')
    # Strip surrounding whitespace so a trailing newline in the file does not
    # stay glued to the last word (which would keep it from ever matching),
    # and drop entries that end up empty.
    data = [word.strip() for word in raw_words]
    data = [word for word in data if word]
    data.extend(string.ascii_lowercase)
    return data
| 
 | ||||
def extract_file_words(path_to_file):
    """Extract the meaningful words from a text file.

    The file is read with ``read_file`` and tokenized with ``re_split``; words
    that are stop words or shorter than three characters are dropped.

    Args:
        path_to_file: Path of the text file to process.

    Returns:
        A list of the remaining lowercase words, in order of appearance.
    """
    word_list = re_split(read_file(path_to_file))
    # A set gives constant-time membership tests; the stop-word list would be
    # scanned linearly for every word otherwise.
    stop_words = set(get_stopwords())
    return [w for w in word_list if w not in stop_words and len(w) >= 3]
| 
 | ||||
def extract_str_words(data_str):
    """Extract the meaningful words from a string.

    The string is tokenized with ``re_split``; words that are stop words or
    shorter than three characters are dropped.

    Args:
        data_str: The text to process.

    Returns:
        A list of the remaining lowercase words, in order of appearance.
    """
    word_list = re_split(data_str)
    # A set gives constant-time membership tests; the stop-word list would be
    # scanned linearly for every word otherwise.
    stop_words = set(get_stopwords())
    return [w for w in word_list if w not in stop_words and len(w) >= 3]
| 
 | ||||
def count_word(word, word_freqs, stopwords):
    """Increment the count of ``word`` in ``word_freqs`` unless it is a stop word.

    Args:
        word: The word to count.
        word_freqs: Mapping from word to count; updated in place.
        stopwords: Collection of words that must not be counted.
    """
    if word in stopwords:
        return
    previous = word_freqs.get(word, 0)
    word_freqs[word] = previous + 1
| 
 | ||||
def get_frequencies(word_list):
    """Count how often each word occurs.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        A plain dict mapping each distinct word to its number of occurrences,
        with keys in order of first appearance.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Convert back to a plain dict so callers still get a KeyError (not 0)
    # for words that were never seen.
    return dict(Counter(word_list))
| 
 | ||||
def sort_dict(word_freq):
    """Return the (word, count) pairs of a mapping ordered by count, highest first.

    Args:
        word_freq: Mapping from word to count.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count; pairs
        with equal counts keep their original relative order.
    """
    by_count = operator.itemgetter(1)
    pairs = list(word_freq.items())
    pairs.sort(key=by_count, reverse=True)
    return pairs
| 
 | ||||
def print_word_freqs(word_freqs, n=10):
    """Print the first ``n`` (word, count) pairs, one per line, as ``word - count``.

    Args:
        word_freqs: Sliceable sequence of two-item ``(word, count)`` entries.
        n: Number of leading entries to print; defaults to 10.
    """
    leading = word_freqs[:n]
    for entry in leading:
        (word, count) = entry
        print(word, '-', count)
| 
 | ||||
| 
 | ||||
def test():
    """Print a fixed welcome message."""
    greeting = 'cppy welcome'
    print(greeting)
											
												
													File diff suppressed because it is too large
													Load Diff
												
											
										
									
								
											
												
													File diff suppressed because it is too large
													Load Diff
												
											
										
									
								| @ -0,0 +1 @@ | ||||
| a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your | ||||
| @ -0,0 +1,2 @@ | ||||
| "Some acquaintance or other, my dear, I suppose; I am sure I do not | ||||
| know." | ||||
					Loading…
					
					
				
		Reference in new issue