forked from p46318075/CodePattern
				
			Merge pull request 'dev' (#17) from pcz4qfnkl/CodePattern:dev into dev
	
		
	
				
					
				
			
						commit
						3d0220d49b
					
				| @ -0,0 +1,4 @@ | |||||||
|  | log.txt | ||||||
|  | /test | ||||||
|  | /.venv | ||||||
|  | __pycache__ | ||||||
											
												Binary file not shown.
											
										
									
								| @ -1,93 +0,0 @@ | |||||||
| 
 |  | ||||||
| import site |  | ||||||
| import os,re,time |  | ||||||
| import string,operator |  | ||||||
| 
 |  | ||||||
| ################################################################################ |  | ||||||
| #  变量 |  | ||||||
| ################################################################################ |  | ||||||
# Name of the sample text file to analyse. Earlier candidate values were
# 'test.txt' and 'pride-and-prejudice.txt'; only the last assignment ever
# took effect, so it is the only one kept.
testfilename = 'Prey.txt'

db_filename = "tf.db"

# Pick the site directory whose path contains 'package' (e.g. site-packages
# or dist-packages). When several match, the last one wins.
site_packages = site.getsitepackages()
# Default to '' so the joins below yield paths relative to the current
# working directory instead of raising NameError when nothing matches.
basePath = ''
for package in site_packages:
    if 'package' in package:
        basePath = package
stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
| 
 |  | ||||||
| 
 |  | ||||||
| ################################################################################ |  | ||||||
| #  项目函数 |  | ||||||
| ################################################################################ |  | ||||||
def read_file(path_to_file):
    """Return the entire content of *path_to_file*, decoded as UTF-8."""
    with open(path_to_file, mode='r', encoding='utf-8') as handle:
        return handle.read()
| 
 |  | ||||||
# Matches one or more characters that are not letters or digits
# (underscore is treated as a separator too). Raw string so that "\W" is
# not parsed as an (invalid) string escape; compiled once at import time.
_NON_WORD_RE = re.compile(r'[\W_]+')


def re_split( data ):
    """Split *data* into lower-case words.

    Every run of non-alphanumeric characters (including underscores) is
    replaced by a single space, the text is lower-cased, and the result is
    split on whitespace.

    Returns:
        A list of words; empty when *data* contains no word characters.
    """
    return _NON_WORD_RE.sub(' ', data).lower().split()
| 
 |  | ||||||
def get_stopwords( path_to_file = stopwordfilepath ):
    """Load the stop-word list from a comma-separated file.

    The file is read as UTF-8 and split on commas; the 26 lower-case ASCII
    letters are then appended so single letters count as stop words too.

    Returns:
        A list of stop-word strings.
    """
    with open(path_to_file, encoding='utf-8') as stop_file:
        raw_text = stop_file.read()
    stop_words = raw_text.split(',')
    stop_words += list(string.ascii_lowercase)
    return stop_words
| 
 |  | ||||||
def get_chunks( file_path = testfilepath, chunk_size = 1000):
    """Read a file and split its words into consecutive chunks.

    The file content is tokenised with ``re_split`` and cut into lists of at
    most *chunk_size* words; the chunk size can be adjusted as needed.
    The original note says each chunk is meant to be handled by one process.

    Returns:
        A list of word lists; the last one may be shorter than *chunk_size*.
    """
    words = re_split(read_file(file_path))
    pieces = []
    for start in range(0, len(words), chunk_size):
        pieces.append(words[start:start + chunk_size])
    return pieces
| 
 |  | ||||||
def extract_file_words(path_to_file):
    """Return the significant words of the file at *path_to_file*.

    The file is read and tokenised with ``re_split``; words that are stop
    words or shorter than three characters are dropped. Order and duplicates
    of the remaining words are preserved.
    """
    word_list = re_split( read_file(path_to_file) )
    # Build a set once so each membership test is O(1) instead of scanning
    # the whole stop-word list for every token.
    stop_words = set(get_stopwords())
    return [ w for w in word_list if len(w) >= 3 and w not in stop_words ]
| 
 |  | ||||||
def extract_str_words(data_str):
    """Return the significant words contained in the string *data_str*.

    The string is tokenised with ``re_split``; words that are stop words or
    shorter than three characters are dropped. Order and duplicates of the
    remaining words are preserved.
    """
    word_list = re_split( data_str )
    # Build a set once so each membership test is O(1) instead of scanning
    # the whole stop-word list for every token.
    stop_words = set(get_stopwords())
    return [ w for w in word_list if len(w) >= 3 and w not in stop_words ]
| 
 |  | ||||||
def count_word(word, word_freqs, stopwords):
    """Increment the count of *word* in *word_freqs* unless it is a stop word.

    *word_freqs* is updated in place; nothing is returned.
    """
    if word in stopwords:
        return
    previous = word_freqs.get(word, 0)
    word_freqs[word] = previous + 1
| 
 |  | ||||||
def get_frequencies(word_list):
    """Count how often each word occurs in *word_list*.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}
    for token in word_list:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts
| 
 |  | ||||||
def sort_dict (word_freq):
    """Return the (word, count) pairs of *word_freq*, highest count first.

    Pairs with equal counts keep the relative order they have in the dict.
    """
    pairs = list(word_freq.items())
    pairs.sort(key=operator.itemgetter(1), reverse=True)
    return pairs
| 
 |  | ||||||
def print_word_freqs( word_freqs, n = 10):
    """Print the first *n* (word, count) pairs, one ``word - count`` per line.

    *word_freqs* must be a sliceable sequence of 2-item pairs, such as the
    list returned by ``sorted(d.items())``.
    """
    leading_pairs = word_freqs[ :n ]
    for pair in leading_pairs:
        word, count = pair
        print( word, '-', count )
| 
 |  | ||||||
| 
 |  | ||||||
| ################################################################################ |  | ||||||
| #  通用工具 |  | ||||||
| ################################################################################ |  | ||||||
| 
 |  | ||||||
def timing_decorator(func):
    """Decorate *func* so every call prints its run time in milliseconds.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and prints one line of the form
    ``<name> 运行时间: <elapsed> 毫秒`` after the call returns. Nothing is
    printed if *func* raises.
    """
    # Local import so this block does not depend on a file-level import.
    import functools

    @functools.wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and higher resolution than time.time().
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        run_time = time.perf_counter() - start_time
        # The value is scaled by 1000, i.e. milliseconds, so it is labelled
        # "毫秒" (milliseconds) rather than "秒" (seconds).
        print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 毫秒")
        return result
    return wrapper
| 
 |  | ||||||
def test():
    """Print the package greeting to show the module can be imported and called."""
    greeting = 'cppy welcome'
    print( greeting )
					Loading…
					
					
				
		Reference in new issue