|  |  |  | @ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename) | 
			
		
	
		
			
				
					|  |  |  |  | #  项目函数 | 
			
		
	
		
			
				
					|  |  |  |  | ################################################################################ | 
			
		
	
		
			
				
					|  |  |  |  | def read_file(path_to_file): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     读取指定文件的内容。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         path_to_file (str): 文件路径。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         str: 文件内容。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     with open(path_to_file, encoding='utf-8') as f: | 
			
		
	
		
			
				
					|  |  |  |  |         data = f.read() | 
			
		
	
		
			
				
					|  |  |  |  |     return data | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def re_split(data): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     使用正则表达式分割字符串,将非字母字符替换为空格,并将所有字符转换为小写。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         data (str): 输入字符串。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 分割后的单词列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     pattern = re.compile('[\W_]+') | 
			
		
	
		
			
				
					|  |  |  |  |     data = pattern.sub(' ', data).lower() | 
			
		
	
		
			
				
					|  |  |  |  |     return data.split() | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def get_stopwords(path_to_file=stopwordfilepath): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     获取停用词列表。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         path_to_file (str): 停用词文件路径,默认为 stopwordfilepath。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 停用词列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     with open(path_to_file, encoding='utf-8') as f: | 
			
		
	
		
			
				
					|  |  |  |  |         data = f.read().split(',') | 
			
		
	
		
			
				
					|  |  |  |  |     data.extend(list(string.ascii_lowercase)) | 
			
		
	
	
		
			
				
					|  |  |  | @ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath): | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def get_chunks(file_path=testfilepath, chunk_size=1000): | 
			
		
	
		
			
				
					|  |  |  |  |     # 读取文件内容,分割文件内容为多个块,每个块由一个进程处理 | 
			
		
	
		
			
				
					|  |  |  |  |     # 可以根据实际情况调整块大小 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     将文件内容分割成多个块。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         file_path (str): 文件路径,默认为 testfilepath。 | 
			
		
	
		
			
				
					|  |  |  |  |         chunk_size (int): 每个块的大小,默认为 1000。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 分割后的块列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     content = re_split(read_file(file_path)) | 
			
		
	
		
			
				
					|  |  |  |  |     chunks = [ | 
			
		
	
		
			
				
					|  |  |  |  |         content[i:i + chunk_size] for i in range(0, len(content), chunk_size) | 
			
		
	
	
		
			
				
					|  |  |  | @ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000): | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def extract_file_words(path_to_file): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     提取文件中的单词,去除停用词和长度小于3的单词。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         path_to_file (str): 文件路径。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 提取后的单词列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     word_list = re_split(read_file(path_to_file)) | 
			
		
	
		
			
				
					|  |  |  |  |     stop_words = get_stopwords() | 
			
		
	
		
			
				
					|  |  |  |  |     return [w for w in word_list if (not w in stop_words) and len(w) >= 3] | 
			
		
	
		
			
				
					|  |  |  |  |     return [w for w in word_list if (w not in stop_words) and len(w) >= 3] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def extract_str_words(data_str): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     提取字符串中的单词,去除停用词和长度小于3的单词。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         data_str (str): 输入字符串。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 提取后的单词列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     word_list = re_split(data_str) | 
			
		
	
		
			
				
					|  |  |  |  |     stop_words = get_stopwords() | 
			
		
	
		
			
				
					|  |  |  |  |     return [w for w in word_list if (not w in stop_words) and len(w) >= 3] | 
			
		
	
		
			
				
					|  |  |  |  |     return [w for w in word_list if (w not in stop_words) and len(w) >= 3] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def count_word(word, word_freqs, stopwords): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     统计单词频率。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         word (str): 单词。 | 
			
		
	
		
			
				
					|  |  |  |  |         word_freqs (dict): 单词频率字典。 | 
			
		
	
		
			
				
					|  |  |  |  |         stopwords (list): 停用词列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     if word not in stopwords: | 
			
		
	
		
			
				
					|  |  |  |  |         word_freqs[word] = word_freqs.get(word, 0) + 1 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def get_frequencies(word_list): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     获取单词频率。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         word_list (list): 单词列表。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         dict: 单词频率字典。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     word_freqs = {} | 
			
		
	
		
			
				
					|  |  |  |  |     for word in word_list: | 
			
		
	
		
			
				
					|  |  |  |  |         word_freqs[word] = word_freqs.get(word, 0) + 1 | 
			
		
	
	
		
			
				
					|  |  |  | @ -76,11 +146,26 @@ def get_frequencies(word_list): | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def sort_dict(word_freq): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     对字典进行排序。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         word_freq (dict): 单词频率字典。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Returns: | 
			
		
	
		
			
				
					|  |  |  |  |         list: 排序后的单词频率列表。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True) | 
			
		
	
		
			
				
					|  |  |  |  |     # return sorted( word_freq, key=lambda x: x[1], reverse=True ) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def print_word_freqs(word_freqs, n=10): | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     打印单词频率。 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     Args: | 
			
		
	
		
			
				
					|  |  |  |  |         word_freqs (list): 单词频率列表。 | 
			
		
	
		
			
				
					|  |  |  |  |         n (int): 打印的单词数量,默认为 10。 | 
			
		
	
		
			
				
					|  |  |  |  |     """ | 
			
		
	
		
			
				
					|  |  |  |  |     for (w, c) in word_freqs[:n]: | 
			
		
	
		
			
				
					|  |  |  |  |         print(w, '-', c) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
	
		
			
				
					|  |  |  | 
 |