|
|
|
@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
|
|
|
|
|
# Project functions
|
|
|
|
|
################################################################################
|
|
|
|
|
def read_file(path_to_file):
    """
    Return the full text content of a file.

    Args:
        path_to_file (str): Path of the file to read.

    Returns:
        str: Everything in the file, decoded as UTF-8.
    """
    with open(path_to_file, encoding='utf-8') as handle:
        return handle.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def re_split(data):
    """
    Split a string into lowercase word tokens.

    Every run of non-word characters and underscores is replaced by a single
    space, the result is lower-cased, and the text is split on whitespace.
    Because the pattern is built on ``\\W``, digits (and any other characters
    the regex engine treats as word characters) are kept inside tokens.

    Args:
        data (str): Input text.

    Returns:
        list: The tokens in their original order; empty if ``data`` contains
        no word characters.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    normalized = re.sub(r'[\W_]+', ' ', data).lower()
    return normalized.split()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_stopwords(path_to_file=stopwordfilepath):
|
|
|
|
|
"""
|
|
|
|
|
获取停用词列表。
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
path_to_file (str): 停用词文件路径,默认为 stopwordfilepath。
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
list: 停用词列表。
|
|
|
|
|
"""
|
|
|
|
|
with open(path_to_file, encoding='utf-8') as f:
|
|
|
|
|
data = f.read().split(',')
|
|
|
|
|
data.extend(list(string.ascii_lowercase))
|
|
|
|
@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_chunks(file_path=testfilepath, chunk_size=1000):
|
|
|
|
|
# 读取文件内容,分割文件内容为多个块,每个块由一个进程处理
|
|
|
|
|
# 可以根据实际情况调整块大小
|
|
|
|
|
"""
|
|
|
|
|
将文件内容分割成多个块。
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
file_path (str): 文件路径,默认为 testfilepath。
|
|
|
|
|
chunk_size (int): 每个块的大小,默认为 1000。
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
list: 分割后的块列表。
|
|
|
|
|
"""
|
|
|
|
|
content = re_split(read_file(file_path))
|
|
|
|
|
chunks = [
|
|
|
|
|
content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
|
|
|
|
@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_file_words(path_to_file):
    """
    Extract the words of a file, dropping stop words and short words.

    The file is read with ``read_file`` and the text is then filtered by
    ``extract_str_words``, so file input and string input share one filter:
    words found in the stop-word collection or shorter than three characters
    are removed.

    Args:
        path_to_file (str): Path of the file to process.

    Returns:
        list: The remaining words, in the order they occur in the file.
    """
    # Delegate to the string variant instead of repeating the filter; this
    # also removes the unreachable second ``return`` the function used to have.
    return extract_str_words(read_file(path_to_file))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_str_words(data_str):
    """
    Extract the words of a string, dropping stop words and short words.

    The text is tokenized with ``re_split``; a token is kept only when it is
    at least three characters long and is not in the collection returned by
    ``get_stopwords()``.

    Args:
        data_str (str): Input text.

    Returns:
        list: The remaining words, in their original order.
    """
    # Build a set once so each membership test is a hash lookup rather than a
    # scan of the whole stop-word collection for every token.
    stop_words = set(get_stopwords())
    return [
        w for w in re_split(data_str)
        if len(w) >= 3 and w not in stop_words
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_word(word, word_freqs, stopwords):
    """
    Record one occurrence of a word unless it is a stop word.

    Args:
        word (str): The word to count.
        word_freqs (dict): Mapping of word to occurrence count; updated in place.
        stopwords (list): Words that must not be counted.
    """
    if word in stopwords:
        return
    seen_so_far = word_freqs.get(word, 0)
    word_freqs[word] = seen_so_far + 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_frequencies(word_list):
|
|
|
|
|
"""
|
|
|
|
|
获取单词频率。
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
word_list (list): 单词列表。
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
dict: 单词频率字典。
|
|
|
|
|
"""
|
|
|
|
|
word_freqs = {}
|
|
|
|
|
for word in word_list:
|
|
|
|
|
word_freqs[word] = word_freqs.get(word, 0) + 1
|
|
|
|
@ -76,11 +146,26 @@ def get_frequencies(word_list):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sort_dict(word_freq):
    """
    Order the entries of a frequency dictionary from highest to lowest count.

    Args:
        word_freq (dict): Mapping of word to occurrence count.

    Returns:
        list: ``(word, count)`` tuples sorted by count in descending order.
    """
    ranked = list(word_freq.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_word_freqs(word_freqs, n=10):
    """
    Print the leading entries of a word-frequency list, one per line.

    Each line has the form ``word - count``.

    Args:
        word_freqs (list): Sequence of ``(word, count)`` pairs.
        n (int): Number of leading entries to print; defaults to 10.
    """
    leading = word_freqs[:n]
    for entry in leading:
        term, occurrences = entry
        print(term, '-', occurrences)
|
|
|
|
|
|
|
|
|
|