You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

108 lines
3.1 KiB

9 months ago
import functools
import operator
import os
import re
import site
import string
import time
from collections import Counter
9 months ago
################################################################################
# Variables
################################################################################
# Name of the sample text file looked up under ``cppy/data``.
# Earlier candidates that were immediately overwritten:
# 'test.txt', 'pride-and-prejudice.txt'.
testfilename = 'Prey.txt'
db_filename = "tf.db"

# Pick the site-packages style directory under which the data paths are
# built.  When several entries match, the last one wins.
site_packages = site.getsitepackages()
basePath = None
for package in site_packages:
    if 'package' in package:
        basePath = package
if basePath is None:
    # Fail with an explicit message instead of a NameError on ``basePath``.
    raise RuntimeError(
        "no site-packages directory found in %r" % (site_packages,)
    )

stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
9 months ago
################################################################################
# Project functions
9 months ago
################################################################################
def read_file(path_to_file):
    """Return the whole content of ``path_to_file`` decoded as UTF-8 text."""
    with open(path_to_file, encoding='utf-8') as handle:
        return handle.read()
def re_split(data):
    """Split text into lowercase word tokens.

    Every run of non-word characters and/or underscores is replaced by a
    single space, the text is lowercased, and the result is split on
    whitespace.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``data`` has no word
        characters.
    """
    # Raw string: ``\W`` in a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    pattern = re.compile(r'[\W_]+')
    return pattern.sub(' ', data).lower().split()
def get_stopwords(path_to_file=stopwordfilepath):
    """Load stop words from a comma-separated UTF-8 file.

    Args:
        path_to_file: Path of the stop-word file; defaults to the
            module-level ``stopwordfilepath``.

    Returns:
        A list with the stop words read from the file followed by every
        single lowercase ASCII letter.
    """
    with open(path_to_file, encoding='utf-8') as f:
        raw_words = f.read().split(',')
    # Strip surrounding whitespace (for example a newline at the end of the
    # file) and drop empty entries; otherwise such a word could never equal
    # a token produced by re_split().
    stop_words = [word.strip() for word in raw_words if word.strip()]
    stop_words.extend(string.ascii_lowercase)
    return stop_words
def get_chunks(file_path=testfilepath, chunk_size=1000):
    """Tokenize a file and cut the token list into consecutive chunks.

    The file content is read and tokenized with re_split(), then divided
    into slices of at most ``chunk_size`` tokens.  According to the original
    note, each chunk is meant to be processed by one process, and the chunk
    size can be tuned as needed.

    Args:
        file_path: File to read; defaults to the module-level
            ``testfilepath``.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of token lists.
    """
    words = re_split(read_file(file_path))
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(words[start:start + chunk_size])
    return chunks
9 months ago
def extract_file_words(path_to_file):
    """Return the meaningful words of a text file, in order of appearance.

    The file is read and tokenized with re_split(); tokens that are stop
    words or shorter than three characters are discarded.

    Args:
        path_to_file: Path of the UTF-8 text file to process.

    Returns:
        A list of the remaining tokens (duplicates preserved).
    """
    word_list = re_split(read_file(path_to_file))
    # A set gives constant-time membership tests; the list returned by
    # get_stopwords() would otherwise be rescanned for every token.
    stop_words = set(get_stopwords())
    return [w for w in word_list if len(w) >= 3 and w not in stop_words]
9 months ago
def extract_str_words(data_str):
    """Return the meaningful words of a string, in order of appearance.

    The string is tokenized with re_split(); tokens that are stop words or
    shorter than three characters are discarded.

    Args:
        data_str: The text to process.

    Returns:
        A list of the remaining tokens (duplicates preserved).
    """
    word_list = re_split(data_str)
    # A set gives constant-time membership tests; the list returned by
    # get_stopwords() would otherwise be rescanned for every token.
    stop_words = set(get_stopwords())
    return [w for w in word_list if len(w) >= 3 and w not in stop_words]
9 months ago
def count_word(word, word_freqs, stopwords):
    """Increment the tally of ``word`` in ``word_freqs`` unless it is a stop word.

    ``word_freqs`` is updated in place; nothing is returned.
    """
    if word in stopwords:
        return
    word_freqs[word] = word_freqs.get(word, 0) + 1
def get_frequencies(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: An iterable of hashable words.

    Returns:
        A plain dict mapping each word to its number of occurrences, keyed
        in order of first appearance.
    """
    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(word_list))
def sort_dict(word_freq):
    """Return the (word, count) pairs of ``word_freq`` ordered by count, highest first."""
    pairs = list(word_freq.items())
    pairs.sort(key=operator.itemgetter(1), reverse=True)
    return pairs
def print_word_freqs(word_freqs, n=10):
    """Print the first ``n`` (word, count) pairs, one per line as ``word - count``."""
    for entry in word_freqs[:n]:
        word, count = entry
        print(word, '-', count)
9 months ago
################################################################################
# General utilities
################################################################################
def timing_decorator(func):
    """Decorate ``func`` so that every call prints its elapsed time.

    The wrapper forwards all arguments to ``func``, prints the function
    name and the elapsed time in milliseconds (two decimals), and returns
    the result of ``func`` unchanged.

    Args:
        func: The callable to time.

    Returns:
        The wrapping callable.
    """
    # wraps() keeps the name and docstring of ``func`` on the wrapper.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is the clock intended for measuring short intervals.
        start_time = time.perf_counter()  # record the start time
        result = func(*args, **kwargs)  # call the original function
        end_time = time.perf_counter()  # record the end time
        run_time = end_time - start_time  # elapsed time in seconds
        print(f"{func.__name__} 运行时间: {run_time*1000:.2f}")
        return result
    return wrapper
def test():
    """Print a fixed welcome message."""
    greeting = 'cppy welcome'
    print(greeting)