Compare commits
No commits in common. '3d0220d49ba7b6c6b14c77eecdbe26efc69809be' and 'ceb9955051ab69453197709b767b4d986cc6c410' have entirely different histories.
3d0220d49b
...
ceb9955051
@ -1,4 +0,0 @@
|
|||||||
log.txt
|
|
||||||
/test
|
|
||||||
/.venv
|
|
||||||
__pycache__
|
|
Binary file not shown.
@ -0,0 +1,93 @@
|
|||||||
|
|
||||||
|
import site
|
||||||
|
import os,re,time
|
||||||
|
import string,operator
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Variables
|
||||||
|
################################################################################
|
||||||
|
# Name of the sample text to analyse. Earlier lines also assigned 'test.txt'
# and 'pride-and-prejudice.txt', but only this last assignment took effect.
testfilename = 'Prey.txt'

db_filename = "tf.db"

# Pick the site-packages style directory used as the root for the cppy data
# files. When several entries contain 'package', the last one wins.
site_packages = site.getsitepackages()
basePath = None
for package in site_packages:
    if 'package' in package:
        basePath = package
if basePath is None:
    # Without this guard the joins below would fail with an opaque NameError.
    raise RuntimeError(
        "no entry of site.getsitepackages() contains 'package': "
        f"{site_packages!r}"
    )

stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Project functions
|
||||||
|
################################################################################
|
||||||
|
def read_file(path_to_file):
    """Return the whole content of a UTF-8 encoded text file as one string.

    Args:
        path_to_file: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    with open(path_to_file, encoding='utf-8') as handle:
        return handle.read()
|
||||||
|
|
||||||
|
# Runs of non-word characters and underscores separate words. Compiled once
# at import time and written as a raw string so ``\W`` is not treated as an
# (invalid) string escape.
_WORD_SEPARATOR_RE = re.compile(r'[\W_]+')


def re_split(data):
    """Split text into lower-case word tokens.

    Every run of non-word characters or underscores is replaced by a single
    space, the text is lower-cased, and the result is split on whitespace.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lower-case tokens; empty when ``data`` has no word
        characters.
    """
    return _WORD_SEPARATOR_RE.sub(' ', data).lower().split()
|
||||||
|
|
||||||
|
def get_stopwords(path_to_file=stopwordfilepath):
    """Load the stop-word list and append every ASCII lowercase letter.

    Args:
        path_to_file: Path of a UTF-8 text file whose content is split on
            commas. Defaults to the module-level ``stopwordfilepath``.

    Returns:
        A list with the comma-separated entries of the file followed by the
        26 single letters 'a' through 'z'.
    """
    with open(path_to_file, encoding='utf-8') as handle:
        entries = handle.read().split(',')
    return entries + list(string.ascii_lowercase)
|
||||||
|
|
||||||
|
def get_chunks(file_path=testfilepath, chunk_size=1000):
    """Tokenize a file and cut the token list into consecutive chunks.

    Per the original note, each chunk is meant to be handled by one process,
    and the chunk size can be tuned to the situation.

    Args:
        file_path: Path of the text file to read. Defaults to the
            module-level ``testfilepath``.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of token lists; every chunk except possibly the last holds
        exactly ``chunk_size`` tokens.
    """
    tokens = re_split(read_file(file_path))
    total = len(tokens)
    return [
        tokens[start:start + chunk_size]
        for start in range(0, total, chunk_size)
    ]
|
||||||
|
|
||||||
|
def extract_file_words(path_to_file):
    """Return the significant words of a text file.

    The file is read as UTF-8 and its text is filtered by
    ``extract_str_words``, so file input and string input share a single
    definition of a significant word instead of two copies of the same
    filter.

    Args:
        path_to_file: Path of the text file to analyse.

    Returns:
        The list of words kept by ``extract_str_words``, in text order.
    """
    return extract_str_words(read_file(path_to_file))
|
||||||
|
|
||||||
|
def extract_str_words(data_str):
    """Return the significant words of a string.

    The text is tokenized with ``re_split``; tokens shorter than three
    characters or present in the stop-word list are dropped.

    Args:
        data_str: The text to analyse.

    Returns:
        The remaining lower-case tokens, in their original order and with
        duplicates preserved.
    """
    word_list = re_split(data_str)
    # get_stopwords() returns a list; a set makes each membership test O(1)
    # instead of rescanning the whole list for every token.
    stop_words = set(get_stopwords())
    return [w for w in word_list if w not in stop_words and len(w) >= 3]
|
||||||
|
|
||||||
|
def count_word(word, word_freqs, stopwords):
    """Add one occurrence of ``word`` to ``word_freqs`` unless it is a stop word.

    Args:
        word: The word to count.
        word_freqs: Mapping of word to count; updated in place.
        stopwords: Container of words that must not be counted.

    Returns:
        None.
    """
    if word in stopwords:
        return
    seen_so_far = word_freqs.get(word, 0)
    word_freqs[word] = seen_so_far + 1
|
||||||
|
|
||||||
|
def get_frequencies(word_list):
    """Count how often each word occurs.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences, with keys in order of first appearance.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # Counter does the tallying; dict() keeps the original return type.
    return dict(Counter(word_list))
|
||||||
|
|
||||||
|
def sort_dict(word_freq):
    """Return the items of a mapping ordered by value, largest first.

    Args:
        word_freq: Mapping whose values can be compared with each other.

    Returns:
        A list of ``(key, value)`` tuples in descending value order; items
        with equal values keep the mapping's iteration order.
    """
    pairs = list(word_freq.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
|
||||||
|
|
||||||
|
def print_word_freqs(word_freqs, n=10):
    """Print the leading entries of a frequency list, one per line.

    Each line has the form ``word - count``.

    Args:
        word_freqs: Sliceable sequence of two-item ``(word, count)`` pairs.
        n: Slice bound; entries ``word_freqs[:n]`` are printed.
    """
    leading_entries = word_freqs[:n]
    for word, count in leading_entries:
        print(word, '-', count)
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# General utilities
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
def timing_decorator(func):
    """Wrap ``func`` so that every successful call prints its elapsed time.

    After ``func`` returns, the wrapper prints the function name and the
    elapsed time in milliseconds with two decimals, then hands back the
    result of ``func`` unchanged. Nothing is printed if ``func`` raises.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that accepts the same arguments as ``func`` and keeps its
        name and docstring.
    """
    # Function-scope import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution, so the interval is
        # not distorted by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        run_time = time.perf_counter() - start_time
        # The value is scaled to milliseconds, so the label is 毫秒 (ms);
        # the previous label 秒 (s) contradicted the number shown.
        print(f"{func.__name__} 运行时间: {run_time * 1000:.2f} 毫秒")
        return result

    return wrapper
|
||||||
|
|
||||||
|
def test():
    """Print the cppy welcome message."""
    greeting = 'cppy welcome'
    print(greeting)
|
Loading…
Reference in new issue