CodePattern/A 代码模式/cppy/cp_util.py

import functools
import operator
import os
import re
import site
import string
import time

################################################################################
#  变量
################################################################################
# File name of the default sample text. The earlier assignments
# ('test.txt', 'pride-and-prejudice.txt') were dead stores that were
# immediately overwritten, so only the effective value is kept.
testfilename = 'Prey.txt'

db_filename = "tf.db"

# Resolve the directory that contains the ``cppy`` package.
# ``basePath`` is given a fallback first so that it is always bound: before,
# importing this module raised NameError whenever no site-packages entry
# contained the substring 'package'. The fallback is the parent of this
# module's own directory (assumes this module lives directly inside the
# ``cppy`` package directory -- TODO confirm).
basePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
site_packages = site.getsitepackages()
for package in site_packages:
    # Any matching entry overrides the fallback; the last match wins,
    # exactly as in the original loop.
    if 'package' in package:
        basePath = package
stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)


################################################################################
#  项目函数
################################################################################
def read_file(path_to_file):
    """Return the entire content of a UTF-8 text file as one string.

    Args:
        path_to_file: Path of the file to read.

    Returns:
        The decoded file content.
    """
    with open(path_to_file, encoding='utf-8') as source:
        return source.read()


# Compiled once at import time instead of on every call. The raw string
# keeps ``\W`` from being parsed as an (invalid) string escape sequence.
_WORD_SEPARATOR_RE = re.compile(r'[\W_]+')


def re_split(data):
    """Split text into lowercase word tokens.

    Every run of non-word characters and underscores is replaced by a single
    space, the text is lowercased, and the result is split on whitespace.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``data`` contains no word
        characters.
    """
    return _WORD_SEPARATOR_RE.sub(' ', data).lower().split()


def get_stopwords(path_to_file=stopwordfilepath):
    """Load the stop-word list from a comma-separated UTF-8 text file.

    Args:
        path_to_file: Path of the stop-word file; defaults to the
            module-level ``stopwordfilepath``.

    Returns:
        A list holding the comma-separated entries of the file followed by
        every single lowercase ASCII letter ('a' through 'z').
    """
    with open(path_to_file, encoding='utf-8') as source:
        raw_text = source.read()
    return raw_text.split(',') + list(string.ascii_lowercase)


def get_chunks(file_path=testfilepath, chunk_size=1000):
    """Read a file, tokenize it, and cut the token list into chunks.

    Per the original author's note, each chunk is meant to be handled by one
    process, and the chunk size can be tuned to the workload.

    Args:
        file_path: Path of the text file; defaults to the module-level
            ``testfilepath``.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of token lists; every chunk except possibly the last holds
        exactly ``chunk_size`` tokens.
    """
    words = re_split(read_file(file_path))
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(words[start:start + chunk_size])
    return chunks


def extract_file_words(path_to_file):
    """Return the significant words contained in a text file.

    The file is read as UTF-8 and handed to ``extract_str_words``, which
    tokenizes it and drops stop words and tokens shorter than three
    characters. Delegating keeps the filtering rule in a single place
    instead of duplicating it here.

    Args:
        path_to_file: Path of the text file to process.

    Returns:
        A list of lowercase words in their order of appearance.
    """
    return extract_str_words(read_file(path_to_file))


def extract_str_words(data_str):
    """Return the significant words contained in a string.

    The text is tokenized with ``re_split``; stop words (as returned by
    ``get_stopwords``) and tokens shorter than three characters are removed.

    Args:
        data_str: The text to process.

    Returns:
        A list of lowercase words in their order of appearance.
    """
    word_list = re_split(data_str)
    # Build a set once so each membership test is O(1) instead of scanning
    # the whole stop-word list for every token.
    stop_words = set(get_stopwords())
    return [w for w in word_list if len(w) >= 3 and w not in stop_words]


def count_word(word, word_freqs, stopwords):
    """Increment the count of ``word`` in place unless it is a stop word.

    Args:
        word: The word to count.
        word_freqs: Mapping from word to occurrence count; updated in place.
        stopwords: Container of words that must not be counted.
    """
    if word in stopwords:
        return
    previous = word_freqs.get(word, 0)
    word_freqs[word] = previous + 1


def get_frequencies(word_list):
    """Count how many times each word occurs.

    Args:
        word_list: Iterable of words.

    Returns:
        A dict mapping each distinct word to its number of occurrences,
        keyed in order of first appearance.
    """
    tally = {}
    for token in word_list:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally


def sort_dict(word_freq):
    """Return the (word, count) pairs of a mapping, highest count first.

    Args:
        word_freq: Mapping from word to count.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order; pairs with equal counts keep the mapping's iteration order.
    """
    pairs = list(word_freq.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


def print_word_freqs(word_freqs, n=10):
    """Print the first ``n`` (word, count) pairs, one per line.

    Each line has the form ``word - count``.

    Args:
        word_freqs: Sliceable sequence of ``(word, count)`` pairs.
        n: Number of leading pairs to print.
    """
    leading_pairs = word_freqs[:n]
    for word, count in leading_pairs:
        print(word, '-', count)


################################################################################
#  通用工具
################################################################################


def timing_decorator(func):
    """Decorator that prints how long each call to ``func`` takes.

    After every call that returns normally, one line of the form
    ``<function name> 运行时间: <elapsed> 毫秒`` is printed, where the elapsed
    time is given in milliseconds with two decimals. If ``func`` raises, the
    exception propagates and nothing is printed.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump when the system clock
        # is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        # The value is in milliseconds, so it is labelled 毫秒
        # (milliseconds); it was previously mislabelled 秒 (seconds).
        print(f"{func.__name__} 运行时间: {elapsed_ms:.2f} 毫秒")
        return result

    return wrapper


def test():
    """Print the package's welcome message to standard output."""
    greeting = 'cppy welcome'
    print(greeting)
3 9 months ago			`import site`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`import os, re, time`
			`import string, operator`
3 9 months ago
			`################################################################################`
			`# 变量`
			`################################################################################`
			`testfilename = 'test.txt'`
			`testfilename = 'pride-and-prejudice.txt'`
			`testfilename = 'Prey.txt'`

refactor(code): 优化代码，提高可读性和效率 4 months ago			`db_filename = "tf.db"`
db 路径修正 9 months ago
3 9 months ago			`site_packages = site.getsitepackages()`
			`for package in site_packages:`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`if 'package' in package:`
3 9 months ago			`basePath = package`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')`
			`testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)`
3 9 months ago

			`################################################################################`
运行时间装饰器 9 months ago			`# 项目函数`
3 9 months ago			`################################################################################`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`def read_file(path_to_file):`
			`with open(path_to_file, encoding='utf-8') as f:`
3 9 months ago			`data = f.read()`
			`return data`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def re_split(data):`
3 9 months ago			`pattern = re.compile('[\W_]+')`
			`data = pattern.sub(' ', data).lower()`
			`return data.split()`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def get_stopwords(path_to_file=stopwordfilepath):`
			`with open(path_to_file, encoding='utf-8') as f:`
			`data = f.read().split(',')`
3 9 months ago			`data.extend(list(string.ascii_lowercase))`
			`return data`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def get_chunks(file_path=testfilepath, chunk_size=1000):`
大修6 9 months ago			`# 读取文件内容，分割文件内容为多个块，每个块由一个进程处理`
			`# 可以根据实际情况调整块大小`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`content = re_split(read_file(file_path))`
			`chunks = [`
			`content[i:i + chunk_size] for i in range(0, len(content), chunk_size)`
			`]`
大修6 9 months ago			`return chunks`

refactor(code): 优化代码，提高可读性和效率 4 months ago
3 9 months ago			`def extract_file_words(path_to_file):`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`word_list = re_split(read_file(path_to_file))`
3 9 months ago			`stop_words = get_stopwords()`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`return [w for w in word_list if (not w in stop_words) and len(w) >= 3]`

3 9 months ago
			`def extract_str_words(data_str):`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`word_list = re_split(data_str)`
3 9 months ago			`stop_words = get_stopwords()`
refactor(code): 优化代码，提高可读性和效率 4 months ago			`return [w for w in word_list if (not w in stop_words) and len(w) >= 3]`

3 9 months ago
			`def count_word(word, word_freqs, stopwords):`
			`if word not in stopwords:`
			`word_freqs[word] = word_freqs.get(word, 0) + 1`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def get_frequencies(word_list):`
			`word_freqs = {}`
			`for word in word_list:`
			`word_freqs[word] = word_freqs.get(word, 0) + 1`
3 9 months ago			`return word_freqs`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def sort_dict(word_freq):`
3 9 months ago			`return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)`
			`# return sorted( word_freq, key=lambda x: x[1], reverse=True )`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def print_word_freqs(word_freqs, n=10):`
			`for (w, c) in word_freqs[:n]:`
			`print(w, '-', c)`
3 9 months ago

运行时间装饰器 9 months ago			`################################################################################`
			`# 通用工具`
			`################################################################################`

refactor(code): 优化代码，提高可读性和效率 4 months ago
运行时间装饰器 9 months ago			`def timing_decorator(func):`
refactor(code): 优化代码，提高可读性和效率 4 months ago
运行时间装饰器 9 months ago			`def wrapper(*args, **kwargs):`
			`start_time = time.time() # 记录开始时间`
			`result = func(*args, **kwargs) # 调用原始函数`
			`end_time = time.time() # 记录结束时间`
			`run_time = end_time - start_time # 计算运行时间`
			`print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 秒")`
			`return result`
refactor(code): 优化代码，提高可读性和效率 4 months ago
运行时间装饰器 9 months ago			`return wrapper`

refactor(code): 优化代码，提高可读性和效率 4 months ago
			`def test():`
			`print('cppy welcome')`