diff --git a/.gitignore b/.gitignore index 638bc3f..61f4419 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ log.txt /test /.venv -*/__pycache__ +__pycache__ diff --git a/A 代码模式/10 一盘大棋/1 最基础的写法.py b/A 代码模式/10 一盘大棋/1 最基础的写法.py index a0c4ab6..e70362d 100644 --- a/A 代码模式/10 一盘大棋/1 最基础的写法.py +++ b/A 代码模式/10 一盘大棋/1 最基础的写法.py @@ -7,10 +7,9 @@ with open(stopwordfilepath, encoding='utf-8') as f: for letter in 'abcdefghijklmnopqrstuvwxyz': stop_words.append(letter) - # 读文件,逐行扫描文本,发现词,确定不是停用词,计数 word_freqs = [] -for line in open( testfilepath, encoding='utf-8' ): +for line in open(testfilepath, encoding='utf-8'): start_char = None i = 0 for c in line: @@ -42,10 +41,9 @@ for line in open( testfilepath, encoding='utf-8' ): # 使用冒泡排序对词频进行排序 n = len(word_freqs) for i in range(n): - for j in range(0, n-i-1): - if word_freqs[j][1] < word_freqs[j+1][1]: - word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j] - + for j in range(0, n - i - 1): + if word_freqs[j][1] < word_freqs[j + 1][1]: + word_freqs[j], word_freqs[j + 1] = word_freqs[j + 1], word_freqs[j] # 打印频率最高的前10个词 for tf in word_freqs[:10]: diff --git a/A 代码模式/10 一盘大棋/2 加入语言特性.py b/A 代码模式/10 一盘大棋/2 加入语言特性.py index df26455..f48b2c4 100644 --- a/A 代码模式/10 一盘大棋/2 加入语言特性.py +++ b/A 代码模式/10 一盘大棋/2 加入语言特性.py @@ -1,4 +1,4 @@ -from cppy.cp_util import stopwordfilepath,testfilepath +from cppy.cp_util import stopwordfilepath, testfilepath import string from collections import Counter @@ -8,7 +8,7 @@ stop_words.update(list(string.ascii_lowercase)) # 读取文件并计算单词频率 word_freqs = Counter() -with open(testfilepath,encoding = 'utf8') as f: +with open(testfilepath, encoding='utf8') as f: for line_num, line in enumerate(f, 1): start_char = None for i, c in enumerate(line): @@ -23,10 +23,9 @@ with open(testfilepath,encoding = 'utf8') as f: # 打印前10个最常见的单词 for word, freq in word_freqs.most_common(10): print(f"{word}-{freq}") - ''' 相比 A01 使用collections.Counter来计数单词频率,从而简化了代码并提高了效率。 使用enumerate来获取行号和行内容,使用set来存储停用词,都有助于提高代码的性能和可读性。 使用most_common方法来获取最常见的单词,使输出更为简洁。 -''' \ No newline at end of file +''' diff --git a/A 代码模式/10 一盘大棋/3 Hacker.py b/A 代码模式/10 一盘大棋/3 Hacker.py index c70b4b5..2576218 100644 --- a/A 代码模式/10 一盘大棋/3 Hacker.py +++ b/A 代码模式/10 一盘大棋/3 Hacker.py @@ -1,12 +1,14 @@ -import re, collections -from cppy.cp_util import stopwordfilepath,testfilepath - -stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(',')) -words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower()) -counts = collections.Counter( w for w in words if w not in stopwords ) -for (w, c) in counts.most_common(10) : print(w, '-', c) +import re +import collections +from cppy.cp_util import stopwordfilepath, testfilepath +stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(',')) +words = re.findall('[a-z]{2,}', + open(testfilepath, encoding='utf8').read().lower()) +counts = collections.Counter(w for w in words if w not in stopwords) +for (w, c) in counts.most_common(10): + print(w, '-', c) ''' 熟练的软件工程师,会如此简单完成任务 后面的例子,我们必须变的啰嗦一些,不能用这种太 hacker 的写法 -''' \ No newline at end of file +''' diff --git a/A 代码模式/cppy_/cp_util.py b/A 代码模式/cppy/cp_util.py similarity index 60% rename from A 代码模式/cppy_/cp_util.py rename to A 代码模式/cppy/cp_util.py index 6554c2c..b85ec2a 100644 --- a/A 代码模式/cppy_/cp_util.py +++ b/A 代码模式/cppy/cp_util.py @@ -1,7 +1,6 @@ - import site -import os,re,time -import string,operator +import os, re, time +import string, operator ################################################################################ # 变量 @@ -10,76 +9,89 @@ testfilename = 'test.txt' testfilename = 'pride-and-prejudice.txt' testfilename = 'Prey.txt' -db_filename = "tf.db" +db_filename = "tf.db" site_packages = site.getsitepackages() for package in site_packages: - if 'package' in package: + if 'package' in package: basePath = package -stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt') -testfilepath = os.path.join(basePath, 'cppy','data',testfilename ) +stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt') +testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename) ################################################################################ # 项目函数 ################################################################################ -def read_file(path_to_file): - with open(path_to_file,encoding='utf-8') as f: +def read_file(path_to_file): + with open(path_to_file, encoding='utf-8') as f: data = f.read() return data -def re_split( data ): + +def re_split(data): pattern = re.compile('[\W_]+') data = pattern.sub(' ', data).lower() return data.split() -def get_stopwords( path_to_file = stopwordfilepath ): - with open(path_to_file,encoding='utf-8') as f: - data = f.read().split(',') + +def get_stopwords(path_to_file=stopwordfilepath): + with open(path_to_file, encoding='utf-8') as f: + data = f.read().split(',') data.extend(list(string.ascii_lowercase)) return data -def get_chunks( file_path = testfilepath, chunk_size = 1000): + +def get_chunks(file_path=testfilepath, chunk_size=1000): # 读取文件内容,分割文件内容为多个块,每个块由一个进程处理 # 可以根据实际情况调整块大小 - content = re_split(read_file(file_path)) - chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)] + content = re_split(read_file(file_path)) + chunks = [ + content[i:i + chunk_size] for i in range(0, len(content), chunk_size) + ] return chunks + def extract_file_words(path_to_file): - word_list = re_split( read_file(path_to_file) ) + word_list = re_split(read_file(path_to_file)) stop_words = get_stopwords() - return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ] + return [w for w in word_list if (not w in stop_words) and len(w) >= 3] + def extract_str_words(data_str): - word_list = re_split( data_str ) + word_list = re_split(data_str) stop_words = get_stopwords() - return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ] + return [w for w in word_list if (not w in stop_words) and len(w) >= 3] + def count_word(word, word_freqs, stopwords): if word not in stopwords: word_freqs[word] = word_freqs.get(word, 0) + 1 -def get_frequencies(word_list): - word_freqs = {} - for word in word_list: - word_freqs[word] = word_freqs.get(word, 0) + 1 + +def get_frequencies(word_list): + word_freqs = {} + for word in word_list: + word_freqs[word] = word_freqs.get(word, 0) + 1 return word_freqs -def sort_dict (word_freq): + +def sort_dict(word_freq): return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True) # return sorted( word_freq, key=lambda x: x[1], reverse=True ) -def print_word_freqs( word_freqs, n = 10): - for (w, c) in word_freqs[ :n ]: - print( w, '-', c ) + +def print_word_freqs(word_freqs, n=10): + for (w, c) in word_freqs[:n]: + print(w, '-', c) ################################################################################ # 通用工具 ################################################################################ + def timing_decorator(func): + def wrapper(*args, **kwargs): start_time = time.time() # 记录开始时间 result = func(*args, **kwargs) # 调用原始函数 @@ -87,7 +99,9 @@ def timing_decorator(func): run_time = end_time - start_time # 计算运行时间 print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 秒") return result + return wrapper -def test(): - print( 'cppy welcome' ) \ No newline at end of file + +def test(): + print('cppy welcome') diff --git a/A 代码模式/cppy_/data/Prey.txt b/A 代码模式/cppy/data/Prey.txt similarity index 100% rename from A 代码模式/cppy_/data/Prey.txt rename to A 代码模式/cppy/data/Prey.txt diff --git a/A 代码模式/cppy_/data/pride-and-prejudice.txt b/A 代码模式/cppy/data/pride-and-prejudice.txt similarity index 100% rename from A 代码模式/cppy_/data/pride-and-prejudice.txt rename to A 代码模式/cppy/data/pride-and-prejudice.txt diff --git a/A 代码模式/cppy_/data/stop_words.txt b/A 代码模式/cppy/data/stop_words.txt similarity index 100% rename from A 代码模式/cppy_/data/stop_words.txt rename to A 代码模式/cppy/data/stop_words.txt diff --git a/A 代码模式/cppy_/data/test.txt b/A 代码模式/cppy/data/test.txt similarity index 100% rename from A 代码模式/cppy_/data/test.txt rename to A 代码模式/cppy/data/test.txt diff --git a/A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc b/A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc deleted file mode 100644 index 6e5a579..0000000 Binary files a/A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc and /dev/null differ