Compare commits
No commits in common. 'dev' and 'dev' have entirely different histories.
@ -0,0 +1,4 @@
|
||||
log.txt
|
||||
/test
|
||||
/.venv
|
||||
__pycache__
|
@ -1,5 +1,3 @@
|
||||
''' 递归是基础编程思维产生的代码结构 。注意递归深度限制只能解决小规模问题 '''
|
||||
|
||||
from cppy.cp_util import *
|
||||
from collections import Counter
|
||||
|
@ -1,7 +1,7 @@
|
||||
from cppy.cp_util import *
|
||||
|
||||
#
|
||||
# 生成器 是一种简单异步实现
|
||||
# 生成器
|
||||
#
|
||||
def non_stop_words(testfilepath):
|
||||
stopwords = get_stopwords()
|
@ -1,9 +1,6 @@
|
||||
import time
|
||||
import cppy.cp_util as util
|
||||
|
||||
'''
|
||||
用反射实现装饰器的效果
|
||||
'''
|
||||
|
||||
# 工具函数
|
||||
def extract_words(path_to_file):
|
@ -0,0 +1,34 @@
|
||||
from collections import Counter
|
||||
import cppy.cp_util as util
|
||||
|
||||
def word_frequency( top_n=10 ):
    """Build a decorator that reports the most frequent words of a result.

    The decorated function is called unchanged. Its return value is passed
    to ``util.extract_str_words`` (presumably a tokenizer for a text string;
    confirm against ``cppy.cp_util``), the resulting words are counted, and
    the ``top_n`` most common entries are handed to
    ``util.print_word_freqs``. The decorated function's own return value is
    then returned to the caller.

    Args:
        top_n: How many of the most frequent words to report (default 10).

    Returns:
        A decorator that wraps a function with the reporting step above.
    """
    # Local import so this block does not rely on a file-level import.
    import functools

    def decorator(func):
        # Preserve func's __name__, __doc__ and __wrapped__ on the wrapper.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Run the decorated function first.
            result = func(*args, **kwargs)

            # Counter consumes the iterable of words directly.
            word_counts = Counter(util.extract_str_words(result))

            # Report the top_n most frequent words.
            most_common = word_counts.most_common(top_n)
            util.print_word_freqs( most_common )

            return result

        return wrapper
    return decorator
|
||||
|
||||
|
||||
# 使用装饰器
|
||||
@word_frequency( top_n=10 )
def read_file(file_path):
    """Return the whole content of ``file_path``, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return content


# Module-level demo call: reads the test file through the decorated function.
read_file( util.testfilepath )
|
@ -1,6 +1,3 @@
|
||||
|
||||
# 自己设计类装饰器,对类方法进行类型声明和检查
|
||||
|
||||
from collections import Counter
|
||||
from cppy.cp_util import *
|
||||
|
@ -0,0 +1,42 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import multiprocessing
|
||||
from collections import Counter
|
||||
from cppy.cp_util import *
|
||||
|
||||
|
||||
#
|
||||
# 多进程: 因为创建进程相比计算过程开销太大,结果最慢
|
||||
#
|
||||
stop_words = get_stopwords()
|
||||
|
||||
def process_chunk(chunk):
    """Count the words of one chunk.

    Words found in the module-level ``stop_words`` collection and words
    shorter than three characters are left out of the count.

    Args:
        chunk: Iterable of words.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences.
    """
    kept = []
    for w in chunk:
        # Same check order as before: stop-word lookup first, then length.
        if w in stop_words or len(w) < 3:
            continue
        kept.append(w)
    return Counter(kept)
|
||||
|
||||
def merge_counts(counts_list):
    """Merge several Counter objects into one total.

    Args:
        counts_list: Iterable of ``Counter`` objects. The inputs are not
            modified.

    Returns:
        A new ``Counter`` holding the summed counts. An empty input yields
        an empty ``Counter``. As with ``Counter`` addition, only positive
        totals are kept.
    """
    total = Counter()
    for counts in counts_list:
        # In-place add updates one accumulator instead of building a fresh
        # Counter at every step, which sum(counts_list, Counter()) would do.
        total += counts
    return total
|
||||
|
||||
|
||||
@timing_decorator
def main():
    """Count word frequencies of the test file with a pool of worker processes.

    The chunks returned by ``get_chunks`` are mapped over ``process_chunk``
    in a ``multiprocessing.Pool``, the per-chunk counters are merged, and
    the ten most common words are passed to ``print_word_freqs``.
    """
    # Read the file and split its content into chunks; each chunk is handled
    # by a worker process.
    chunks = get_chunks(testfilepath,1000)

    # The context manager releases the pool even if pool.map raises.
    # map() only returns once every chunk has been processed, so the
    # terminate() performed on exit cannot cut off pending work.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Merge the per-chunk counts.
    total_counts = merge_counts(counts_list)

    # Print the ten most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()
|
||||
|
@ -1,9 +1,6 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# 提供一个命令行交互方式来驱动程序运行
|
||||
|
||||
|
||||
# 清洗文本,移除标点符号并转换为小写
|
||||
def clean_text(text):
    """Strip punctuation from ``text`` and lower-case what remains.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed; underscores count as word characters and stay.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
|
@ -1,7 +1,3 @@
|
||||
|
||||
# 命令行菜单驱动程序
|
||||
# while 循环体中出现一个 case 菜单,每个菜单调用对应函数
|
||||
|
||||
import os
|
||||
import cppy.cp_util as util
|
||||
|
@ -1,6 +1,3 @@
|
||||
|
||||
# 应用基于一个窗口,窗口基于操作系统的消息机制
|
||||
|
||||
import sys
|
||||
from PyQt5.QtWidgets import QApplication, QWidget, QPushButton, QVBoxLayout, QTextEdit, QFileDialog
|
||||
import cppy.cp_util as util
|
@ -0,0 +1,2 @@
|
||||
" my Some sure acquaintance or other, my dear, sure,other I suppose; I am sure I do not
|
||||
know. sure "
|
@ -1,6 +1,4 @@
|
||||
# 创建对象是消耗资源的,如果发现对象已经存在,可以返回引用,不创造新对象 。设计模式中这个做法叫享元
|
||||
# 可以降低资源需求和提升响应速度。更常见的该模式使用场景是各种资源池。
|
||||
|
||||
from cppy.cp_util import *
|
||||
|
||||
#享元类
|
@ -0,0 +1,9 @@
|
||||
|
||||
注册
|
||||
- 解耦合:通过回调函数,可以将不同部分的代码逻辑分离,降低模块之间的耦合度。
|
||||
- 主动通信:注册回调模式实现了下层模块与上层模块之间的主动通信。当下层模块发生特定事件或满足特定条件时,可以主动调用上层模块注册的回调函数,而不需要上层模块不停地轮询下层模块的状态。
|
||||
|
||||
- 异步处理:回调函数常用于异步操作的响应处理,可以在主线程之外执行耗时操作,提升程序的效率和响应速度。
|
||||
- 简化设计:在某些情况下,使用回调函数可以避免复杂的控制流设计,使代码更加简洁明了。
|
||||
|
||||
- 适应变化:随着项目的发展,需求可能会发生变化。注册回调模式使得在不影响现有代码的基础上,容易添加新功能或修改现有逻辑。
|
@ -1,7 +1,3 @@
|
||||
|
||||
# 插件模式提供一种个别扩展性开发和系统核心开发无关的松耦合结构。
|
||||
# 简单说,第三方开发者在没有核心框架源码下也能扩展或者改造系统功能
|
||||
|
||||
import configparser, importlib.machinery
|
||||
from cppy.cp_util import *
|
||||
|
@ -1,7 +1,3 @@
|
||||
|
||||
# Python 作为动态类型语言,希望拥有静态类型语言那样规范、工整的工程性优点,为此牺牲一些代码的自由度。
|
||||
# 可以理解为更好的代码注释和更多的工程约束 。
|
||||
|
||||
import cppy.cp_util as util
|
||||
|
||||
|
@ -1,9 +1,7 @@
|
||||
from collections import Counter
|
||||
from cppy.cp_util import *
|
||||
|
||||
#
|
||||
# 遇到异常,退出程序
|
||||
#
|
||||
|
||||
def extract_words(path_to_file):
|
||||
assert(type(path_to_file) is str), "Must be a string!"
|
||||
assert(path_to_file), "Must be a non-empty string!"
|
@ -1,8 +1,6 @@
|
||||
from cppy.cp_util import *
|
||||
|
||||
#
|
||||
# 用断言从事发点给出出错的准确信息
|
||||
#
|
||||
|
||||
def extractWords(path_to_file):
|
||||
assert(type(path_to_file) is str), "Must be a string"
|
||||
assert(path_to_file), "Must be a non-empty string"
|
@ -0,0 +1,4 @@
|
||||
|
||||
|
||||
## 任务
|
||||
本项目的主要功能任务:做文本文件的分词,过滤常见词,求词频,并排序输出。
|
@ -1 +0,0 @@
|
||||
异常主要发生在参数传递和代码块执行过程。一种原则是:软件不能挂掉。检查参数合理性、检查代码块执行可能的错误,并进行合理结果补齐,保持程序继续运行【 1 软件不能挂掉 】,另外一种情况是发生异常就抛出然后终止程序【 2 时间停止在那一刻 】,或者由上层函数接住,集中统一处理。【 3 预判可能的错误 】。
|
@ -1,105 +0,0 @@
|
||||
'''
|
||||
把观察者挂到自己的处理队列上
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
from queue import Queue
|
||||
from collections import Counter
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
# 观察者接口
|
||||
class Observer(ABC):
    """Observer interface: receives word counts through ``update``."""

    @abstractmethod
    def update(self, word_counts: Counter):
        """Handle ``word_counts``; concrete observers must override this."""
|
||||
|
||||
# 具体观察者:打印前 10 高频词
|
||||
class PrintTopWordsObserver(Observer):
    """Concrete observer that prints the ten most frequent words."""

    def update(self, word_counts: Counter):
        """Print a header line, then one ``word: count`` line per entry."""
        print("Top 10 高频词:")
        top_entries = word_counts.most_common(10)
        for word, count in top_entries:
            print(f"{word}: {count}")
|
||||
|
||||
# 具体观察者:保存词频到文件
|
||||
class SaveToFileObserver(Observer):
    """Concrete observer that saves the ten most frequent words to a file."""

    def __init__(self, output_file):
        # Path of the file that update() overwrites on each notification.
        self.output_file = output_file

    def update(self, word_counts: Counter):
        """Write ``word: count`` lines for the top ten words to the file.

        Any exception raised while opening or writing is caught and
        reported with a printed message instead of being propagated.
        """
        try:
            with open(self.output_file, 'w', encoding='utf-8') as out:
                out.writelines(
                    f"{word}: {count}\n"
                    for word, count in word_counts.most_common(10)
                )
            print(f"词频已保存到 {self.output_file}")
        except Exception as err:
            print(f"保存失败: {err}")
|
||||
|
||||
# 词频统计器(主题)
|
||||
class WordFrequencyCounter:
    """Word-frequency counter that acts as the subject of the observer pattern.

    File paths are placed on an internal queue and drained by worker
    threads that add each file's words to one shared ``Counter``. When all
    workers have finished, every registered observer receives that counter
    through its ``update`` method.
    """

    def __init__(self):
        self.observers = []            # registered observers, in insertion order
        self.counter = Counter()       # word totals shared by all workers
        self.queue = Queue()           # file paths waiting to be processed
        self.lock = threading.Lock()   # guards updates of self.counter

    # The Observer annotations are quoted so that defining this class does
    # not require the Observer name to exist yet.
    def add_observer(self, observer: "Observer"):
        """Register ``observer`` so it is notified by ``notify_observers``."""
        self.observers.append(observer)

    def remove_observer(self, observer: "Observer"):
        """Unregister ``observer``.

        Raises:
            ValueError: If ``observer`` was never registered.
        """
        self.observers.remove(observer)

    def notify_observers(self):
        """Call ``update(self.counter)`` on every registered observer."""
        for observer in self.observers:
            observer.update(self.counter)

    def process_file(self):
        """Worker loop: take file paths off the queue until it is empty.

        Each file is read as UTF-8 and lower-cased, its words are extracted
        with the ``\\b\\w+\\b`` pattern, and they are added to the shared
        counter under the lock. A failure on one file is printed and the
        loop continues with the next path.
        """
        # Local import so this block does not rely on a file-level import.
        from queue import Empty

        while True:
            try:
                file_path = self.queue.get_nowait()
            except Empty:
                # Only an empty queue ends the worker; a bare except here
                # would also swallow KeyboardInterrupt and real bugs.
                break
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read().lower()
                words = re.findall(r'\b\w+\b', text)
                with self.lock:
                    self.counter.update(words)
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing {file_path}: {e}")
            finally:
                # Mark the item as done whether or not the file was readable.
                self.queue.task_done()

    def count_words(self, files, num_threads=4):
        """Count the words of ``files`` in parallel, then notify observers.

        Args:
            files: Iterable of file paths to process.
            num_threads: Number of worker threads to start (default 4).
        """
        # Put every file path on the queue.
        for file_path in files:
            self.queue.put(file_path)

        # Create and start the worker threads, then wait for all of them.
        threads = [threading.Thread(target=self.process_file) for _ in range(num_threads)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Notify all observers once the counts are complete.
        self.notify_observers()
|
||||
|
||||
def main():
    """Count words of every ``.txt`` file in ``data`` and notify two observers.

    One observer prints the top words and the other saves them to
    ``word_frequency.txt``.
    """
    # Collect the list of input files.
    data_dir = 'data'
    files = []
    for entry in os.listdir(data_dir):
        if entry.endswith('.txt'):
            files.append(os.path.join(data_dir, entry))

    # Create the word-frequency counter (the subject).
    counter = WordFrequencyCounter()

    # Register the observers.
    printer = PrintTopWordsObserver()
    counter.add_observer(printer)
    saver = SaveToFileObserver("word_frequency.txt")
    counter.add_observer(saver)

    # Count the words and notify the observers.
    counter.count_words(files)


if __name__ == '__main__':
    main()
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue