zj3D 8 months ago
parent bfcaab3439
commit 239c0188d0

@@ -1,45 +1,45 @@
# -*- coding: utf-8 -*-
import time
import requests
from cppy.cp_util import *

# Query the resources; expect an empty list
url = 'http://127.0.0.1:5000/books'
response = requests.get(url)
print(response.json())
time.sleep(2)

# - Create resource #1
print('Creating resource #1')
book_1 = {"id": 1, "title": "Python编程:从入门到实践", "content": testfilepath}
url = 'http://127.0.0.1:5000/books/1'
response = requests.put(url, json=book_1)
time.sleep(2)

# - Create resource #2; point the testfilepath variable at the next file
print('Creating resource #2')
testfilepath = testfilepath.replace('Prey.txt', 'Pride-and-Prejudice.txt')
book_2 = {"id": 2, "title": "深入浅出计算机组成原理", "content": testfilepath}
url = 'http://127.0.0.1:5000/books/2'
response = requests.put(url, json=book_2)
time.sleep(2)

# - Create resource #3; update testfilepath again (there are exactly three test files)
print('Creating resource #3')
testfilepath = testfilepath.replace('Pride-and-Prejudice.txt', 'test.txt')
book_3 = {"id": 3, "title": "算法导论", "content": testfilepath}
url = 'http://127.0.0.1:5000/books/3'
response = requests.put(url, json=book_3)
time.sleep(2)

# - Query the resources and inspect the result
print('Querying the resources')
url = 'http://127.0.0.1:5000/books'
response = requests.get(url)
print(response.json())
time.sleep(2)

# - Operate on resource #1 to get its word frequencies
print('Getting word frequencies for resource #1')
url = 'http://127.0.0.1:5000/books/1/word_frequency'
response = requests.get(url)
print_word_freqs(response.json())
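
For context, the client above assumes a REST server on 127.0.0.1:5000 exposing GET /books, PUT /books/<id>, and GET /books/<id>/word_frequency. That server is not part of this hunk; the sketch below is only a guess at its shape (Flask is inferred from the default port 5000, and the handler bodies are hypothetical):

    from collections import Counter
    from flask import Flask, jsonify, request

    app = Flask(__name__)
    books = {}  # in-memory store keyed by book id

    @app.route('/books', methods=['GET'])
    def list_books():
        # Return every stored book record
        return jsonify(list(books.values()))

    @app.route('/books/<int:book_id>', methods=['PUT'])
    def put_book(book_id):
        # Create or replace the book with this id
        books[book_id] = request.get_json()
        return jsonify(books[book_id])

    @app.route('/books/<int:book_id>/word_frequency', methods=['GET'])
    def word_frequency(book_id):
        # Hypothetical: count words in the file named by the book's content field
        with open(books[book_id]['content'], 'r', encoding='utf-8') as f:
            words = f.read().lower().split()
        return jsonify(Counter(words).most_common(10))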

@@ -1,65 +0,0 @@
from cppy.cp_util import *
from collections import Counter
from heapq import nlargest
import re

class Pipeline:
    def __init__(self):
        pass

    def __or__(self, other):
        class PipelineComposition(Pipeline):
            def __init__(self, first, second):
                self.first = first
                self.second = second

            def process(self, data=None):
                return self.second.process(self.first.process(data))

        return PipelineComposition(self, other)

    def process(self, data):
        raise NotImplementedError

class FileReader(Pipeline):
    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):  # ignores its input; this stage starts the pipeline
        with open(self.filename, 'r', encoding='utf-8') as file:
            content = file.read()
        return content

class WordFrequencyCounter(Pipeline):
    def process(self, text):
        words = re.findall(r'\w+', text.lower())
        word_freq = Counter(words)
        return word_freq

class TopNFilter(Pipeline):
    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        return nlargest(self.n, word_freq.items(), key=lambda item: item[1])

# Assume a text file whose content is the text to analyze
filename = testfilepath
n = 5  # take the 5 most frequent words

# Build the pipeline
pipeline = FileReader(filename) | WordFrequencyCounter() | TopNFilter(n)

# Run the pipeline
top_n_words = pipeline.process()

# Print the result
for word, freq in top_n_words:
    print(f"{word}: {freq}")

@@ -1,42 +1,21 @@
-from collections import Counter
 from cppy.cp_util import *
-import re
+
+# When a chain of processing steps always takes the shared data object as its
+# first argument, the chain can be wrapped in a small pipe framework like this.
+# Note: the object must be called once at the end to run the final function.
 class Pipe:
     def __init__(self, func, *args, **kwargs):
-        # print( self, func, *args, **kwargs )
         self.func = func
         self.args = args
         self.kwargs = kwargs
 
     def __or__(self, other):
-        return Pipe(lambda x: self.func(x) or other.func(x))
-        # print(self.func.__name__, other.func.__name__ )
-        def composed_func():
-            print(other.func(self.func(self.args, self.kwargs), other.args, other.kwargs))
-            return other.func(self.func(self.args, self.kwargs), other.args, other.kwargs)
-        return Pipe(composed_func)
+        data = self.func(*self.args, **self.kwargs)
+        return Pipe(other.func, data, *other.args, **other.kwargs)
 
-    # def __call__(self, *args, **kwargs):
-    #     print( *args, **kwargs )
-    #     return self.func(*args, **kwargs)
-    def __call__(self, data):
-        return self.func(data)
+    def __call__(self):
+        return self.func(*self.args, **self.kwargs)
 
-def read_file(filename):
-    with open(filename, 'r', encoding='utf-8') as f:
-        return f.read()
-
-def split_words(text):
-    return re.findall(r'\b\w+\b', text.lower())
-
-def count_words(words):
-    return Counter(words)
-
-def top_n_words(word_counts, n):
-    return word_counts.most_common(n)
-
-# Use the pipe
-pipe = Pipe(read_file) | Pipe(split_words) | Pipe(count_words) | Pipe(top_n_words, 10)
-result = pipe(testfilepath)
-print(result)
+# Imitate a pipeline
+pipe = Pipe(extract_file_words, testfilepath) | Pipe(get_frequencies) | Pipe(sort_dict) | Pipe(print_word_freqs, 10)
+pipe()
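
Note the semantics of the new version: each `|` is eager, running the left-hand function immediately and threading its result in as the first argument of the right-hand Pipe, so the final call only executes the last stage. A self-contained illustration with hypothetical stand-in functions:

    def double(x):
        return 2 * x

    def add(x, y):
        return x + y

    p = Pipe(double, 10) | Pipe(add, 5)   # double(10) runs here, yielding Pipe(add, 20, 5)
    print(p())                            # add(20, 5) -> 25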

@@ -1,49 +1,25 @@
-import re
-from collections import Counter
-from functools import reduce
 from cppy.cp_util import *
 
-class Pipeline:
-    def __init__(self, function):
-        self.function = function
-
-    def __or__(self, other):
-        if isinstance(other, Pipeline):
-            return Pipeline(lambda x: self.function(x) or other.function(x))
-        else:
-            raise TypeError("The argument must be an instance of Pipeline")
-
-    def process(self, data):
-        return self.function(data)
-
-# Define the processing functions
-def read_file(path):
-    with open(path, 'r', encoding='utf-8') as file:
-        return file.read()
-
-def clean_text(text):
-    return re.sub(r'[^\w\s]', '', text).lower()
-
-def tokenize(text):
-    return re.findall(r'\b\w+\b', text)
-
-def remove_stop_words(tokens, stop_words):
-    return [token for token in tokens if token not in stop_words]
-
-def count_frequencies(tokens):
-    return Counter(tokens)
-
-def get_top_n_frequencies(counter, n):
-    return counter.most_common(n)
-
-# Define the stop-word list
-stop_words = set(['the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any', 'if', 'their', 'would', 'what', 'there', 'when', 'which', 'who', 'whom', 'whose', 'where', 'why'])
-
-# Build the pipeline
-pipeline = (Pipeline(read_file) | clean_text | tokenize
-            | remove_stop_words | count_frequencies
-            | get_top_n_frequencies(n=10))
-
-# Run the pipeline and print the result
-top_n_word_frequencies = pipeline.process(testfilepath)
-print(top_n_word_frequencies)
+# This style of chained method calls also reads quite nicely.
+# Each method returns self; mind whether the last method in the chain
+# needs to return a value.
+class Flow:
+    def extract_file_words(self, filepath):
+        self.data = extract_file_words(filepath)
+        return self
+
+    def get_frequencies(self):
+        self.data = get_frequencies(self.data)
+        return self
+
+    def sort_dict(self):
+        self.data = sort_dict(self.data)
+        return self
+
+    def print_word_freqs(self, n):
+        print_word_freqs(self.data, n)
+        return self
+
+# Call the steps in sequence
+Flow().extract_file_words(testfilepath).get_frequencies().sort_dict().print_word_freqs(10)
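
The same return-self chaining pattern, shown self-contained with hypothetical inline stand-ins for the cp_util helpers:

    from collections import Counter

    class MiniFlow:
        def read_text(self, text):
            self.data = text.lower().split()   # tokenize
            return self

        def count(self):
            self.data = Counter(self.data)     # word -> frequency
            return self

        def top(self, n):
            print(self.data.most_common(n))    # terminal step; still returns self
            return self

    MiniFlow().read_text('a b b c c c').count().top(2)   # [('c', 3), ('b', 2)]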
