dev
zj3D 8 months ago
parent 2d46194636
commit c8946209bf

@ -1,17 +1,20 @@
import string
from cppy.cp_util import stopwordfilepath,testfilepath
# Import the paths of the stop-word list and the test file
from cppy.cp_util import stopwordfilepath, testfilepath
# NOTE(review): this span appears to be a rendered diff - the pre- and post-change versions of edited lines both appear, without +/- markers or indentation.
# Prepare the word list and the stop-word list
word_freqs = []
with open( stopwordfilepath,encoding='utf-8' ) as f:
# Prepare the stop-word list
with open(stopwordfilepath, encoding='utf-8') as f:
# The stop-word file is read whole and split on commas.
stop_words = f.read().split(',')
# Every single lowercase ASCII letter is also treated as a stop word.
stop_words.extend(list(string.ascii_lowercase))
for letter in 'abcdefghijklmnopqrstuvwxyz':
stop_words.append(letter)
for line in open( testfilepath ,encoding='utf-8' ):
# Read the file, scan the text line by line, find each word, check that it is not a stop word, and count it
word_freqs = []
for line in open( testfilepath, encoding='utf-8' ):
# start_char is None while no word is in progress; it is set to position i when an alphanumeric character starts one.
start_char = None
i = 0
for c in line:
if start_char == None:
if start_char is None:
if c.isalnum():
# Start of a word
start_char = i
@ -32,15 +35,18 @@ for line in open( testfilepath ,encoding='utf-8' ):
pair_index += 1
if not found:
# Not found: append a new [word, count] pair with a count of 1
word_freqs.append([word, 1])
elif len(word_freqs) > 1:
# Walk backwards over the earlier entries; whenever the entry at pair_index has a larger count, swap it forward
for n in reversed(range(pair_index)):
if word_freqs[pair_index][1] > word_freqs[n][1]:
# Swap
word_freqs[n], word_freqs[pair_index] = word_freqs[pair_index], word_freqs[n]
pair_index = n
# Reset the start marker
start_char = None
i += 1
for tf in word_freqs[0:10]:
print(tf[0], '-', tf[1])
# Sort the word frequencies with bubble sort (descending by count: a pair is swapped when the left count is smaller)
n = len(word_freqs)
for i in range(n):
for j in range(0, n-i-1):
if word_freqs[j][1] < word_freqs[j+1][1]:
word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j]
# Print the 10 most frequent words
for tf in word_freqs[:10]:
print(tf[0], '-', tf[1])

@ -1,7 +1,7 @@
from cppy.cp_util import *
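# This example has no practical purpose; it is meant to help understand the next example
# The program only needs to do the first step; the rest of the sequential logic is written inside the individual functions
# The program only needs to do the first step; the rest of the sequential logic is written inside the individual functions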
def readfile(path_to_file, func):
data = read_file(path_to_file)

@ -27,7 +27,7 @@ class IWordFrequencyCounter(metaclass=abc.ABCMeta):
# Class implementations
#
class DataStorageManager1:
def __init__(self, path_to_file):
def __init__(self, path_to_file):
self._data = read_file(path_to_file)
self._data = re_split(self._data)

@ -1,7 +1,7 @@
from collections import Counter
from cppy.cp_util import *
class AcceptTypes:
class TypesCheck:
def __init__(self, *args):
self._args = args
@ -9,19 +9,19 @@ class AcceptTypes:
def wrapped_f(*args, **kwargs):
for i, arg_type in enumerate(self._args):
if not isinstance(args[i], arg_type):
raise TypeError(f"Argument {i} expected {arg_type}, got {type(args[i])}")
raise TypeError(f" {i} expected {arg_type}, got {type(args[i])}")
return f(*args, **kwargs)
return wrapped_f
# Return extract_file_words(path_to_file).
# The type-checking decorator raises TypeError when the first positional argument is not a str.
# NOTE(review): both the old (AcceptTypes) and the new (TypesCheck) decorator lines are present - this span appears to be a rendered diff.
@AcceptTypes(str)
@TypesCheck(str)
def extract_words_(path_to_file):
return extract_file_words(path_to_file)
# Count the occurrences of each item of word_list and return them as a collections.Counter.
# The type-checking decorator raises TypeError when the first positional argument is not a list.
# NOTE(review): both the old (AcceptTypes) and the new (TypesCheck) decorator lines are present - this span appears to be a rendered diff.
@AcceptTypes(list)
@TypesCheck(list)
def frequencies_(word_list):
return Counter(word_list)
# Return the (item, count) pairs of word_freq ordered from most to least common (Counter.most_common with no limit).
# The type-checking decorator raises TypeError when the first positional argument is not a Counter.
# NOTE(review): both the old (AcceptTypes) and the new (TypesCheck) decorator lines are present - this span appears to be a rendered diff.
@AcceptTypes(Counter)
@TypesCheck(Counter)
def sort_(word_freq):
return word_freq.most_common()

@ -3,25 +3,25 @@ from cppy.cp_util import *
# Read the file at path_to_file as UTF-8 and return re_split() applied to its contents.
# Asserts that path_to_file is a non-empty str; an IOError raised while opening or reading is reported with print() and then re-raised.
# NOTE(review): assert statements are removed when Python runs with -O, so these checks are not guaranteed to execute.
# NOTE(review): this span appears to be a rendered diff - the old and new versions of the assert/print lines are both shown.
def extract_words(path_to_file):
assert(type(path_to_file) is str), "I need a string!"
assert(path_to_file), "I need a non-empty string!"
assert(type(path_to_file) is str), "Must be a string!"
assert(path_to_file), "Must be a non-empty string!"
try:
with open(path_to_file,encoding='utf-8') as f:
str_data = f.read()
except IOError as e:
print("I/O error({0}) when opening {1}: {2}! I quit!".format(e.errno, path_to_file, e.strerror))
print("I/O error({0}) when opening {1}: {2}".format(e.errno, path_to_file, e.strerror))
# Re-raise so the caller still sees the failure after the diagnostic is printed
raise e
return re_split(str_data)
# Return a new list with the items of word_list that are not contained in the result of get_stopwords().
# Asserts that word_list is exactly a list; an IOError from get_stopwords() is reported with print() and then re-raised.
# NOTE(review): assert statements are removed when Python runs with -O, so this check is not guaranteed to execute.
# NOTE(review): this span appears to be a rendered diff - the old and new versions of the assert/print lines are both shown.
def remove_stop_words(word_list):
assert(type(word_list) is list), "I need a list!"
assert(type(word_list) is list), "Must be a list!"
try:
stop_words = get_stopwords()
except IOError as e:
print("I/O error({0}) opening stops_words.txt: {1}! I quit!".format(e.errno, e.strerror))
print("I/O error({0}) opening stops_words.txt: {1}".format(e.errno, e.strerror))
# Re-raise so the caller still sees the failure after the diagnostic is printed
raise e
return [w for w in word_list if not w in stop_words]

@ -2,18 +2,18 @@ from cppy.cp_util import *
# Return extract_file_words(path_to_file) after asserting that path_to_file is a non-empty str.
# A failed assertion raises AssertionError with the message shown on that line.
# NOTE(review): assert statements are removed when Python runs with -O, so these checks are not guaranteed to execute.
# NOTE(review): this span appears to be a rendered diff - the old and new versions of the assert lines are both shown.
def extractWords(path_to_file):
assert(type(path_to_file) is str), "I need a string! I quit!"
assert(path_to_file), "I need a non-empty string! I quit!"
assert(type(path_to_file) is str), "Must be a string"
assert(path_to_file), "Must be a non-empty string"
return extract_file_words(path_to_file)
# Return get_frequencies(word_list) after asserting that word_list is a non-empty list.
# The type check is exact (type(...) is list), so list subclasses are rejected as well.
# NOTE(review): assert statements are removed when Python runs with -O, so these checks are not guaranteed to execute.
# NOTE(review): this span appears to be a rendered diff - the old and new versions of the assert lines are both shown.
def frequencies(word_list):
assert(type(word_list) is list), "I need a list! I quit!"
assert(word_list != []), "I need a non-empty list! I quit!"
assert(type(word_list) is list), "Must be a list"
assert(word_list != []), "Must be a non-empty list"
return get_frequencies(word_list)
# Return sort_dict(word_freqs) after asserting that word_freqs is a non-empty dict.
# The type check is exact (type(...) is dict), so dict subclasses such as collections.Counter are rejected.
# NOTE(review): assert statements are removed when Python runs with -O, so these checks are not guaranteed to execute.
# NOTE(review): this span appears to be a rendered diff - the old and new versions of the assert lines are both shown.
def sort(word_freqs):
assert(type(word_freqs) is dict), "I need a dictionary! I quit!"
assert(word_freqs != {}), "I need a non-empty dictionary! I quit!"
assert(type(word_freqs) is dict), "Must be a dictionary"
assert(word_freqs != {}), "Must be a non-empty dictionary"
return sort_dict(word_freqs)

@ -1,33 +0,0 @@
# -*- coding: utf-8 -*-
import cppy.cp_util as util
# Each column is a [value, formula] pair. The first column holds the input
# data, so it has no formula.
all_words = [(), None]
non_stop_words = [(), util.extract_str_words]
frequencies = [(), util.get_frequencies]
sorted_data = [(), util.sort_dict]

# The whole spreadsheet: the input column followed by the formula columns.
all_columns = [
    all_words,
    non_stop_words,
    frequencies,
    sorted_data,
]
# Call this after every change to the input data.
def update():
    """Recompute each formula column from the column it depends on.

    Walks every column of ``all_columns`` after the first (input) one and
    overwrites its value slot (index 0) with its formula (index 1) applied
    to the current value of its source column:

    * ``util.extract_str_words`` reads from ``all_words``
    * ``util.get_frequencies`` reads from ``non_stop_words``
    * ``util.sort_dict`` reads from ``frequencies``

    A column whose formula is none of these is left unchanged.
    """
    for column in all_columns[1:]:
        formula = column[1]
        # Pick the upstream column first, then apply the formula once below.
        if formula == util.extract_str_words:
            source = all_words
        elif formula == util.get_frequencies:
            source = non_stop_words
        elif formula == util.sort_dict:
            source = frequencies
        else:
            continue
        column[0] = formula(source[0])
# Load the fixed input data into the first column
all_words[0] = util.read_file(util.testfilepath)
# Call update() to walk the column list and recompute each formula column
update()
# Print the result
util.print_word_freqs(sorted_data[0])
Loading…
Cancel
Save