dev
zj3D 8 months ago
parent 2d46194636
commit c8946209bf

@ -1,17 +1,20 @@
import string # 引入停用词表和测试文件的路径
from cppy.cp_util import stopwordfilepath,testfilepath from cppy.cp_util import stopwordfilepath, testfilepath
# 准备词和停用词表 # 准备停用词表
word_freqs = [] with open(stopwordfilepath, encoding='utf-8') as f:
with open( stopwordfilepath,encoding='utf-8' ) as f:
stop_words = f.read().split(',') stop_words = f.read().split(',')
stop_words.extend(list(string.ascii_lowercase)) for letter in 'abcdefghijklmnopqrstuvwxyz':
stop_words.append(letter)
for line in open( testfilepath ,encoding='utf-8' ): # 读文件,逐行扫描文本,发现词,确定不是停用词,计数
word_freqs = []
for line in open( testfilepath, encoding='utf-8' ):
start_char = None start_char = None
i = 0 i = 0
for c in line: for c in line:
if start_char == None: if start_char is None:
if c.isalnum(): if c.isalnum():
# 一个单词开始 # 一个单词开始
start_char = i start_char = i
@ -32,15 +35,18 @@ for line in open( testfilepath ,encoding='utf-8' ):
pair_index += 1 pair_index += 1
if not found: if not found:
word_freqs.append([word, 1]) word_freqs.append([word, 1])
elif len(word_freqs) > 1:
for n in reversed(range(pair_index)):
if word_freqs[pair_index][1] > word_freqs[n][1]:
# 交换
word_freqs[n], word_freqs[pair_index] = word_freqs[pair_index], word_freqs[n]
pair_index = n
# 重置开始标记 # 重置开始标记
start_char = None start_char = None
i += 1 i += 1
for tf in word_freqs[0:10]: # 使用冒泡排序对词频进行排序
n = len(word_freqs)
for i in range(n):
for j in range(0, n-i-1):
if word_freqs[j][1] < word_freqs[j+1][1]:
word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j]
# 打印频率最高的前10个词
for tf in word_freqs[:10]:
print(tf[0], '-', tf[1]) print(tf[0], '-', tf[1])

@ -1,7 +1,7 @@
from cppy.cp_util import * from cppy.cp_util import *
# 这个例子没有实际意义,是用来帮助理解下一个例子 # 这个例子没有实际意义,是用来帮助理解下一个例子
# 程序只需要做第一件事情,后面的顺序逻辑写到各个函数里面了 # 程序只需要做第一件事情,后面的顺序逻辑写到各个函数里面了
def readfile(path_to_file, func): def readfile(path_to_file, func):
data = read_file(path_to_file) data = read_file(path_to_file)

@ -1,7 +1,7 @@
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
class AcceptTypes: class TypesCheck:
def __init__(self, *args): def __init__(self, *args):
self._args = args self._args = args
@ -9,19 +9,19 @@ class AcceptTypes:
def wrapped_f(*args, **kwargs): def wrapped_f(*args, **kwargs):
for i, arg_type in enumerate(self._args): for i, arg_type in enumerate(self._args):
if not isinstance(args[i], arg_type): if not isinstance(args[i], arg_type):
raise TypeError(f"Argument {i} expected {arg_type}, got {type(args[i])}") raise TypeError(f" {i} expected {arg_type}, got {type(args[i])}")
return f(*args, **kwargs) return f(*args, **kwargs)
return wrapped_f return wrapped_f
@AcceptTypes(str) @TypesCheck(str)
def extract_words_(path_to_file): def extract_words_(path_to_file):
return extract_file_words(path_to_file) return extract_file_words(path_to_file)
@AcceptTypes(list) @TypesCheck(list)
def frequencies_(word_list): def frequencies_(word_list):
return Counter(word_list) return Counter(word_list)
@AcceptTypes(Counter) @TypesCheck(Counter)
def sort_(word_freq): def sort_(word_freq):
return word_freq.most_common() return word_freq.most_common()

@ -3,25 +3,25 @@ from cppy.cp_util import *
def extract_words(path_to_file): def extract_words(path_to_file):
assert(type(path_to_file) is str), "I need a string!" assert(type(path_to_file) is str), "Must be a string!"
assert(path_to_file), "I need a non-empty string!" assert(path_to_file), "Must be a non-empty string!"
try: try:
with open(path_to_file,encoding='utf-8') as f: with open(path_to_file,encoding='utf-8') as f:
str_data = f.read() str_data = f.read()
except IOError as e: except IOError as e:
print("I/O error({0}) when opening {1}: {2}! I quit!".format(e.errno, path_to_file, e.strerror)) print("I/O error({0}) when opening {1}: {2}".format(e.errno, path_to_file, e.strerror))
raise e raise e
return re_split(str_data) return re_split(str_data)
def remove_stop_words(word_list): def remove_stop_words(word_list):
assert(type(word_list) is list), "I need a list!" assert(type(word_list) is list), "Must be a list!"
try: try:
stop_words = get_stopwords() stop_words = get_stopwords()
except IOError as e: except IOError as e:
print("I/O error({0}) opening stops_words.txt: {1}! I quit!".format(e.errno, e.strerror)) print("I/O error({0}) opening stops_words.txt: {1}".format(e.errno, e.strerror))
raise e raise e
return [w for w in word_list if not w in stop_words] return [w for w in word_list if not w in stop_words]

@ -2,18 +2,18 @@ from cppy.cp_util import *
def extractWords(path_to_file): def extractWords(path_to_file):
assert(type(path_to_file) is str), "I need a string! I quit!" assert(type(path_to_file) is str), "Must be a string"
assert(path_to_file), "I need a non-empty string! I quit!" assert(path_to_file), "Must be a non-empty string"
return extract_file_words(path_to_file) return extract_file_words(path_to_file)
def frequencies(word_list): def frequencies(word_list):
assert(type(word_list) is list), "I need a list! I quit!" assert(type(word_list) is list), "Must be a list"
assert(word_list != []), "I need a non-empty list! I quit!" assert(word_list != []), "Must be a non-empty list"
return get_frequencies(word_list) return get_frequencies(word_list)
def sort(word_freqs): def sort(word_freqs):
assert(type(word_freqs) is dict), "I need a dictionary! I quit!" assert(type(word_freqs) is dict), "Must be a dictionary"
assert(word_freqs != {}), "I need a non-empty dictionary! I quit!" assert(word_freqs != {}), "Must be a non-empty dictionary"
return sort_dict(word_freqs) return sort_dict(word_freqs)

@ -1,33 +0,0 @@
# -*- coding: utf-8 -*-
import cppy.cp_util as util
# Every column is a two-item list: [current value, formula].
# The first column carries the raw input data, so its formula slot is None.
all_words = [(), None]
non_stop_words = [(), util.extract_str_words]
frequencies = [(), util.get_frequencies]
sorted_data = [(), util.sort_dict]

# The whole "spreadsheet", in left-to-right evaluation order.
all_columns = [
    all_words,
    non_stop_words,
    frequencies,
    sorted_data,
]
# Call this after new input data has been placed in the first column.
def update():
    """Recompute every formula column from the column that feeds it.

    Walks all columns except the first (the input column) in order and, for
    each one whose formula is recognised, stores the result of applying that
    formula to the current value of its input column. Columns with an
    unrecognised formula are left unchanged.
    """
    # Ordered (formula, input column) pairs; the first matching formula wins.
    wiring = (
        (util.extract_str_words, all_words),
        (util.get_frequencies, non_stop_words),
        (util.sort_dict, frequencies),
    )
    for column in all_columns[1:]:
        formula = column[1]
        for known_formula, source in wiring:
            if formula == known_formula:
                # Read the source value only now, so that results computed
                # earlier in this same pass are picked up.
                column[0] = formula(source[0])
                break
# Load the fixed input data into the first column.
input_text = util.read_file(util.testfilepath)
all_words[0] = input_text

# Propagate the new input through every formula column.
update()

# Print the result held in the last column.
result = sorted_data[0]
util.print_word_freqs(result)
Loading…
Cancel
Save