zj3D 9 months ago
parent 856fdcc1e1
commit 0e55cabe5c

@ -1,5 +1,5 @@
import string import string
from cppy.cp_util import * from cppy.cp_util import stopwordfilepath,testfilepath
# 准备词和停用词表 # 准备词和停用词表
word_freqs = [] word_freqs = []

@ -1,4 +1,5 @@
from cppy.cp_util import * from cppy.cp_util import stopwordfilepath,testfilepath
import string
from collections import Counter from collections import Counter
# 准备词和停用词表 # 准备词和停用词表

@ -1,11 +1,10 @@
import re, sys, collections import re, collections
from cppy.cp_util import * from cppy.cp_util import stopwordfilepath,testfilepath
stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(',')) stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(','))
words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower()) words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower())
counts = collections.Counter(w for w in words if w not in stopwords) counts = collections.Counter(w for w in words if w not in stopwords)
for (w, c) in counts.most_common(10): for (w, c) in counts.most_common(10) : print(w, '-', c)
print(w, '-', c)
''' '''
熟练的软件工程师会如此简单完成任务 熟练的软件工程师会如此简单完成任务

@ -0,0 +1,48 @@
import re
from collections import Counter
def clean_text(text):
    """Return *text* lower-cased with all punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted. Underscores are deleted as well: ``\\w`` matches ``_``, so a
    plain ``[^\\w\\s]`` pattern would leave them in place and let tokens
    such as ``_`` or ``snake_case`` survive as "words".

    Args:
        text: The raw text to normalise.

    Returns:
        The cleaned, lower-cased string; whitespace is left untouched.
    """
    # Punctuation is removed before lower-casing, matching the original order.
    without_punctuation = re.sub(r'[^\w\s]|_', '', text)
    return without_punctuation.lower()
def count_frequencies(text):
    """Count how often each word occurs in *text*.

    The text is first normalised with ``clean_text`` and then split on
    runs of whitespace.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    # Counter accepts any iterable, so the split list is passed directly.
    return Counter(clean_text(text).split())
def interactive_mode():
    """Prompt for a file path and a count, then print the most common words.

    Both values are read from standard input. A count that is not a
    positive integer, or a file that cannot be read, is reported on
    standard output and ends the function early; no exception is
    propagated to the caller for those cases.
    """
    # Stray whitespace around a typed or pasted path would otherwise make
    # open() fail for a file that actually exists.
    file_path = input("请输入文件路径: ").strip()

    try:
        n = int(input("请输入你想要输出的前n个最常见单词的数量: "))
        if n <= 0:
            # Raised inside the try so that the handler below reports both
            # "not an integer" and "not positive" in the same way.
            raise ValueError("数量必须大于0。")
    except ValueError as e:
        print(f"输入错误:{e}")
        return

    # Keep the try limited to reading the file, so that a bug in the
    # counting code is not reported as an input problem.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"文件未找到: {file_path}")
        return
    except (OSError, ValueError) as e:
        # OSError covers other filesystem failures (permissions, directory
        # given as path, ...); ValueError covers UnicodeDecodeError raised
        # when the file is not valid UTF-8.
        print(f"发生错误: {e}")
        return

    # Print the n most frequent words, most frequent first.
    for word, freq in count_frequencies(text).most_common(n):
        print(f"{word}: {freq}")
def main():
    """Print the welcome banner and start the interactive prompt."""
    print("欢迎使用词频统计工具。")
    interactive_mode()


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

@ -1,7 +1,7 @@
from cppy.cp_util import * from cppy.cp_util import *
def extract_words(obj, path_to_file): def extract_words(obj, path_to_file):
obj['data'] = re_split( read_file(path_to_file) ) obj['data'] = extract_file_words(path_to_file)
def load_stop_words(obj): def load_stop_words(obj):
obj['stop_words'] = get_stopwords() obj['stop_words'] = get_stopwords()

@ -23,9 +23,9 @@ top_10_words = calculate_word_frequency(testfilepath)
print_word_freqs(top_10_words) print_word_freqs(top_10_words)
''' '''
python 提供了一种缓存调用函数的机制 Python 提供了一个缓存调用函数的装饰器
import functools import functools
# 使用 functools.lru_cache 缓存结果 # 使用 functools.lru_cache 缓存结果
@functools.lru_cache(maxsize=None) @functools.lru_cache(maxsize=None)
def calculate_word_frequency(file_path): def calculate_word_frequency(file_path):

Loading…
Cancel
Save