You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import sys
|
|
|
|
|
import re
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
|
|
# Usage: python command_line_1.py testfilepath 10
|
|
|
|
|
|
|
|
|
|
# Clean the text: remove punctuation and convert to lowercase
|
|
|
|
|
def clean_text(text: str) -> str:
    """Strip punctuation from *text* and return it lower-cased.

    Every character that is neither a word character nor whitespace is
    removed. Whitespace is left untouched so the result can still be split
    into words.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # NOTE: \w also matches digits and "_", so those survive the cleaning.
    return re.sub(r'[^\w\s]', '', text).lower()
|
|
|
|
|
|
|
|
|
|
# Count word frequencies
|
|
|
|
|
def count_frequencies(text: str) -> Counter:
    """Count how often each word occurs in *text*.

    The text is first normalised with ``clean_text`` and then split on
    whitespace; each resulting token is one word.

    Args:
        text: The raw input string.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    # Counter consumes the list of words directly; no wrapping generator
    # expression is needed.
    return Counter(clean_text(text).split())
|
|
|
|
|
|
|
|
|
|
# Main function
|
|
|
|
|
def main() -> None:
    """Print the *n* most frequent words of a UTF-8 text file.

    Reads two command-line arguments from ``sys.argv``: the path of the
    file to analyse and the number of words to report. Each result is
    printed to stdout as ``word: count``.

    Raises:
        SystemExit: With status 1 when the argument count is wrong, when
            ``n`` is not an integer, when the file does not exist, or when
            the file cannot be decoded as UTF-8.
    """
    # Check the number of command-line arguments.
    if len(sys.argv) != 3:
        print("Usage: python command_line_1.py <file_path> <n>",
              file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        # Parse n inside the try block so that a non-integer argument is
        # reported by the ValueError handler instead of an uncaught traceback.
        n = int(sys.argv[2])

        # Open the file and read its content.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}", file=sys.stderr)
        sys.exit(1)
    except ValueError as e:
        # Also covers UnicodeDecodeError (a ValueError subclass) raised when
        # the file is not valid UTF-8.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Count the word frequencies and take the n most common words.
    most_common = count_frequencies(text).most_common(n)

    # Print the results.
    for word, freq in most_common:
        print(f"{word}: {freq}")
|
|
|
|
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|