"""Word-frequency counter for a Chinese text file (ciyuntu.py).

Reconstructed from a flattened git patch ("ADD file via upload") that
created ``ciyuntu.py``.

Pipeline:
    1. Read the source text and strip punctuation / whitespace.
    2. Segment the text into words with jieba (using a user dictionary).
    3. Drop single-character tokens, stop words and a hand-picked list of
       meaningless high-frequency words.
    4. Print the 8 most frequent words and write the full frequency table
       to a text file.
"""
from collections import Counter
from typing import Iterable, List, Optional, Set, Tuple

# Input / output locations (unchanged from the original script).
TEXT_PATH = r'C:\Users\86158\爬虫作业\ai吧.txt'
USERDICT_PATH = r'C:\Users\86158\爬虫作业\userdict.txt'
STOPWORDS_PATH = r'C:\Users\86158\爬虫作业\stopwords.txt'
OUTPUT_PATH = r'C:\Users\86158\爬虫作业\ciyuntu.txt'

# Every character in this string (including the spaces, the newline and
# the BOM) is deleted from the text before segmentation.
PUNCTUATION = ': , ; 。 、 ? ———— ‘’ “” () ! # 《》\n\ufeff'

# Meaningless high-frequency words that are filtered out of the result.
REMOVE_WORDS = ('哈哈', '可以', '紫薯', '整齐', '开始', '以为', '这人', '我们', '好像')

# Number of top-ranked words printed to stdout.
TOP_N = 8


def clean_text(text: str, punctuation: str = PUNCTUATION) -> str:
    """Return *text* with every punctuation character and all whitespace removed."""
    # One translate pass replaces the original chain of per-character
    # str.replace() calls; the result is identical.
    stripped = text.translate(str.maketrans('', '', punctuation))
    # split()/join also removes whitespace that is not listed in
    # *punctuation* (tabs, carriage returns, ...).
    return ''.join(stripped.split())


def parse_stopwords(raw: str) -> Set[str]:
    """Parse the contents of a stop-word file into a set of words.

    The file is read as one stop word per line; surrounding whitespace,
    blank lines and a leading BOM are ignored.

    The original script used ``list(raw)``, which yields single
    *characters*; because single-character tokens are skipped anyway,
    that made the stop-word filter a no-op.
    """
    lines = raw.replace('\ufeff', '').splitlines()
    return {line.strip() for line in lines if line.strip()}


def segment(text: str, userdict_path: Optional[str] = None) -> List[str]:
    """Split *text* into a list of words with jieba.

    Args:
        text: The cleaned text to segment.
        userdict_path: Optional path of a jieba user dictionary to load
            before segmenting.
    """
    # Third-party dependency, imported here so that the pure helpers in
    # this module can be used without jieba being installed.
    import jieba

    if userdict_path is not None:
        jieba.load_userdict(userdict_path)
    return jieba.lcut(text)


def count_words(
    words: Iterable[str],
    stopwords: Iterable[str],
    remove_words: Iterable[str] = REMOVE_WORDS,
) -> List[Tuple[str, int]]:
    """Count word occurrences, skipping unwanted tokens.

    A token is ignored when it is exactly one character long, or when it
    appears in *stopwords* or *remove_words*.

    Returns:
        ``(word, count)`` pairs sorted by descending count; words with
        equal counts keep their order of first appearance.
    """
    # A single set gives O(1) membership tests inside the loop.
    excluded = set(stopwords) | set(remove_words)
    counts = Counter(
        word for word in words
        if len(word) != 1 and word not in excluded
    )
    return counts.most_common()


def format_rows(pairs: Iterable[Tuple[str, int]]) -> List[str]:
    """Render ``(word, count)`` pairs as the lines written to the output file."""
    return ['{:<8}{:>2}\n'.format(word, count) for word, count in pairs]


def main() -> None:
    """Run the whole pipeline on the configured files."""
    # Read the text file.
    with open(TEXT_PATH, 'r', encoding='UTF-8') as src:
        txt = clean_text(src.read())

    words = segment(txt, USERDICT_PATH)

    with open(STOPWORDS_PATH, 'r', encoding='UTF-8') as src:
        stopwords = parse_stopwords(src.read())

    words_list = count_words(words, stopwords)

    ranking8_dict = dict(words_list[:TOP_N])
    print(ranking8_dict)

    # Save the resulting keywords as a text file.
    with open(OUTPUT_PATH, 'w', encoding='UTF-8') as out:
        out.writelines(format_rows(words_list))


if __name__ == '__main__':
    main()