You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
from collections import Counter

import jieba
|
|
|
|
|
|
|
|
|
|
# Word-frequency extraction for a word cloud: read a scraped text file,
# segment it with jieba, drop noise tokens, and save the ranked counts.

# Input / output locations (hard-coded for the author's machine).
TEXT_PATH = r'C:\Users\86158\爬虫作业\ai吧.txt'
USERDICT_PATH = r'C:\Users\86158\爬虫作业\userdict.txt'
STOPWORDS_PATH = r'C:\Users\86158\爬虫作业\stopwords.txt'
OUTPUT_PATH = r'C:\Users\86158\爬虫作业\ciyuntu.txt'

# Every character in this string is deleted from the text before segmentation.
NOISE_CHARS = ': , ; 。 、 ? ———— ‘’ “” () ! # 《》\n\ufeff'

# Meaningless high-frequency words that are filtered out of the ranking.
REMOVE_WORDS = ['哈哈','可以','紫薯','整齐','开始','以为','这人','我们','好像']

# Number of top-ranked words echoed to stdout.
TOP_N = 8


def clean_text(raw: str) -> str:
    """Return *raw* with every NOISE_CHARS character and all whitespace removed."""
    # One translate pass deletes each listed character, which is what the
    # original chain of per-character ``replace`` calls did.
    without_noise = raw.translate(str.maketrans('', '', NOISE_CHARS))
    # split() with no argument splits on any whitespace run, so joining the
    # pieces drops tabs, full-width spaces, carriage returns, etc. as well.
    return ''.join(without_noise.split())


def parse_stopwords(raw: str) -> set:
    """Parse the contents of a stop-word file (one entry per line) into a set.

    A leading byte-order mark, surrounding whitespace and blank lines are
    ignored.
    """
    # NOTE: the previous code used ``list(raw)``, which yields single
    # characters. Single-character tokens are discarded before the stop-word
    # check, so no token could ever match and the filter did nothing.
    lines = raw.lstrip('\ufeff').splitlines()
    return {entry.strip() for entry in lines if entry.strip()}


def count_words(words, stopwords, remove_words=()) -> list:
    """Count the tokens in *words* and rank them by frequency.

    Tokens shorter than two characters, tokens in *stopwords* and tokens in
    *remove_words* are skipped.

    Returns a list of ``(word, count)`` pairs sorted by descending count;
    words with equal counts keep the order in which they first appeared.
    """
    # A set gives O(1) membership tests inside the counting loop.
    excluded = set(stopwords) | set(remove_words)
    tally = Counter(
        word for word in words
        if len(word) > 1 and word not in excluded
    )
    # most_common() is a stable descending sort on the count.
    return tally.most_common()


def format_counts(ranked) -> list:
    """Render ``(word, count)`` pairs as the lines written to the output file."""
    return ['{:<8}{:>2}\n'.format(word, count) for word, count in ranked]


def main():
    """Run the whole pipeline: read, clean, segment, count, print, save."""
    # Read the text file.
    with open(TEXT_PATH, 'r', encoding='UTF-8') as source:
        txt = clean_text(source.read())

    # The user dictionary has to be loaded before segmentation so that the
    # custom entries are used when cutting the text.
    jieba.load_userdict(USERDICT_PATH)
    words = jieba.lcut(txt)

    with open(STOPWORDS_PATH, 'r', encoding='UTF-8') as source:
        stopwords = parse_stopwords(source.read())

    words_list = count_words(words, stopwords, REMOVE_WORDS)

    # Echo the most frequent words as a dict, as the script always has.
    ranking8_dict = dict(words_list[:TOP_N])
    print(ranking8_dict)

    # Save the resulting keywords to a text file.
    with open(OUTPUT_PATH, 'w', encoding='UTF-8') as sink:
        sink.writelines(format_counts(words_list))


if __name__ == '__main__':
    main()
|
|
|
|
|
|