"""Count word frequencies in a danmaku (bullet-comment) dump and export to Excel.

Pipeline:
    1. Read the raw danmaku text from ``总弹幕.txt``.
    2. Segment it into tokens with ``jieba.lcut``.
    3. Drop every token that occurs in the punctuation/stop-word file.
    4. Count the remaining tokens.
    5. Write a two-column sheet (``词``, ``词频``) to an ``.xlsx`` file.
"""
from collections import Counter
from collections.abc import Iterable

import jieba
import openpyxl  # noqa: F401 -- not referenced directly; pandas uses it to write .xlsx
import pandas

# Input file holding the raw danmaku text (UTF-8).
DANMAKU_PATH = "总弹幕.txt"
# Input file holding the punctuation / stop-word table (UTF-8).
# NOTE(review): the extension is ``.text``, not ``.txt`` -- kept exactly as in
# the original script; confirm it matches the real file name on disk.
STOPWORDS_PATH = '标点符号表.text'
# Output workbook.
OUTPUT_PATH = '词频统计(去标点符号).xlsx'


def read_text(path: str) -> str:
    """Return the full contents of the UTF-8 text file at *path*.

    The file is opened in a ``with`` block so the handle is closed even if
    reading raises.
    """
    with open(path, encoding="utf-8") as handle:
        return handle.read()


def count_words(tokens: Iterable[str], stopwords: str) -> Counter:
    """Count the tokens that do not occur in *stopwords*.

    Args:
        tokens: Segmented words, in document order.
        stopwords: Raw text of the stop-word file.

    Returns:
        A ``Counter`` mapping each kept token to its number of occurrences,
        with keys in first-seen order.
    """
    # NOTE(review): ``stopwords`` is a plain string, so ``in`` is a substring
    # test -- a token is dropped whenever it appears anywhere inside the
    # stop-word text (including whitespace such as "\n"). This is the original
    # behaviour and is preserved; switching to per-entry set membership needs
    # the stop-word file layout to be confirmed first.
    return Counter(token for token in tokens if token not in stopwords)


def main() -> None:
    """Run the read -> segment -> filter -> count -> export pipeline."""
    text = read_text(DANMAKU_PATH)
    tokens = jieba.lcut(text)
    stopwords = read_text(STOPWORDS_PATH)

    frequencies = count_words(tokens, stopwords)

    # Same columns, order and default integer index as the original sheet.
    table = pandas.DataFrame(
        {
            "词": list(frequencies.keys()),
            "词频": list(frequencies.values()),
        }
    )
    table.to_excel(OUTPUT_PATH)


if __name__ == "__main__":
    main()