From 744db0e54308dcb04889deecaa58c62eedc9e6a6 Mon Sep 17 00:00:00 2001
From: palc3e6gq <1622356900@qq.com>
Date: Mon, 16 Sep 2024 22:29:11 +0800
Subject: [PATCH] ADD file via upload

---
 词频统计.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 词频统计.py

diff --git a/词频统计.py b/词频统计.py
new file mode 100644
index 0000000..b77d186
--- /dev/null
+++ b/词频统计.py
@@ -0,0 +1,29 @@
+"""Count word frequencies in a danmaku text dump and export them to Excel."""
+from collections import Counter
+
+import jieba
+import pandas
+import openpyxl  # not referenced directly; presumably pandas' .xlsx engine
+
+# 1. Read the danmaku text; `with` closes the file even if reading fails.
+with open("总弹幕.txt", encoding='utf-8') as f:
+    text = f.read()
+
+# 2. Segment the text into a list of tokens.
+text_list = jieba.lcut(text)
+
+# 3. Drop punctuation / common words listed in the stop-word table.
+# NOTE(review): `stopwords` is one string, so `in` below is a substring test
+# against the whole table, not an exact-entry match -- confirm this is intended.
+with open('标点符号表.text', encoding='utf-8') as f2:
+    stopwords = f2.read()
+text_clean = [word for word in text_list if word not in stopwords]
+
+# 4. Tally tokens; Counter keeps first-seen key order like the original dict.
+d_sict = Counter(text_clean)
+
+# 5-6. Write the tokens and their counts as two columns of an Excel workbook.
+key = list(d_sict.keys())
+value = list(d_sict.values())
+result_excel = pandas.DataFrame({"词": key, "词频": value})
+result_excel.to_excel('词频统计（去标点符号）.xlsx')
\ No newline at end of file