parent
fc3cb55973
commit
744db0e543
@ -0,0 +1,29 @@
|
||||
from collections import Counter

import jieba
import openpyxl
import pandas
|
||||
# Word-frequency pipeline for a danmaku (bullet-comment) text dump:
# read the text -> segment it with jieba -> drop punctuation -> count tokens
# -> write the counts to an Excel workbook.

# 1. Read the danmaku data. The context manager closes the handle even if
#    reading raises.
with open("总弹幕.txt", encoding='utf-8') as f:
    text = f.read()

# 2. Segment the text into a list of tokens.
text_list = jieba.lcut(text)

# 3. Remove punctuation (common tokens) listed in the punctuation table.
with open('标点符号表.text', encoding='utf-8') as f2:
    stopwords = f2.read()
# NOTE(review): `stopwords` is a single string, so `in` below is a substring
# test against the whole table file, not an exact match against individual
# entries. Kept as-is because the layout of the table file is not visible
# here; confirm before switching to a set of entries.
text_clean = [word for word in text_list if word not in stopwords]

# 4. Build the token -> frequency mapping. Counter is a dict subclass that
#    keeps first-seen order, so the row order matches a manual counting loop.
d_sict = Counter(text_clean)

# 5. Split the mapping into two parallel lists, one per Excel column.
key = list(d_sict.keys())
value = list(d_sict.values())

# 6. Write the result to an Excel workbook (the default index column is kept).
result_excel = pandas.DataFrame()
result_excel["词"] = key
result_excel["词频"] = value
result_excel.to_excel('词频统计(去标点符号).xlsx')
|
Loading…
Reference in new issue