You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
36 lines
1.3 KiB
36 lines
1.3 KiB
# -*- coding: utf-8 -*-
|
|
import jieba
|
|
import pandas as pd
|
|
from collections import Counter
|
|
|
|
class DanmakuStatistic:
    """Keyword-frequency statistics over a text file of danmaku comments.

    The input file is read once at construction time, one comment per line.
    ``count_top8`` segments every comment with ``jieba`` and returns the eight
    most frequent words; ``export_to_excel`` writes that table to
    ``excel_path`` through pandas.
    """

    # Words dropped before counting. Kept as an immutable class-level default
    # so each instance gets its own mutable copy in ``self.stop_words``.
    DEFAULT_STOP_WORDS = frozenset({
        "已三连", "求资料", "打卡", "不错", "很好", "牛逼", "卧槽",
        "学习", "分享", "感谢", "点赞", "三连", "教程", "B站", "老师",
    })

    def __init__(self, danmu_path, excel_path, *, extra_stop_words=None):
        """Load the comments and set up the stop-word set.

        Args:
            danmu_path: Path of the UTF-8 text file holding one comment per
                line. Blank / whitespace-only lines are skipped.
            excel_path: Destination path used by ``export_to_excel``.
            extra_stop_words: Optional iterable of additional words (strings)
                to exclude, merged with ``DEFAULT_STOP_WORDS``.

        Raises:
            OSError: If ``danmu_path`` cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        self.danmu_path = danmu_path
        self.excel_path = excel_path

        # Read the danmaku data. "utf-8-sig" decodes plain UTF-8 unchanged but
        # also drops a leading byte-order mark; with plain "utf-8" the BOM
        # survives as "\ufeff", which str.strip() does not remove, so it would
        # pollute the first comment (or create a bogus entry).
        with open(danmu_path, "r", encoding="utf-8-sig") as f:
            stripped_lines = (line.strip() for line in f)
            self.danmu_list = [text for text in stripped_lines if text]

        # Custom stop words (extend via ``extra_stop_words`` or by mutating
        # ``self.stop_words`` after construction).
        self.stop_words = set(self.DEFAULT_STOP_WORDS)
        if extra_stop_words is not None:
            self.stop_words.update(extra_stop_words)

    def count_top8(self):
        """Return the 8 most frequent words as ``[(word, count), ...]``.

        Each comment is segmented with ``jieba.cut``; tokens of length 1 and
        tokens present in ``self.stop_words`` are ignored. Fewer than eight
        pairs are returned when fewer distinct words remain.
        """
        counts = Counter()
        for danmu in self.danmu_list:
            # Segment and filter; feeding the Counter directly avoids building
            # one large intermediate list of every token.
            counts.update(
                word
                for word in jieba.cut(danmu)
                if len(word) > 1 and word not in self.stop_words
            )
        return counts.most_common(8)

    def export_to_excel(self):
        """Write the top-8 table to ``self.excel_path`` and return it.

        The sheet has two columns, "关键词" (keyword) and "词频" (frequency),
        and no index column. A confirmation message is printed afterwards.

        Returns:
            The same list of ``(word, count)`` pairs produced by
            ``count_top8``.
        """
        top8 = self.count_top8()
        df = pd.DataFrame(top8, columns=["关键词", "词频"])
        # NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl for
        # .xlsx) to be installed for this call to succeed — confirm in the
        # deployment environment.
        df.to_excel(self.excel_path, index=False)
        print(f"统计数据已保存至{self.excel_path}")
        return top8