# -*- coding: utf-8 -*-
import jieba
import pandas as pd
from collections import Counter


class DanmakuStatistic:
    """Word-frequency statistics over a text file of danmaku comments.

    The input file is read once, at construction time, one comment per
    line. Words are obtained with ``jieba.cut`` and ranked by how often
    they occur.
    """

    def __init__(self, danmu_path, excel_path):
        """Load the danmaku file and set up the stop-word set.

        Args:
            danmu_path: Path of a UTF-8 text file holding one danmaku per
                line. Lines that are empty after stripping are ignored.
            excel_path: Path that ``export_to_excel`` writes the report to.

        Raises:
            OSError: If ``danmu_path`` cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        self.danmu_path = danmu_path
        self.excel_path = excel_path

        # Read the danmaku data, dropping surrounding whitespace and
        # blank lines.
        with open(danmu_path, "r", encoding="utf-8") as f:
            self.danmu_list = [line.strip() for line in f if line.strip()]

        # Custom stop words (extend as needed). A segmented word is
        # discarded when it is exactly equal to one of these entries.
        self.stop_words = {
            "已三连", "求资料", "打卡", "不错", "很好", "牛逼", "卧槽",
            "学习", "分享", "感谢", "点赞", "三连", "教程", "B站", "老师"
        }

    def count_top8(self, top_n=8):
        """Return the most frequent words across all loaded danmaku.

        Each danmaku is segmented with ``jieba.cut``; words of length 1
        and words contained in ``self.stop_words`` are skipped.

        Args:
            top_n: How many of the most frequent words to return. Defaults
                to 8, which matches the method name; ``None`` returns
                every counted word.

        Returns:
            A list of ``(word, count)`` tuples ordered from most to least
            frequent, as produced by ``Counter.most_common``.
        """
        # Feed the counter lazily so the full word list is never held in
        # memory; the counting order is the same as iterating the danmaku
        # one by one.
        counts = Counter(
            word
            for danmu in self.danmu_list
            for word in jieba.cut(danmu)
            if len(word) > 1 and word not in self.stop_words
        )
        return counts.most_common(top_n)

    def export_to_excel(self, top_n=8):
        """Write the word-frequency ranking to ``self.excel_path``.

        Args:
            top_n: Number of top words to export; forwarded to
                ``count_top8``. Defaults to 8.

        Returns:
            The same list of ``(word, count)`` tuples that was written to
            the Excel file.
        """
        top_words = self.count_top8(top_n)
        df = pd.DataFrame(top_words, columns=["关键词", "词频"])
        # index=False keeps the DataFrame's row index out of the sheet.
        df.to_excel(self.excel_path, index=False)
        print(f"统计数据已保存至{self.excel_path}")
        return top_words