From fc82f3774e764cafa01b800edfc72c66f810f736 Mon Sep 17 00:00:00 2001
From: ccicnce113424
Date: Sat, 20 Dec 2025 17:28:31 +0800
Subject: [PATCH] feat: implement analysis module

---
 src/analysis.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/src/analysis.py b/src/analysis.py
index f357ca2..1fc775a 100644
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -1 +1,86 @@
-# Data Analysis Module
+import jieba
+import pandas as pd
+from collections import Counter
+import re
+import logging
+from typing import List, Dict, Optional, Tuple
+import os
+
+class DataAnalyzer:
+    def __init__(self, stop_words_path: Optional[str] = None):
+        self.stop_words = set()
+        if stop_words_path and os.path.exists(stop_words_path):
+            with open(stop_words_path, 'r', encoding='utf-8') as f:
+                self.stop_words = {line.strip() for line in f if line.strip()}
+
+        # Supplement any user-provided stop words with common defaults.
+        default_stop_words = {'的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '这个', '那个', '啊', '吧', '呢', '吗', '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明'}
+        self.stop_words.update(default_stop_words)
+
+    def clean_text(self, text: str) -> str:
+        """
+        Clean text by removing special characters and extra whitespace.
+        """
+        if not text:
+            return ""
+        # Keep only Chinese characters, English letters, and digits.
+        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
+        return text.strip()
+
+    def segment_and_count(self, danmaku_list: List[str], top_n: int = 100) -> Tuple[List[Tuple[str, int]], List[str]]:
+        """
+        Segment each danmaku with jieba and count word frequencies.
+        Returns:
+        - top_words: list of (word, count) tuples for the top_n words
+        - all_words: list of all valid words (for the word cloud)
+        """
+        all_words = []
+        for text in danmaku_list:
+            cleaned = self.clean_text(text)
+            if not cleaned:
+                continue
+
+            words = jieba.cut(cleaned)
+            for word in words:
+                word = word.strip()
+                # Skip single-character tokens and stop words.
+                if len(word) > 1 and word not in self.stop_words:
+                    all_words.append(word)
+
+        counter = Counter(all_words)
+        return counter.most_common(top_n), all_words
+
+    def get_top_danmaku(self, danmaku_list: List[str], top_n: int = 8) -> List[Tuple[str, int]]:
+        """
+        Get the most frequent full danmaku sentences.
+        """
+        # Count exact matches of the original content so repeated memes
+        # keep their original wording (no cleaning applied here).
+        counter = Counter(danmaku_list)
+        return counter.most_common(top_n)
+
+    def export_to_excel(self, videos: List[Dict], top_danmaku: List[Tuple[str, int]], top_words: List[Tuple[str, int]], output_path: str):
+        """
+        Export analysis results to an Excel workbook with three sheets.
+        """
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
+        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+            # Sheet 1: video metadata
+            if videos:
+                df_videos = pd.DataFrame(videos)
+                df_videos.to_excel(writer, sheet_name='Videos', index=False)
+
+            # Sheet 2: most frequent full danmaku
+            if top_danmaku:
+                df_danmaku = pd.DataFrame(top_danmaku, columns=['Danmaku', 'Count'])
+                df_danmaku.to_excel(writer, sheet_name='Top Danmaku', index=False)
+
+            # Sheet 3: most frequent words
+            if top_words:
+                df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])
+                df_words.to_excel(writer, sheet_name='Top Words', index=False)
+
+        logging.info(f"Data exported to {output_path}")
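
Usage sketch: a minimal example of driving the new DataAnalyzer end to end. It assumes the module is importable as src.analysis; the sample danmaku strings, the video-metadata keys, and the output path are hypothetical placeholders, not values defined by this patch.

    from src.analysis import DataAnalyzer

    analyzer = DataAnalyzer()  # or DataAnalyzer('stop_words.txt') to load extra stop words

    # Hypothetical sample input; any list of scraped danmaku strings works.
    danmaku = ['前方高能', '前方高能', '哈哈哈哈', 'up主加油', '这个操作太秀了']

    top_words, all_words = analyzer.segment_and_count(danmaku, top_n=50)
    top_danmaku = analyzer.get_top_danmaku(danmaku, top_n=8)

    # Each flat dict becomes one row in the 'Videos' sheet; the keys here
    # are illustrative, not a schema required by the patch.
    videos = [{'title': 'demo video', 'danmaku_count': len(danmaku)}]
    analyzer.export_to_excel(videos, top_danmaku, top_words, 'output/analysis.xlsx')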
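The all_words return value is documented as word-cloud input. A sketch of that step, assuming the third-party wordcloud package is installed; the font path is a placeholder and must point at a real CJK-capable font or Chinese glyphs will render as boxes.

    from collections import Counter
    from wordcloud import WordCloud

    # Frequencies over the words that survived stop-word filtering.
    frequencies = Counter(all_words)

    # font_path is a placeholder; Chinese text needs a CJK-capable font.
    wc = WordCloud(font_path='/path/to/cjk_font.ttf', width=800, height=600,
                   background_color='white')
    wc.generate_from_frequencies(frequencies)
    wc.to_file('output/wordcloud.png')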