parent
3cafaaeece
commit
fc82f3774e
@ -1 +1,83 @@
# Data Analysis Module

import jieba
import pandas as pd
from collections import Counter
import re
import logging
from typing import List, Dict, Optional, Tuple
import os


class DataAnalyzer:
    def __init__(self, stop_words_path: Optional[str] = None):
        self.stop_words = set()
        if stop_words_path and os.path.exists(stop_words_path):
            with open(stop_words_path, 'r', encoding='utf-8') as f:
                # Skip blank lines so they are not added as stop words
                self.stop_words = set(line.strip() for line in f if line.strip())

        # Add default stop words to supplement any user-provided list:
        # common Chinese function words plus danmaku filler ('666', laughter,
        # 'check-in', 'first', 'front row', etc.)
        default_stop_words = {'的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '这个', '那个', '啊', '吧', '呢', '吗', '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明'}
        self.stop_words.update(default_stop_words)

    def clean_text(self, text: str) -> str:
        """
        Clean text by removing special characters and extra spaces.
        """
        if not text:
            return ""
        # Replace anything that is not Chinese, English, or a digit with a space
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
        # Collapse runs of whitespace so the extra spaces are actually removed
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

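    # Illustration (hypothetical danmaku string, not from the source):
    #   clean_text("主播太强了!!!awsl~666") -> "主播太强了 awsl 666"
    # Punctuation becomes single spaces; Chinese characters, letters, and
    # digits survive.
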
    def segment_and_count(self, danmaku_list: List[str], top_n: int = 100) -> Tuple[List[Tuple[str, int]], List[str]]:
        """
        Segment danmaku list and count word frequencies.

        Returns:
            - top_words: List of (word, count) tuples
            - all_words: List of all valid words (for wordcloud)
        """
        all_words = []
        for text in danmaku_list:
            cleaned = self.clean_text(text)
            if not cleaned:
                continue

            words = jieba.cut(cleaned)
            for word in words:
                word = word.strip()
                # Drop single characters and stop words
                if len(word) > 1 and word not in self.stop_words:
                    all_words.append(word)

        counter = Counter(all_words)
        return counter.most_common(top_n), all_words

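    # Illustration (hypothetical input; assumes jieba keeps ASCII runs such as
    # 'awsl' intact, and note '666' is filtered as a default stop word):
    #   analyzer.segment_and_count(['awsl 666', 'awsl'], top_n=5)
    #   -> ([('awsl', 2)], ['awsl', 'awsl'])
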
    def get_top_danmaku(self, danmaku_list: List[str], top_n: int = 8) -> List[Tuple[str, int]]:
        """
        Get the most frequent full danmaku sentences.
        """
        # Count exact matches of the original content; no cleaning is applied,
        # so only verbatim duplicates are grouped together
        counter = Counter(danmaku_list)
        return counter.most_common(top_n)

    def export_to_excel(self, videos: List[Dict], top_danmaku: List[Tuple[str, int]], top_words: List[Tuple[str, int]], output_path: str):
        """
        Export analysis results to Excel.
        """
        # os.path.dirname() returns '' for a bare filename; guard before makedirs
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Sheet 1: Video Info
            if videos:
                df_videos = pd.DataFrame(videos)
                df_videos.to_excel(writer, sheet_name='Videos', index=False)

            # Sheet 2: Top Danmaku
            if top_danmaku:
                df_danmaku = pd.DataFrame(top_danmaku, columns=['Danmaku', 'Count'])
                df_danmaku.to_excel(writer, sheet_name='Top Danmaku', index=False)

            # Sheet 3: Top Words
            if top_words:
                df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])
                df_words.to_excel(writer, sheet_name='Top Words', index=False)

        logging.info(f"Data exported to {output_path}")
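

# A minimal usage sketch (hypothetical danmaku and output path, not from the
# source; jieba segmentation may vary with its dictionary, and openpyxl must
# be installed for the Excel export):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    analyzer = DataAnalyzer()
    danmaku = ['主播太强了', '主播太强了', '哈哈哈', '666', 'awsl']
    top_words, all_words = analyzer.segment_and_count(danmaku, top_n=10)
    top_danmaku = analyzer.get_top_danmaku(danmaku, top_n=3)
    analyzer.export_to_excel([], top_danmaku, top_words, 'output/analysis.xlsx')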