feat: implement analysis module

main
ccicnce113424 1 week ago
parent 3cafaaeece
commit fc82f3774e

@@ -1 +1,83 @@
# Data Analysis Module
import jieba
import pandas as pd
from collections import Counter
import re
import logging
from typing import List, Dict, Tuple
import os
class DataAnalyzer:
    """Analyze danmaku (bullet-comment) text.

    Provides text cleaning, jieba-based word segmentation with stop-word
    filtering, frequency counting of words and full sentences, and export
    of the results to a multi-sheet Excel workbook.
    """

    def __init__(self, stop_words_path: str = None):
        """Initialize the analyzer's stop-word set.

        Args:
            stop_words_path: Optional path to a UTF-8 file with one stop
                word per line. Silently ignored if missing; a built-in
                default set is always merged in on top.
        """
        self.stop_words = set()
        if stop_words_path and os.path.exists(stop_words_path):
            with open(stop_words_path, 'r', encoding='utf-8') as f:
                # Skip blank lines so '' never enters the stop-word set.
                self.stop_words = {line.strip() for line in f if line.strip()}
        # Built-in defaults to supplement (or stand in for) the file-based
        # set. NOTE(review): the original literal contained many mojibake'd
        # entries that decoded to empty strings; they are dropped here — the
        # len(word) > 1 filter in segment_and_count excluded them anyway.
        # TODO(owner): restore the intended single-character Chinese stop
        # words from the original source if recoverable.
        default_stop_words = {
            '一个', '没有', '自己', '这个', '那个',
            '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明',
        }
        self.stop_words.update(default_stop_words)

    def clean_text(self, text: str) -> str:
        """Clean text by removing special characters.

        Every character that is not a CJK Unified Ideograph, an ASCII
        letter, or a digit is replaced with a space; the result is then
        stripped of leading/trailing whitespace (interior runs of spaces
        are preserved).

        Args:
            text: Raw text; None or empty input yields "".

        Returns:
            The cleaned string.
        """
        if not text:
            return ""
        # Keep only Chinese, English letters, and digits.
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
        return text.strip()

    def segment_and_count(self, danmaku_list: List[str], top_n: int = 100) -> Tuple[List[Tuple[str, int]], List[str]]:
        """Segment each danmaku with jieba and count word frequencies.

        Words of length <= 1 and stop words are discarded.

        Args:
            danmaku_list: Raw danmaku strings (cleaned internally).
            top_n: How many of the most frequent words to return.

        Returns:
            A pair of:
            - top_words: list of (word, count) tuples, most frequent first;
            - all_words: every retained word in order (for a word cloud).
        """
        all_words = []
        for text in danmaku_list:
            cleaned = self.clean_text(text)
            if not cleaned:
                continue
            for word in jieba.cut(cleaned):
                word = word.strip()
                # Single characters and stop words carry little signal.
                if len(word) > 1 and word not in self.stop_words:
                    all_words.append(word)
        counter = Counter(all_words)
        return counter.most_common(top_n), all_words

    def get_top_danmaku(self, danmaku_list: List[str], top_n: int = 8) -> List[Tuple[str, int]]:
        """Get the most frequent full danmaku sentences.

        Counts exact matches of the original strings — no cleaning is
        applied, so "666" and "666!" count separately.

        Args:
            danmaku_list: Raw danmaku strings.
            top_n: How many of the most frequent sentences to return.

        Returns:
            List of (danmaku, count) tuples, most frequent first.
        """
        counter = Counter(danmaku_list)
        return counter.most_common(top_n)

    def export_to_excel(self, videos: List[Dict], top_danmaku: List[Tuple[str, int]], top_words: List[Tuple[str, int]], output_path: str):
        """Export analysis results to an Excel workbook.

        Writes up to three sheets ('Videos', 'Top Danmaku', 'Top Words');
        empty inputs simply skip their sheet.

        Args:
            videos: Per-video metadata dicts (one row each).
            top_danmaku: (danmaku, count) pairs from get_top_danmaku.
            top_words: (word, count) pairs from segment_and_count.
            output_path: Destination .xlsx path; parent directories are
                created as needed.
        """
        out_dir = os.path.dirname(output_path)
        # Guard: os.makedirs('') raises FileNotFoundError when output_path
        # is a bare filename with no directory component.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Sheet 1: Video Info
            if videos:
                df_videos = pd.DataFrame(videos)
                df_videos.to_excel(writer, sheet_name='Videos', index=False)
            # Sheet 2: Top Danmaku
            if top_danmaku:
                df_danmaku = pd.DataFrame(top_danmaku, columns=['Danmaku', 'Count'])
                df_danmaku.to_excel(writer, sheet_name='Top Danmaku', index=False)
            # Sheet 3: Top Words
            if top_words:
                df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])
                df_words.to_excel(writer, sheet_name='Top Words', index=False)
        logging.info(f"Data exported to {output_path}")

Loading…
Cancel
Save