parent
3cafaaeece
commit
fc82f3774e
@ -1 +1,83 @@
# Data Analysis Module

import jieba
import pandas as pd
from collections import Counter
import re
import logging
from typing import List, Dict, Optional, Tuple
import os


class DataAnalyzer:
    def __init__(self, stop_words_path: Optional[str] = None):
        self.stop_words = set()
        if stop_words_path and os.path.exists(stop_words_path):
            with open(stop_words_path, 'r', encoding='utf-8') as f:
                # Skip blank lines so they are not added as stop words
                self.stop_words = set(line.strip() for line in f if line.strip())

        # Add default stop words to supplement any user-provided list:
        # common Chinese function words plus danmaku filler ('666', laughter,
        # 'check-in', 'first', 'front row', etc.)
        default_stop_words = {'的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '这个', '那个', '啊', '吧', '呢', '吗', '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明'}
        self.stop_words.update(default_stop_words)

    def clean_text(self, text: str) -> str:
        """
        Clean text by removing special characters and extra spaces.
        """
        if not text:
            return ""
        # Replace anything that is not Chinese, English, or a digit with a space
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
        # Collapse runs of whitespace so the extra spaces are actually removed
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

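    # Illustration (hypothetical danmaku string, not from the source):
    #   clean_text("主播太强了!!!awsl~666") -> "主播太强了 awsl 666"
    # Punctuation becomes single spaces; Chinese characters, letters, and
    # digits survive.
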
    def segment_and_count(self, danmaku_list: List[str], top_n: int = 100) -> Tuple[List[Tuple[str, int]], List[str]]:
        """
        Segment danmaku list and count word frequencies.

        Returns:
            - top_words: List of (word, count) tuples
            - all_words: List of all valid words (for wordcloud)
        """
        all_words = []
        for text in danmaku_list:
            cleaned = self.clean_text(text)
            if not cleaned:
                continue

            words = jieba.cut(cleaned)
            for word in words:
                word = word.strip()
                # Drop single characters and stop words
                if len(word) > 1 and word not in self.stop_words:
                    all_words.append(word)

        counter = Counter(all_words)
        return counter.most_common(top_n), all_words

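    # Illustration (hypothetical input; assumes jieba keeps ASCII runs such as
    # 'awsl' intact, and note '666' is filtered as a default stop word):
    #   analyzer.segment_and_count(['awsl 666', 'awsl'], top_n=5)
    #   -> ([('awsl', 2)], ['awsl', 'awsl'])
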
    def get_top_danmaku(self, danmaku_list: List[str], top_n: int = 8) -> List[Tuple[str, int]]:
        """
        Get the most frequent full danmaku sentences.
        """
        # Count exact matches of the original content; no cleaning is applied,
        # so only verbatim duplicates are grouped together
        counter = Counter(danmaku_list)
        return counter.most_common(top_n)

    def export_to_excel(self, videos: List[Dict], top_danmaku: List[Tuple[str, int]], top_words: List[Tuple[str, int]], output_path: str):
        """
        Export analysis results to Excel.
        """
        # os.path.dirname() returns '' for a bare filename; guard before makedirs
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Sheet 1: Video Info
            if videos:
                df_videos = pd.DataFrame(videos)
                df_videos.to_excel(writer, sheet_name='Videos', index=False)

            # Sheet 2: Top Danmaku
            if top_danmaku:
                df_danmaku = pd.DataFrame(top_danmaku, columns=['Danmaku', 'Count'])
                df_danmaku.to_excel(writer, sheet_name='Top Danmaku', index=False)

            # Sheet 3: Top Words
            if top_words:
                df_words = pd.DataFrame(top_words, columns=['Word', 'Count'])
                df_words.to_excel(writer, sheet_name='Top Words', index=False)

        logging.info(f"Data exported to {output_path}")
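

# A minimal usage sketch (hypothetical danmaku and output path, not from the
# source; jieba segmentation may vary with its dictionary, and openpyxl must
# be installed for the Excel export):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    analyzer = DataAnalyzer()
    danmaku = ['主播太强了', '主播太强了', '哈哈哈', '666', 'awsl']
    top_words, all_words = analyzer.segment_and_count(danmaku, top_n=10)
    top_danmaku = analyzer.get_top_danmaku(danmaku, top_n=3)
    analyzer.export_to_excel([], top_danmaku, top_words, 'output/analysis.xlsx')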