# 102301535/data_processor.py

# scripts/data_processor.py
import pandas as pd
import re
from collections import Counter
import os

class DataProcessor:
    """Tokenize danmu (bullet-comment) text and summarize LLM-related content.

    The processor splits each comment into rough tokens, tags it with the
    application categories whose keywords it contains, and aggregates the
    results into category rankings and word-frequency tables.
    """

    # Application category -> keywords. A comment belongs to a category when
    # it contains any of the category's keywords as a substring.
    APP_KEYWORDS = {
        '编程开发': ['代码', '编程', '开发', '程序员', 'debug', '自动补全', '代码生成', '编程助手'],
        '内容创作': ['写作', '创作', '文案', '文章', '内容', '自媒体', '营销', '创意写作'],
        '教育培训': ['教育', '学习', '教学', '培训', '老师', '辅导', '答疑', '个性化教学'],
        '医疗健康': ['医疗', '健康', '诊断', '医生', '医院', '病历', '药物', '医疗辅助'],
        '商业办公': ['办公', '商业', '企业', '工作', '效率', '自动化', '决策', '客户服务'],
        '智能客服': ['客服', '助手', '咨询', '问答', '帮助', '服务', '在线', '智能问答'],
        '翻译理解': ['翻译', '多语言', '理解', '语义', '跨语言', '交流', '机器翻译'],
        '创意设计': ['设计', '创意', '艺术', '绘画', '灵感', '创作', 'AI绘画', '艺术创作'],
    }

    # Column order of the frame returned by process_danmu. Declared up front
    # so that an empty result still carries the expected columns.
    PROCESSED_COLUMNS = ['bvid', 'original_danmu', 'words', 'applications', 'word_count']

    # A token is a maximal run of word characters / CJK ideographs.
    _TOKEN_RE = re.compile(r'[\w\u4e00-\u9fff]+')

    # Tokens made up solely of ASCII letters and digits.
    _ASCII_ALNUM_RE = re.compile(r'^[0-9a-zA-Z]+$')

    def __init__(self):
        # Domain terms that must survive tokenization as single tokens.
        self.llm_terms = [
            '大语言模型', '大模型', 'LLM', 'ChatGPT', 'GPT', '文心一言', '通义千问',
            '代码生成', '文本摘要', '智能客服', '内容创作', '机器翻译', '提示工程',
            '多模态', 'AI绘画', '智能助手', '本地部署', '开源模型', '商业化',
            '深度学习', '自然语言处理', 'Transformer', '预训练模型'
        ]

    def _term_pattern(self):
        """Return a compiled alternation of ``self.llm_terms`` (or None if empty).

        The pattern is cached and rebuilt only when ``llm_terms`` changes.
        Terms are ordered longest-first so that a term containing another one
        (e.g. 'ChatGPT' vs 'GPT') is matched as a whole.
        """
        terms = tuple(self.llm_terms)
        cached = getattr(self, '_term_pattern_cache', None)
        if cached is None or cached[0] != terms:
            # Empty strings would match everywhere, so they are ignored.
            ordered = sorted((t for t in terms if t), key=len, reverse=True)
            pattern = re.compile('|'.join(re.escape(t) for t in ordered)) if ordered else None
            cached = (terms, pattern)
            self._term_pattern_cache = cached
        return cached[1]

    def simple_tokenize(self, text: str) -> list:
        """Split ``text`` into rough tokens (a lightweight stand-in for jieba).

        Known LLM terms are first isolated by surrounding them with spaces;
        the text is then split on anything that is not a word character.
        Text between separators is NOT segmented further, so a run of Chinese
        characters without punctuation stays a single token.

        Args:
            text: The comment text to tokenize.

        Returns:
            The list of tokens in order of appearance.
        """
        pattern = self._term_pattern()
        if pattern is not None:
            # A single pass avoids re-matching inside an already isolated
            # term; replacing term by term would split 'ChatGPT' into
            # 'Chat' + 'GPT' once the shorter term 'GPT' is processed.
            text = pattern.sub(r' \g<0> ', text)
        return self._TOKEN_RE.findall(text)

    def load_data(self, filepath: str) -> pd.DataFrame:
        """Read the raw danmu CSV at ``filepath`` (decoded as UTF-8 with optional BOM)."""
        return pd.read_csv(filepath, encoding='utf-8-sig')

    def extract_llm_applications(self, text: str) -> list:
        """Return the application categories whose keywords occur in ``text``.

        Matching is a plain, case-sensitive substring test against
        ``APP_KEYWORDS``.

        Args:
            text: The comment text to classify.

        Returns:
            Category names, in ``APP_KEYWORDS`` order, each at most once.
        """
        return [
            category
            for category, keywords in self.APP_KEYWORDS.items()
            if any(keyword in text for keyword in keywords)
        ]

    def process_danmu(self, df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize and classify every comment in ``df``.

        Rows whose ``danmu`` cell is missing (NaN/None, e.g. an empty CSV
        cell) are skipped; any other non-string value is converted with
        ``str()`` before processing.

        Args:
            df: Frame with at least the columns ``bvid`` and ``danmu``.

        Returns:
            A frame with the columns listed in ``PROCESSED_COLUMNS``; the
            columns are present even when no rows were produced.

        Raises:
            KeyError: If ``df`` lacks the ``bvid`` or ``danmu`` column.
        """
        records = []
        for bvid, danmu in zip(df['bvid'], df['danmu']):
            if pd.isna(danmu):
                # Nothing to analyze; the substring tests would fail on NaN.
                continue
            danmu = str(danmu)
            words = self.simple_tokenize(danmu)
            records.append({
                'bvid': bvid,
                'original_danmu': danmu,
                'words': words,
                'applications': self.extract_llm_applications(danmu),
                'word_count': len(words)
            })

        return pd.DataFrame(records, columns=self.PROCESSED_COLUMNS)

    def get_top_applications(self, df: pd.DataFrame, top_n: int = 8) -> pd.DataFrame:
        """Rank application categories by how many comments mention them.

        Args:
            df: Frame with an ``applications`` column holding lists of
                category names (as produced by ``process_danmu``).
            top_n: Maximum number of categories to return.

        Returns:
            A frame with the columns '应用领域' and '出现次数', sorted by
            descending count.
        """
        counts = Counter(app for apps in df['applications'] for app in apps)
        return pd.DataFrame(counts.most_common(top_n), columns=['应用领域', '出现次数'])

    def get_word_frequency(self, df: pd.DataFrame, top_n: int = 50) -> pd.DataFrame:
        """Count token frequencies across all comments.

        Single-character tokens and tokens consisting only of ASCII letters
        and digits are ignored.

        NOTE(review): the ASCII filter also drops terms such as 'ChatGPT' or
        'LLM' that simple_tokenize deliberately isolates - confirm this is
        intended.

        Args:
            df: Frame with a ``words`` column holding lists of tokens.
            top_n: Maximum number of words to return.

        Returns:
            A frame with the columns '词语' and '频次', sorted by descending
            frequency.
        """
        counts = Counter(
            word
            for words in df['words']
            for word in words
            if len(word) > 1 and not self._ASCII_ALNUM_RE.match(word)
        )
        return pd.DataFrame(counts.most_common(top_n), columns=['词语', '频次'])

    def save_to_excel(self, df: pd.DataFrame, top_apps: pd.DataFrame, word_freq: pd.DataFrame,
                      output_path: str = 'data/processed/llm_analysis.xlsx'):
        """Write the three result tables to one Excel workbook.

        Args:
            df: Processed comments, written to the sheet '弹幕数据'.
            top_apps: Category ranking, written to the sheet '应用领域排名'.
            word_freq: Word frequencies, written to the sheet '词频统计'.
            output_path: Destination workbook path; its parent directory is
                created when missing.
        """
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='弹幕数据', index=False)
            top_apps.to_excel(writer, sheet_name='应用领域排名', index=False)
            word_freq.to_excel(writer, sheet_name='词频统计', index=False)

def main():
    """Run the danmu analysis pipeline end to end.

    Loads the raw CSV, processes every comment, prints the category ranking
    and the ten most frequent words, and stores all tables in an Excel
    workbook.

    Returns:
        A tuple ``(processed_df, top_apps, word_freq)``.
    """
    dp = DataProcessor()

    # Load
    raw = dp.load_data('data/raw/danmu_raw.csv')
    print(f"加载了 {len(raw)} 条弹幕数据")

    # Process
    processed = dp.process_danmu(raw)

    # Category ranking (header and table on separate lines)
    ranking = dp.get_top_applications(processed, 8)
    print("\n应用领域排名前8:", ranking, sep="\n")

    # Word frequencies (only the first ten rows are shown)
    frequencies = dp.get_word_frequency(processed, 50)
    print("\n词频统计前10:", frequencies.head(10), sep="\n")

    # Persist
    dp.save_to_excel(processed, ranking, frequencies)
    print("\n数据已保存到 data/processed/llm_analysis.xlsx")

    return processed, ranking, frequencies

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()