# scripts/data_processor.py
"""Process raw danmu (bullet-comment) text about large language models.

The module tokenizes each comment, tags it with the application areas whose
keywords it mentions, aggregates area and word frequencies, and writes the
results to an Excel workbook.
"""
import os
import re
from collections import Counter

import pandas as pd

# Runs of word characters; in Python 3 ``\w`` already covers CJK ideographs,
# the explicit range is kept from the original pattern.
_TOKEN_PATTERN = re.compile(r'[\w\u4e00-\u9fff]+')

# Tokens made only of ASCII letters/digits are excluded from word frequency.
_ASCII_ALNUM_PATTERN = re.compile(r'^[0-9a-zA-Z]+$')


class DataProcessor:
    """Tokenize danmu text and summarise LLM application areas and word counts."""

    # Application area -> keywords; an area is reported when any keyword is a
    # substring of the text (case-sensitive). Insertion order is the order in
    # which areas are reported.
    APP_KEYWORDS = {
        '编程开发': ['代码', '编程', '开发', '程序员', 'debug', '自动补全', '代码生成', '编程助手'],
        '内容创作': ['写作', '创作', '文案', '文章', '内容', '自媒体', '营销', '创意写作'],
        '教育培训': ['教育', '学习', '教学', '培训', '老师', '辅导', '答疑', '个性化教学'],
        '医疗健康': ['医疗', '健康', '诊断', '医生', '医院', '病历', '药物', '医疗辅助'],
        '商业办公': ['办公', '商业', '企业', '工作', '效率', '自动化', '决策', '客户服务'],
        '智能客服': ['客服', '助手', '咨询', '问答', '帮助', '服务', '在线', '智能问答'],
        '翻译理解': ['翻译', '多语言', '理解', '语义', '跨语言', '交流', '机器翻译'],
        '创意设计': ['设计', '创意', '艺术', '绘画', '灵感', '创作', 'AI绘画', '艺术创作'],
    }

    # Column layout of the frame returned by ``process_danmu``.
    PROCESSED_COLUMNS = ['bvid', 'original_danmu', 'words', 'applications', 'word_count']

    def __init__(self):
        # Domain terms that must survive tokenization as single tokens.
        self.llm_terms = [
            '大语言模型', '大模型', 'LLM', 'ChatGPT', 'GPT', '文心一言', '通义千问',
            '代码生成', '文本摘要', '智能客服', '内容创作', '机器翻译', '提示工程',
            '多模态', 'AI绘画', '智能助手', '本地部署', '开源模型', '商业化',
            '深度学习', '自然语言处理', 'Transformer', '预训练模型',
        ]
        # (terms tuple, compiled pattern) built lazily by ``_term_pattern``.
        self._term_pattern_cache = None

    def _term_pattern(self):
        """Return a compiled alternation of ``llm_terms``, or None if there are none."""
        terms = tuple(term for term in self.llm_terms if term)
        cache = getattr(self, '_term_pattern_cache', None)
        if cache is not None and cache[0] == terms:
            return cache[1]
        if terms:
            # Longest first so a term is preferred over any shorter term that
            # is its prefix.
            ordered = sorted(terms, key=len, reverse=True)
            pattern = re.compile('|'.join(re.escape(term) for term in ordered))
        else:
            # An empty alternation would match at every position.
            pattern = None
        self._term_pattern_cache = (terms, pattern)
        return pattern

    @staticmethod
    def _normalize_text(value) -> str:
        """Return ``value`` as text; missing values (None/NaN/NA) become ''."""
        if isinstance(value, str):
            return value
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def simple_tokenize(self, text: str) -> list:
        """Split ``text`` into tokens (a lightweight stand-in for jieba).

        Known LLM terms are padded with spaces so each becomes its own token,
        then the text is split into runs of word characters. The padding is
        done in one regex pass: replacing term by term would re-split a term
        that contains another one (``ChatGPT`` -> ``Chat`` + ``GPT``).

        Args:
            text: The comment text to tokenize.

        Returns:
            The tokens in order of appearance; ``[]`` for empty text.
        """
        pattern = self._term_pattern()
        if pattern is not None:
            text = pattern.sub(lambda match: f" {match.group(0)} ", text)
        return _TOKEN_PATTERN.findall(text)

    def load_data(self, filepath: str) -> pd.DataFrame:
        """Read the CSV at ``filepath`` (UTF-8, optional BOM) into a DataFrame."""
        return pd.read_csv(filepath, encoding='utf-8-sig')

    def extract_llm_applications(self, text: str) -> list:
        """Return the application areas whose keywords occur in ``text``.

        Args:
            text: The comment text to inspect.

        Returns:
            Area names in ``APP_KEYWORDS`` order; each area appears at most once.
        """
        return [
            category
            for category, keywords in self.APP_KEYWORDS.items()
            if any(keyword in text for keyword in keywords)
        ]

    def process_danmu(self, df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize and tag every comment in ``df``.

        Args:
            df: Frame with ``bvid`` and ``danmu`` columns.

        Returns:
            A frame with the columns in ``PROCESSED_COLUMNS``, one row per input
            row. Missing ``danmu`` cells are treated as empty text and other
            non-string cells are converted with ``str``; the normalised text is
            what is stored in ``original_danmu``.
        """
        records = []
        for bvid, raw in zip(df['bvid'], df['danmu']):
            danmu = self._normalize_text(raw)
            words = self.simple_tokenize(danmu)
            records.append({
                'bvid': bvid,
                'original_danmu': danmu,
                'words': words,
                'applications': self.extract_llm_applications(danmu),
                'word_count': len(words),
            })
        # Explicit columns keep the schema stable even when there are no rows.
        return pd.DataFrame(records, columns=self.PROCESSED_COLUMNS)

    def get_top_applications(self, df: pd.DataFrame, top_n: int = 8) -> pd.DataFrame:
        """Return the ``top_n`` most frequent application areas.

        Args:
            df: Frame whose ``applications`` column holds lists of area names.
            top_n: Maximum number of areas to return.

        Returns:
            A frame with columns ``应用领域`` and ``出现次数``, most frequent first.
        """
        app_counter = Counter(app for apps in df['applications'] for app in apps)
        return pd.DataFrame(app_counter.most_common(top_n), columns=['应用领域', '出现次数'])

    def get_word_frequency(self, df: pd.DataFrame, top_n: int = 50) -> pd.DataFrame:
        """Return the ``top_n`` most frequent words.

        Single-character tokens and tokens made only of ASCII letters/digits
        are ignored; note that this also drops terms such as ``GPT``.

        Args:
            df: Frame whose ``words`` column holds lists of tokens.
            top_n: Maximum number of words to return.

        Returns:
            A frame with columns ``词语`` and ``频次``, most frequent first.
        """
        word_counter = Counter(
            word
            for words in df['words']
            for word in words
            if len(word) > 1 and not _ASCII_ALNUM_PATTERN.match(word)
        )
        return pd.DataFrame(word_counter.most_common(top_n), columns=['词语', '频次'])

    def save_to_excel(self, df: pd.DataFrame, top_apps: pd.DataFrame, word_freq: pd.DataFrame,
                      output_path: str = 'data/processed/llm_analysis.xlsx'):
        """Write the three result frames to one Excel workbook.

        Args:
            df: Per-comment data, written to sheet ``弹幕数据``.
            top_apps: Area ranking, written to sheet ``应用领域排名``.
            word_freq: Word frequencies, written to sheet ``词频统计``.
            output_path: Destination workbook; its parent directory is created
                if it does not exist.
        """
        directory = os.path.dirname(output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='弹幕数据', index=False)
            top_apps.to_excel(writer, sheet_name='应用领域排名', index=False)
            word_freq.to_excel(writer, sheet_name='词频统计', index=False)


def main():
    """Run the pipeline: load raw danmu, process, print summaries, save to Excel.

    Returns:
        A tuple ``(processed_df, top_apps, word_freq)``.
    """
    processor = DataProcessor()

    # Load the raw comments.
    df = processor.load_data('data/raw/danmu_raw.csv')
    print(f"加载了 {len(df)} 条弹幕数据")

    # Tokenize and tag every comment.
    processed_df = processor.process_danmu(df)

    # Application-area ranking.
    top_apps = processor.get_top_applications(processed_df, 8)
    print("\n应用领域排名前8:")
    print(top_apps)

    # Word-frequency statistics.
    word_freq = processor.get_word_frequency(processed_df, 50)
    print("\n词频统计前10:")
    print(word_freq.head(10))

    # Persist everything to the workbook.
    processor.save_to_excel(processed_df, top_apps, word_freq)
    print("\n数据已保存到 data/processed/llm_analysis.xlsx")

    return processed_df, top_apps, word_freq


if __name__ == "__main__":
    main()