|
|
# scripts/data_processor.py
|
|
|
import pandas as pd
|
|
|
import re
|
|
|
from collections import Counter
|
|
|
import os
|
|
|
|
|
|
class DataProcessor:
    """Tokenize danmu (bullet-comment) text and summarize LLM-related usage.

    The processor splits each danmu into rough tokens, tags it with the
    application categories whose keywords it contains, and aggregates the
    results into category rankings and word-frequency tables.
    """

    # Default LLM-related vocabulary that the tokenizer keeps as whole tokens.
    DEFAULT_LLM_TERMS = (
        '大语言模型', '大模型', 'LLM', 'ChatGPT', 'GPT', '文心一言', '通义千问',
        '代码生成', '文本摘要', '智能客服', '内容创作', '机器翻译', '提示工程',
        '多模态', 'AI绘画', '智能助手', '本地部署', '开源模型', '商业化',
        '深度学习', '自然语言处理', 'Transformer', '预训练模型',
    )

    # Application category -> keywords; a text containing any keyword (plain
    # substring match) is tagged with that category.
    APP_KEYWORDS = {
        '编程开发': ['代码', '编程', '开发', '程序员', 'debug', '自动补全', '代码生成', '编程助手'],
        '内容创作': ['写作', '创作', '文案', '文章', '内容', '自媒体', '营销', '创意写作'],
        '教育培训': ['教育', '学习', '教学', '培训', '老师', '辅导', '答疑', '个性化教学'],
        '医疗健康': ['医疗', '健康', '诊断', '医生', '医院', '病历', '药物', '医疗辅助'],
        '商业办公': ['办公', '商业', '企业', '工作', '效率', '自动化', '决策', '客户服务'],
        '智能客服': ['客服', '助手', '咨询', '问答', '帮助', '服务', '在线', '智能问答'],
        '翻译理解': ['翻译', '多语言', '理解', '语义', '跨语言', '交流', '机器翻译'],
        '创意设计': ['设计', '创意', '艺术', '绘画', '灵感', '创作', 'AI绘画', '艺术创作'],
    }

    # Column order of the frame returned by process_danmu. Fixing it here
    # keeps the columns present even when the input has no rows.
    PROCESSED_COLUMNS = ['bvid', 'original_danmu', 'words', 'applications', 'word_count']

    # A token is a maximal run of word characters / CJK ideographs.
    _TOKEN_RE = re.compile(r'[\w\u4e00-\u9fff]+')
    # Tokens made only of ASCII letters and digits.
    _ASCII_ALNUM_RE = re.compile(r'^[0-9a-zA-Z]+$')

    def __init__(self):
        """Create a processor with the default LLM vocabulary."""
        # Kept as a mutable per-instance list so callers can extend it.
        self.llm_terms = list(self.DEFAULT_LLM_TERMS)

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a cell value to text; missing values (None/NaN) become ''."""
        if isinstance(value, str):
            return value
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def _term_pattern(self):
        """Return a regex matching any LLM term, or None if there are none.

        Terms are ordered longest first so that, at a given position, a longer
        term such as 'ChatGPT' wins over a term it contains such as 'GPT'.
        """
        terms = sorted(
            {term for term in self.llm_terms if term},
            key=lambda term: (-len(term), term),
        )
        if not terms:
            # An empty alternation would match between every character.
            return None
        return re.compile('|'.join(re.escape(term) for term in terms))

    def simple_tokenize(self, text: str) -> list:
        """Split text into rough tokens (a lightweight stand-in for jieba).

        Known LLM terms are first isolated with surrounding spaces in a single
        pass, then the text is split on anything that is not a word character.
        Doing the isolation in one pass prevents a short term from breaking up
        a longer term that contains it.

        Args:
            text: The text to tokenize. Missing values (None/NaN) yield no
                tokens; other non-string values are converted with ``str``.

        Returns:
            The tokens in order of appearance.
        """
        text = self._as_text(text)
        pattern = self._term_pattern()
        if pattern is not None:
            text = pattern.sub(r' \g<0> ', text)
        return self._TOKEN_RE.findall(text)

    def load_data(self, filepath: str) -> pd.DataFrame:
        """Read a CSV file into a DataFrame.

        Args:
            filepath: Path of the CSV file; it is decoded as ``utf-8-sig``.

        Returns:
            The parsed DataFrame.
        """
        return pd.read_csv(filepath, encoding='utf-8-sig')

    def extract_llm_applications(self, text: str) -> list:
        """Return the application categories whose keywords occur in text.

        Args:
            text: The text to inspect. Missing values (None/NaN) match nothing.

        Returns:
            Category names in the order they are declared in ``APP_KEYWORDS``.
        """
        text = self._as_text(text)
        return [
            category
            for category, keywords in self.APP_KEYWORDS.items()
            if any(keyword in text for keyword in keywords)
        ]

    def process_danmu(self, df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize and categorize every danmu row.

        Args:
            df: Frame with at least the columns ``bvid`` and ``danmu``.

        Returns:
            A frame with the columns listed in ``PROCESSED_COLUMNS``, one row
            per input row. An empty input yields an empty frame that still has
            those columns.

        Raises:
            KeyError: If ``df`` lacks the ``bvid`` or ``danmu`` column.
        """
        records = []
        for bvid, danmu in zip(df['bvid'], df['danmu']):
            # Empty CSV cells arrive as NaN; treat them as empty text.
            text = self._as_text(danmu)
            words = self.simple_tokenize(text)
            records.append({
                'bvid': bvid,
                'original_danmu': danmu,
                'words': words,
                'applications': self.extract_llm_applications(text),
                'word_count': len(words),
            })
        return pd.DataFrame(records, columns=self.PROCESSED_COLUMNS)

    def get_top_applications(self, df: pd.DataFrame, top_n: int = 8) -> pd.DataFrame:
        """Rank application categories by how many danmu mention them.

        Args:
            df: Frame with an ``applications`` column of category lists.
            top_n: Maximum number of categories to return.

        Returns:
            A frame with the columns '应用领域' and '出现次数', most frequent first.
        """
        counts = Counter(app for apps in df['applications'] for app in apps)
        return pd.DataFrame(counts.most_common(top_n), columns=['应用领域', '出现次数'])

    def get_word_frequency(self, df: pd.DataFrame, top_n: int = 50) -> pd.DataFrame:
        """Count the most frequent tokens across all danmu.

        Single-character tokens and tokens consisting only of ASCII letters
        and digits are excluded. Note that this also excludes ASCII-only
        vocabulary terms such as 'GPT'.

        Args:
            df: Frame with a ``words`` column of token lists.
            top_n: Maximum number of tokens to return.

        Returns:
            A frame with the columns '词语' and '频次', most frequent first.
        """
        is_ascii_alnum = self._ASCII_ALNUM_RE.match
        counts = Counter(
            word
            for words in df['words']
            for word in words
            if len(word) > 1 and not is_ascii_alnum(word)
        )
        return pd.DataFrame(counts.most_common(top_n), columns=['词语', '频次'])

    def save_to_excel(self, df: pd.DataFrame, top_apps: pd.DataFrame, word_freq: pd.DataFrame,
                      output_path: str = 'data/processed/llm_analysis.xlsx'):
        """Write the three result tables to one Excel workbook.

        Args:
            df: Processed danmu rows, written to the sheet '弹幕数据'.
            top_apps: Category ranking, written to the sheet '应用领域排名'.
            word_freq: Word frequencies, written to the sheet '词频统计'.
            output_path: Destination workbook; its parent directory is created
                if it does not exist.
        """
        # A bare file name has no directory part; fall back to the current one.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='弹幕数据', index=False)
            top_apps.to_excel(writer, sheet_name='应用领域排名', index=False)
            word_freq.to_excel(writer, sheet_name='词频统计', index=False)
|
|
|
|
|
|
def main():
    """Run the danmu pipeline end to end and report progress on stdout.

    The raw CSV is loaded and processed, the application ranking and word
    frequencies are computed and printed, and all three tables are saved to
    the Excel workbook.

    Returns:
        A ``(processed_df, top_apps, word_freq)`` tuple of DataFrames.
    """
    pipeline = DataProcessor()

    # Load the raw danmu.
    raw = pipeline.load_data('data/raw/danmu_raw.csv')
    print(f"加载了 {len(raw)} 条弹幕数据")

    # Tokenize and categorize each row.
    processed = pipeline.process_danmu(raw)

    # Application category ranking.
    ranking = pipeline.get_top_applications(processed, 8)
    print("\n应用领域排名前8:")
    print(ranking)

    # Word frequencies; only the first ten rows are echoed.
    frequencies = pipeline.get_word_frequency(processed, 50)
    print(f"\n词频统计前10:")
    print(frequencies.head(10))

    # Persist everything to the workbook.
    pipeline.save_to_excel(processed, ranking, frequencies)
    print("\n数据已保存到 data/processed/llm_analysis.xlsx")

    return processed, ranking, frequencies
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()