You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
102301535/data_processor.py

142 lines
5.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# scripts/data_processor.py
import pandas as pd
import re
from collections import Counter
import os
class DataProcessor:
    """Tokenize danmu (bullet-comment) text and summarize LLM-related usage.

    The processor splits each comment into tokens while keeping a fixed
    vocabulary of LLM-related terms intact, tags each comment with the
    application categories whose keywords it contains, and produces
    category rankings and word-frequency tables that can be exported to
    an Excel workbook.
    """

    # Runs of word characters; punctuation and whitespace act as separators.
    _WORD_PATTERN = re.compile(r'[\w\u4e00-\u9fff]+')

    # Tokens made up solely of ASCII letters/digits (e.g. "666", "GPT").
    _ASCII_ALNUM_PATTERN = re.compile(r'^[0-9a-zA-Z]+$')

    # Column order of the frame returned by ``process_danmu``.
    _PROCESSED_COLUMNS = ['bvid', 'original_danmu', 'words', 'applications', 'word_count']

    # Application category -> keywords whose presence in a comment selects it.
    APP_KEYWORDS = {
        '编程开发': ['代码', '编程', '开发', '程序员', 'debug', '自动补全', '代码生成', '编程助手'],
        '内容创作': ['写作', '创作', '文案', '文章', '内容', '自媒体', '营销', '创意写作'],
        '教育培训': ['教育', '学习', '教学', '培训', '老师', '辅导', '答疑', '个性化教学'],
        '医疗健康': ['医疗', '健康', '诊断', '医生', '医院', '病历', '药物', '医疗辅助'],
        '商业办公': ['办公', '商业', '企业', '工作', '效率', '自动化', '决策', '客户服务'],
        '智能客服': ['客服', '助手', '咨询', '问答', '帮助', '服务', '在线', '智能问答'],
        '翻译理解': ['翻译', '多语言', '理解', '语义', '跨语言', '交流', '机器翻译'],
        '创意设计': ['设计', '创意', '艺术', '绘画', '灵感', '创作', 'AI绘画', '艺术创作'],
    }

    def __init__(self):
        # LLM-related vocabulary that the tokenizer must keep as whole tokens.
        self.llm_terms = [
            '大语言模型', '大模型', 'LLM', 'ChatGPT', 'GPT', '文心一言', '通义千问',
            '代码生成', '文本摘要', '智能客服', '内容创作', '机器翻译', '提示工程',
            '多模态', 'AI绘画', '智能助手', '本地部署', '开源模型', '商业化',
            '深度学习', '自然语言处理', 'Transformer', '预训练模型'
        ]

    @staticmethod
    def _as_text(value) -> str:
        """Return *value* as a string, mapping None/NaN to the empty string."""
        if isinstance(value, str):
            return value
        if value is None:
            return ''
        try:
            if pd.isna(value):
                return ''
        except (TypeError, ValueError):
            # Non-scalar values have no single "missing" answer; stringify them.
            pass
        return str(value)

    def _build_term_pattern(self):
        """Compile an alternation of ``self.llm_terms``, longest term first.

        Returns None when there are no terms. The pattern has one capturing
        group so that ``re.split`` also returns the matched terms.
        """
        terms = sorted({term for term in self.llm_terms if term},
                       key=lambda term: (-len(term), term))
        if not terms:
            return None
        return re.compile('(' + '|'.join(re.escape(term) for term in terms) + ')')

    def simple_tokenize(self, text: str) -> list:
        """Split *text* into tokens; a lightweight substitute for jieba.

        Terms from ``self.llm_terms`` are emitted as single tokens. The text
        between them is split on anything that is not a word character.

        Args:
            text: The comment to tokenize. None/NaN is treated as empty text.

        Returns:
            The tokens in the order they appear in *text*.
        """
        text = self._as_text(text)
        pattern = self._build_term_pattern()
        if pattern is None:
            return self._WORD_PATTERN.findall(text)

        tokens = []
        # All terms are matched in one left-to-right pass so that a term which
        # contains another (e.g. 'ChatGPT' contains 'GPT') is never broken up
        # by the shorter one. With a capturing group, re.split yields the
        # in-between text at even indexes and the matched terms at odd ones.
        for index, piece in enumerate(pattern.split(text)):
            if index % 2:
                tokens.append(piece)
            else:
                tokens.extend(self._WORD_PATTERN.findall(piece))
        return tokens

    def load_data(self, filepath: str) -> pd.DataFrame:
        """Read the CSV at *filepath* (decoded as utf-8-sig) into a DataFrame."""
        return pd.read_csv(filepath, encoding='utf-8-sig')

    def extract_llm_applications(self, text: str) -> list:
        """Return the application categories whose keywords occur in *text*.

        Args:
            text: The comment to inspect. None/NaN is treated as empty text.

        Returns:
            Category names in ``APP_KEYWORDS`` order; a category is included
            when at least one of its keywords is a substring of *text*.
        """
        text = self._as_text(text)
        return [
            category
            for category, keywords in self.APP_KEYWORDS.items()
            if any(keyword in text for keyword in keywords)
        ]

    def process_danmu(self, df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize and categorize every comment in *df*.

        Args:
            df: Frame with a ``bvid`` column and a ``danmu`` column.

        Returns:
            A frame with columns ``bvid``, ``original_danmu``, ``words``,
            ``applications`` and ``word_count``, one row per input row. The
            columns are present even when *df* has no rows.
        """
        records = []
        for bvid, danmu in zip(df['bvid'], df['danmu']):
            # Empty CSV cells are loaded as NaN; analyze them as empty text
            # instead of failing on a non-string value.
            text = self._as_text(danmu)
            words = self.simple_tokenize(text)
            records.append({
                'bvid': bvid,
                'original_danmu': danmu,
                'words': words,
                'applications': self.extract_llm_applications(text),
                'word_count': len(words),
            })
        # Naming the columns keeps downstream lookups such as
        # df['applications'] valid when there are no records.
        return pd.DataFrame(records, columns=self._PROCESSED_COLUMNS)

    def get_top_applications(self, df: pd.DataFrame, top_n: int = 8) -> pd.DataFrame:
        """Rank application categories by how many comments mention them.

        Args:
            df: Frame with an ``applications`` column of category lists.
            top_n: Maximum number of categories to return.

        Returns:
            A frame with columns ``应用领域`` (category) and ``出现次数``
            (count), most frequent first.
        """
        app_counter = Counter(app for apps in df['applications'] for app in apps)
        return pd.DataFrame(app_counter.most_common(top_n), columns=['应用领域', '出现次数'])

    def get_word_frequency(self, df: pd.DataFrame, top_n: int = 50) -> pd.DataFrame:
        """Count token frequencies across all comments.

        Single-character tokens and tokens consisting only of ASCII letters
        and digits are excluded; the latter also removes English terms such
        as 'GPT' from the statistics.

        Args:
            df: Frame with a ``words`` column of token lists.
            top_n: Maximum number of words to return.

        Returns:
            A frame with columns ``词语`` (word) and ``频次`` (frequency),
            most frequent first.
        """
        word_counter = Counter(
            word
            for words in df['words']
            for word in words
            if len(word) > 1 and not self._ASCII_ALNUM_PATTERN.match(word)
        )
        return pd.DataFrame(word_counter.most_common(top_n), columns=['词语', '频次'])

    def save_to_excel(self, df: pd.DataFrame, top_apps: pd.DataFrame, word_freq: pd.DataFrame,
                      output_path: str = 'data/processed/llm_analysis.xlsx'):
        """Write the three result frames to one Excel workbook.

        Args:
            df: Processed comments, written to the sheet ``弹幕数据``.
            top_apps: Category ranking, written to the sheet ``应用领域排名``.
            word_freq: Word frequencies, written to the sheet ``词频统计``.
            output_path: Destination workbook; its parent directory is
                created if it does not exist.
        """
        directory = os.path.dirname(output_path)
        # os.makedirs('') raises, so skip it for a bare file name.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='弹幕数据', index=False)
            top_apps.to_excel(writer, sheet_name='应用领域排名', index=False)
            word_freq.to_excel(writer, sheet_name='词频统计', index=False)
def main():
    """Run the whole pipeline: load raw danmu, analyze it, export to Excel.

    Reads ``data/raw/danmu_raw.csv``, prints the category ranking and the
    ten most frequent words, and saves all results through
    ``DataProcessor.save_to_excel``.

    Returns:
        The tuple ``(processed_df, top_apps, word_freq)``.
    """
    pipeline = DataProcessor()

    # Load the raw comments and report how many rows were read.
    raw_frame = pipeline.load_data('data/raw/danmu_raw.csv')
    row_total = len(raw_frame)
    print(f"加载了 {row_total} 条弹幕数据")

    # Tokenize and categorize every comment.
    processed = pipeline.process_danmu(raw_frame)

    # Rank the application categories (top 8) and show the ranking.
    ranking = pipeline.get_top_applications(processed, 8)
    print("\n应用领域排名前8:", ranking, sep="\n")

    # Compute the 50 most frequent words and show the first ten.
    frequencies = pipeline.get_word_frequency(processed, 50)
    print("\n词频统计前10:", frequencies.head(10), sep="\n")

    # Persist all three tables to the Excel workbook.
    pipeline.save_to_excel(processed, ranking, frequencies)
    print("\n数据已保存到 data/processed/llm_analysis.xlsx")

    results = (processed, ranking, frequencies)
    return results
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()