You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

74 lines
2.4 KiB

import re
from collections import Counter
from typing import List, Dict

import pandas as pd

import jieba
import jieba.analyse
class DataProcessor:
    """Danmaku (bullet-comment) text-processing helper.

    Responsibilities: text cleaning, Chinese word segmentation via jieba,
    word-frequency statistics, and export of results to an Excel workbook.
    """

    def __init__(self):
        # Register LLM-related vocabulary so jieba keeps these terms as
        # single tokens instead of splitting them into sub-words.
        for term in ('大语言模型', 'LLM', '大模型', 'AI模型',
                     'ChatGPT', '文心一言', '通义千问'):
            jieba.add_word(term)

    def clean_text(self, text: str) -> str:
        """Normalize raw text for segmentation.

        Removes everything except word characters and CJK ideographs
        (symbols and emoji become spaces), collapses whitespace runs,
        and strips the ends. Non-string input (e.g. NaN coming out of a
        pandas column) yields "".
        """
        if not isinstance(text, str):
            return ""
        # Keep \w and the CJK Unified Ideographs range; replace the rest
        # with spaces so adjacent words do not fuse together.
        text = re.sub(r'[^\w\u4e00-\u9fa5]', ' ', text)
        # Collapse any run of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_keywords(self, texts: List[str], top_k: int = 50) -> List[str]:
        """Return the top_k TF-IDF keywords over all texts combined.

        Texts are cleaned first, then joined into one corpus for
        jieba.analyse.extract_tags.
        """
        all_text = ' '.join(self.clean_text(text) for text in texts)
        return jieba.analyse.extract_tags(all_text, topK=top_k, withWeight=False)

    def count_danmaku_frequency(self, danmaku_list: List[str]) -> pd.DataFrame:
        """Count word frequencies across a list of danmaku strings.

        Returns a DataFrame with columns ['词语', '频次'] sorted by
        frequency, descending.
        """
        counter: Counter = Counter()
        for danmaku in danmaku_list:
            cleaned = self.clean_text(danmaku)
            if cleaned:
                # Count only tokens longer than one character, skipping
                # particles, single letters and lone digits.
                counter.update(w for w in jieba.lcut(cleaned) if len(w) > 1)
        df = pd.DataFrame(list(counter.items()), columns=['词语', '频次'])
        return df.sort_values('频次', ascending=False)

    def save_to_excel(self, dataframes: Dict[str, pd.DataFrame], filename: str):
        """Write each DataFrame to its own sheet of an .xlsx workbook.

        Args:
            dataframes: mapping of sheet name -> DataFrame.
            filename: target .xlsx path (requires the openpyxl engine).
        """
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            for sheet_name, df in dataframes.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)
        # BUG FIX: the success message previously printed a literal
        # placeholder instead of interpolating the written path.
        print(f"数据已保存到: {filename}")
def _main() -> None:
    """Smoke test: construct a DataProcessor and report that the module loaded."""
    processor = DataProcessor()
    print("数据处理模块初始化成功")


if __name__ == "__main__":
    _main()