You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
74 lines
2.4 KiB
74 lines
2.4 KiB
import re
from collections import Counter
from typing import List, Dict

import jieba
import jieba.analyse
import pandas as pd
|
|
|
|
class DataProcessor:
    """Text-processing helper: cleaning, word segmentation, frequency ranking.

    Wraps jieba (segmentation / keyword extraction) and pandas (tabulation and
    Excel export) behind a few small methods.
    """

    # Large-language-model related terms added to jieba's segmentation
    # dictionary when an instance is created.
    _CUSTOM_WORDS = (
        '大语言模型',
        'LLM',
        '大模型',
        'AI模型',
        'ChatGPT',
        '文心一言',
        '通义千问',
    )

    def __init__(self):
        """Register the custom vocabulary with jieba's dictionary."""
        for word in self._CUSTOM_WORDS:
            jieba.add_word(word)

    def clean_text(self, text: str) -> str:
        """Strip punctuation/symbols from *text* and normalize whitespace.

        Args:
            text: Raw text. Any value that is not a ``str`` is treated as
                missing.

        Returns:
            The text with every non-word character replaced by a space, runs
            of whitespace collapsed to a single space, and leading/trailing
            whitespace removed. Returns ``""`` when *text* is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Replace special characters and emoji with spaces. For str patterns
        # ``\w`` already covers CJK ideographs; the explicit range is kept
        # unchanged from the original pattern.
        text = re.sub(r'[^\w\u4e00-\u9fa5]', ' ', text)
        # Collapse the runs of whitespace introduced above.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_keywords(self, texts: List[str], top_k: int = 50) -> List[str]:
        """Extract keywords from a collection of texts.

        Every text is cleaned with :meth:`clean_text`, the results are joined
        into one space-separated document, and that document is passed to
        ``jieba.analyse.extract_tags``.

        Args:
            texts: Texts to analyse.
            top_k: Maximum number of keywords requested from jieba.

        Returns:
            The keywords returned by jieba, without weights.
        """
        all_text = ' '.join(self.clean_text(text) for text in texts)
        return jieba.analyse.extract_tags(all_text, topK=top_k, withWeight=False)

    def count_danmaku_frequency(self, danmaku_list: List[str]) -> pd.DataFrame:
        """Count how often each word occurs across a list of danmaku comments.

        Args:
            danmaku_list: Danmaku texts. Entries that are not strings, or that
                are empty after cleaning, are skipped.

        Returns:
            A DataFrame with columns ``'词语'`` (word) and ``'频次'``
            (frequency), sorted by frequency in descending order. Words with
            equal frequency keep the order in which they were first seen.
        """
        word_count = Counter()

        for danmaku in danmaku_list:
            cleaned = self.clean_text(danmaku)
            if not cleaned:
                continue
            # Only tokens longer than one character are counted.
            word_count.update(
                word for word in jieba.lcut(cleaned) if len(word) > 1
            )

        df = pd.DataFrame(list(word_count.items()), columns=['词语', '频次'])
        # mergesort is stable, so the order of equal-frequency words is well
        # defined instead of depending on the default unstable quicksort.
        return df.sort_values('频次', ascending=False, kind='mergesort')

    def save_to_excel(self, dataframes: Dict[str, pd.DataFrame], filename: str) -> None:
        """Write each DataFrame to its own sheet of one Excel workbook.

        Args:
            dataframes: Mapping of sheet name to the DataFrame written to that
                sheet (without the index column).
            filename: Path of the workbook to create; written with the
                ``openpyxl`` engine.

        Raises:
            ValueError: If *dataframes* is empty. A workbook needs at least
                one sheet, so this is rejected before any file is touched.
        """
        if not dataframes:
            raise ValueError("dataframes must contain at least one sheet")

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            for sheet_name, df in dataframes.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"数据已保存到: {filename}")
|
|
|
|
if __name__ == "__main__":
    # Smoke check when run as a script: constructing the processor registers
    # the custom jieba vocabulary, then a confirmation message is printed.
    processor = DataProcessor()
    print("数据处理模块初始化成功")