You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

32 lines
1023 B

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import jieba
from collections import Counter
class DataProcessor:
    """Clean danmu (bullet-comment) text and count keywords found in it."""

    def __init__(self):
        """Announce on stdout that the processor has been created."""
        print("数据处理器初始化")

    def clean_danmu(self, danmu_list) -> list:
        """Return the usable danmu strings from *danmu_list*.

        Each entry is stripped of surrounding whitespace; entries whose
        stripped text is one character or shorter are dropped. Entries that
        are not strings (for example ``None``) are skipped rather than
        raising ``AttributeError``.

        Args:
            danmu_list: Sized collection of raw danmu entries.

        Returns:
            A new list with the stripped, surviving danmu in input order.
        """
        print(f"清洗弹幕数据,共 {len(danmu_list)}")
        # Strip each entry once and reuse the result for both the length
        # check and the output.
        stripped = (dm.strip() for dm in danmu_list if isinstance(dm, str))
        return [text for text in stripped if len(text) > 1]

    def extract_keywords(self, danmu_list) -> Counter:
        """Tokenize the danmu with jieba and count the resulting words.

        Only tokens longer than one character (ignoring surrounding
        whitespace) are counted. No stop-word list is applied.

        Args:
            danmu_list: Iterable of danmu strings.

        Returns:
            A ``Counter`` mapping each kept token to its frequency; empty
            when the input contains no non-whitespace text.
        """
        print("开始提取关键词")
        all_text = " ".join(danmu_list)
        if not all_text.strip():
            # Nothing to tokenize, so avoid invoking the tokenizer at all.
            return Counter()
        words = jieba.cut(all_text)
        # Measure the stripped token so whitespace-only tokens of length > 1
        # (jieba appears to emit "\r\n" as one token) are not counted.
        keywords = [word for word in words if len(word.strip()) > 1]
        return Counter(keywords)
if __name__ == "__main__":
    # Manual smoke test: build a processor, clean a small hard-coded sample,
    # and print the cleaned list.
    danmu_processor = DataProcessor()
    sample_danmu = [
        "大语言模型",
        "AI应用",
        "很实用",
    ]
    print(f"清洗后数据: {danmu_processor.clean_danmu(sample_danmu)}")