You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
32 lines
1023 B
32 lines
1023 B
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
import jieba
|
|
from collections import Counter
|
|
|
|
class DataProcessor:
    """Clean danmu (bullet-comment) strings and count the words they contain."""

    def __init__(self):
        """Print a notice that the processor has been initialised."""
        print("数据处理器初始化")

    def clean_danmu(self, danmu_list):
        """Strip each danmu and drop the ones with no usable text.

        Args:
            danmu_list: Sized collection of raw danmu strings.

        Returns:
            A new list, in input order, of the stripped strings that are
            longer than one character. Entries that are not ``str`` (for
            example ``None``) are skipped instead of raising.
        """
        print(f"清洗弹幕数据,共 {len(danmu_list)} 条")
        # Strip once per item; non-string entries cannot be stripped at all.
        stripped = (dm.strip() for dm in danmu_list if isinstance(dm, str))
        # Drop empty danmu and single-character noise.
        return [text for text in stripped if len(text) > 1]

    def extract_keywords(self, danmu_list):
        """Segment the danmu with jieba and count the resulting words.

        Args:
            danmu_list: Iterable of danmu strings; they are joined with
                single spaces before segmentation.

        Returns:
            A ``Counter`` mapping every kept token to its number of
            occurrences. Only tokens longer than one character are kept.
            No stop-word list is applied.
        """
        print("开始提取关键词")
        all_text = " ".join(danmu_list)
        words = jieba.cut(all_text)
        # Measure the stripped token so a whitespace-only token (such as a
        # "\r\n" pair) is never counted as a keyword.
        return Counter(word for word in words if len(word.strip()) > 1)
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run three sample danmu through the cleaning step
    # and print the result.
    processor = DataProcessor()
    test_data = ["大语言模型", "AI应用", "很实用"]
    cleaned = processor.clean_danmu(test_data)
    print(f"清洗后数据: {cleaned}")