# 102301618/processor.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import jieba
from collections import Counter

class DataProcessor:
    """Clean danmu (bullet-comment) text and count keyword frequencies."""

    # Patterns are compiled once here instead of being looked up on every
    # iteration of the cleaning loop.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _SYMBOLS_ONLY_RE = re.compile(r'[\W_]+')
    _ALNUM_ONLY_RE = re.compile(r'[A-Za-z0-9]+')
    _URL_OR_FILE_RE = re.compile(
        r'(http|www|\.com|\.cn|\.jpg|\.png|\.gif|\.svg)', re.I
    )
    _ALNUM_RUN_RE = re.compile(r'[A-Za-z0-9]{4,}')
    _ALLOWED_CHARS_RE = re.compile(
        r'^[\u4e00-\u9fa5A-Za-z0-9，。！？、,.!?（）【】“”‘’\s]+$'
    )
    _CHINESE_RE = re.compile(r'[\u4e00-\u9fa5]')

    def __init__(self):
        """Announce that the processor is ready; no state is stored."""
        print("🧠 数据处理器初始化完成")

    def clean_danmu(self, danmu_list):
        """Return the danmu strings that survive all noise filters.

        Each item has HTML tags removed and surrounding whitespace trimmed,
        and is then dropped when it is:

        * not a string, or shorter than two characters after tag removal;
        * made only of symbols/underscores, or only of ASCII letters/digits;
        * URL-like or file-name-like (``http``, ``www``, ``.com``, ``.png`` ...);
        * carrying a run of four or more ASCII letters/digits;
        * one character repeated more than three times;
        * containing any character outside CJK ideographs (U+4E00-U+9FA5),
          ASCII letters/digits, common punctuation and whitespace.

        Args:
            danmu_list: A sized collection of raw danmu items.

        Returns:
            A list of cleaned danmu strings in their original order.
        """
        print(f"🧹 清洗弹幕数据，共 {len(danmu_list)} 条")

        cleaned = []
        for dm in danmu_list:
            # Non-string items (e.g. None from a failed parse) cannot be
            # cleaned; skip them instead of crashing on .strip().
            if not isinstance(dm, str):
                continue

            # Strip tags BEFORE measuring length, otherwise "<b>x</b>" would
            # slip through as a one-character danmu.
            dm = self._HTML_TAG_RE.sub('', dm).strip()

            # Empty or too short to be meaningful.
            if len(dm) < 2:
                continue

            # Only symbols / emoji / underscores.
            if self._SYMBOLS_ONLY_RE.fullmatch(dm):
                continue

            # Only ASCII letters or digits.
            if self._ALNUM_ONLY_RE.fullmatch(dm):
                continue

            # Looks like a URL or a file name.
            if self._URL_OR_FILE_RE.search(dm):
                continue

            # A run of four or more ASCII letters/digits is treated as noise.
            if self._ALNUM_RUN_RE.search(dm):
                continue

            # A single character repeated more than three times.
            if len(set(dm)) == 1 and len(dm) > 3:
                continue

            # Anything outside the allowed character set (control characters,
            # emoji, other scripts, stray symbols) disqualifies the line.
            if not self._ALLOWED_CHARS_RE.match(dm):
                continue

            cleaned.append(dm)

        print(f"✅ 清洗后剩余有效弹幕 {len(cleaned)} 条")
        return cleaned

    def extract_keywords(self, danmu_list):
        """Segment the danmu with jieba and count the resulting keywords.

        Args:
            danmu_list: An iterable of danmu strings.

        Returns:
            A ``collections.Counter`` mapping each keyword to its frequency.
        """
        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list)
        words = jieba.cut(all_text)

        # Keep tokens longer than one character that contain at least one
        # CJK ideograph. No stop-word list is applied.
        keywords = [
            w for w in words
            if len(w) > 1 and self._CHINESE_RE.search(w)
        ]

        freq = Counter(keywords)
        print(f"📊 共提取 {len(freq)} 个关键词")
        return freq


if __name__ == "__main__":
    # Manual smoke run: push a small mixed sample through cleaning and
    # keyword extraction, printing the result of each stage.
    dp = DataProcessor()
    samples = [
        "asbhfhbsdfhbsdfusdf",
        "哈哈哈哈哈哈哈哈",
        "www.bilibili.com",
        "大语言模型真强",
        "AI应用太方便了",
        "2333333",
        "i0.hdslb.com/xxxx.png",
        "🔥🔥🔥",
        "代码生成好厉害",
        "<svg path...>",
    ]
    kept = dp.clean_danmu(samples)
    print("清洗后弹幕：", kept)
    keyword_freq = dp.extract_keywords(kept)
    top_ten = keyword_freq.most_common(10)
    print(top_ten)