#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Danmu (bullet-comment) cleaning and keyword extraction."""
import re
import jieba
from collections import Counter

# Patterns compiled once at import time instead of being looked up
# per-comment inside the cleaning loop.
_HTML_TAG = re.compile(r'<[^>]+>')
_ONLY_SYMBOLS = re.compile(r'[\W_]+')
_ONLY_ALNUM = re.compile(r'[A-Za-z0-9]+')
_URL_LIKE = re.compile(r'(http|www|\.com|\.cn|\.jpg|\.png|\.gif|\.svg)', re.I)
_LONG_ALNUM_RUN = re.compile(r'[A-Za-z0-9]{4,}')
# Whitelist: CJK ideographs, ASCII alphanumerics, common CJK/ASCII
# punctuation, and whitespace. Anything else (control chars, emoji mixed
# into text, mojibake) causes the whole comment to be dropped.
_ALLOWED_CHARS = re.compile(r'^[\u4e00-\u9fa5A-Za-z0-9,。!?、,.!?()【】“”‘’\s]+$')
_HAS_CJK = re.compile(r'[\u4e00-\u9fa5]')


class DataProcessor:
    """高质量弹幕清洗与关键词提取 (danmu cleaning and keyword extraction)."""

    def __init__(self):
        print("🧠 数据处理器初始化完成")

    def clean_danmu(self, danmu_list):
        """Clean raw danmu strings, dropping noise, spam and mojibake.

        Args:
            danmu_list: iterable of raw danmu strings.

        Returns:
            list[str]: surviving danmu, stripped of HTML tags and outer
            whitespace, in input order.
        """
        print(f"🧹 清洗弹幕数据,共 {len(danmu_list)} 条")
        cleaned = []
        for dm in danmu_list:
            # Remove HTML tags *before* the length check so that e.g.
            # "<b>好</b>" — a single character once tags are gone — is
            # filtered by the min-length rule too; stripping afterwards
            # also removes whitespace the tag removal may have exposed.
            dm = _HTML_TAG.sub('', dm).strip()
            # Empty or too short.
            if len(dm) < 2:
                continue
            # Pure symbols / emoji codes (e.g. "🔥🔥🔥").
            if _ONLY_SYMBOLS.fullmatch(dm):
                continue
            # Pure ASCII letters/digits (e.g. "2333333").
            if _ONLY_ALNUM.fullmatch(dm):
                continue
            # Looks like a URL or a file name.
            if _URL_LIKE.search(dm):
                continue
            # Gibberish: a run of 4+ consecutive letters/digits anywhere.
            if _LONG_ALNUM_RUN.search(dm):
                continue
            # One character repeated many times, e.g. "哈哈哈哈哈哈".
            if len(set(dm)) == 1 and len(dm) > 3:
                continue
            # Any character outside the whitelist (fullmatch is the exact
            # form of the original `re.match(r'^...$')` intent).
            if not _ALLOWED_CHARS.fullmatch(dm):
                continue
            cleaned.append(dm)
        print(f"✅ 清洗后剩余有效弹幕 {len(cleaned)} 条")
        return cleaned

    def extract_keywords(self, danmu_list):
        """Segment danmu with jieba and count keyword frequencies.

        Args:
            danmu_list: cleaned danmu strings.

        Returns:
            collections.Counter mapping keyword -> frequency. Only tokens
            longer than one character containing at least one CJK
            character are counted.
        """
        print("🔍 开始提取关键词")
        words = jieba.cut(" ".join(danmu_list))
        keywords = [w for w in words if len(w) > 1 and _HAS_CJK.search(w)]
        freq = Counter(keywords)
        print(f"📊 共提取 {len(freq)} 个关键词")
        return freq


if __name__ == "__main__":
    processor = DataProcessor()
    test_data = [
        "asbhfhbsdfhbsdfusdf",
        "哈哈哈哈哈哈哈哈",
        "www.bilibili.com",
        "大语言模型真强",
        "AI应用太方便了",
        "2333333",
        "i0.hdslb.com/xxxx.png",
        "🔥🔥🔥",
        "代码生成好厉害",
        "",
    ]
    cleaned = processor.clean_danmu(test_data)
    print("清洗后弹幕:", cleaned)
    counter = processor.extract_keywords(cleaned)
    print(counter.most_common(10))