|
|
#!/usr/bin/env python3
|
|
|
# -*- coding: utf-8 -*-
|
|
|
import re
|
|
|
import jieba
|
|
|
from collections import Counter
|
|
|
|
|
|
class DataProcessor:
    """Clean danmu (bullet-comment) text and count the keywords it contains."""

    # Patterns are compiled once at class level so the per-comment loop in
    # clean_danmu does not have to re-specify them for every item.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _SYMBOLS_ONLY_RE = re.compile(r'[\W_]+')
    _ALNUM_ONLY_RE = re.compile(r'[A-Za-z0-9]+')
    _URL_LIKE_RE = re.compile(
        r'(http|www|\.com|\.cn|\.jpg|\.png|\.gif|\.svg)', re.I
    )
    _ALNUM_RUN_RE = re.compile(r'[A-Za-z0-9]{4,}')
    # Whitelist: Chinese characters, ASCII letters/digits, common Chinese and
    # ASCII punctuation, and whitespace. Anything else rejects the comment.
    _ALLOWED_TEXT_RE = re.compile(
        r'[\u4e00-\u9fa5A-Za-z0-9,。!?、,.!?()【】“”‘’\s]+'
    )
    _CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')

    def __init__(self):
        """Announce on stdout that the processor is ready."""
        print("🧠 数据处理器初始化完成")

    def _is_noise(self, text):
        """Return True when *text* matches any of the discard rules."""
        return bool(
            # Only symbols / punctuation / emoji.
            self._SYMBOLS_ONLY_RE.fullmatch(text)
            # Only ASCII letters and digits.
            or self._ALNUM_ONLY_RE.fullmatch(text)
            # Looks like a URL or an image/file name.
            or self._URL_LIKE_RE.search(text)
            # Contains a run of 4 or more ASCII letters/digits.
            or self._ALNUM_RUN_RE.search(text)
            # One character repeated more than 3 times (e.g. "哈哈哈哈").
            or (len(set(text)) == 1 and len(text) > 3)
            # Contains a character outside the whitelist.
            or not self._ALLOWED_TEXT_RE.fullmatch(text)
        )

    def clean_danmu(self, danmu_list):
        """Filter out invalid or garbled danmu and return the usable ones.

        Args:
            danmu_list: Iterable of comment strings. ``None`` is treated as
                empty and non-string entries are skipped.

        Returns:
            A list of cleaned comments (HTML tags removed, surrounding
            whitespace stripped) in their original order.
        """
        # Materialise once so len() works for any iterable, including None.
        items = list(danmu_list) if danmu_list is not None else []
        print(f"🧹 清洗弹幕数据,共 {len(items)} 条")

        cleaned = []
        for raw in items:
            if not isinstance(raw, str):
                continue

            # Remove HTML tags first, then strip: the length rule must apply
            # to the text that is actually kept, not to the raw markup.
            text = self._HTML_TAG_RE.sub('', raw).strip()

            # Empty or too short to be meaningful.
            if len(text) < 2:
                continue

            if self._is_noise(text):
                continue

            cleaned.append(text)

        print(f"✅ 清洗后剩余有效弹幕 {len(cleaned)} 条")
        return cleaned

    def extract_keywords(self, danmu_list, stopwords=None):
        """Segment the comments with jieba and count keyword frequencies.

        A token is kept when it is longer than one character, contains at
        least one Chinese character, and is not listed in *stopwords*.

        Args:
            danmu_list: Iterable of comment strings.
            stopwords: Optional collection of tokens to exclude. Defaults to
                ``None``, meaning no stop-word filtering.

        Returns:
            A ``collections.Counter`` mapping keyword to occurrence count.
        """
        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list or ())
        excluded = frozenset(stopwords) if stopwords else frozenset()

        # Skip the segmenter entirely when there is no text to segment.
        words = jieba.cut(all_text) if all_text.strip() else ()

        freq = Counter(
            w for w in words
            if len(w) > 1
            and w not in excluded
            and self._CHINESE_CHAR_RE.search(w)
        )
        print(f"📊 共提取 {len(freq)} 个关键词")
        return freq
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: push a mix of valid comments and typical noise
    # (random letters, repeated characters, URLs, emoji, markup) through
    # the cleaner, then print the ten most frequent keywords.
    processor = DataProcessor()

    samples = [
        "asbhfhbsdfhbsdfusdf",
        "哈哈哈哈哈哈哈哈",
        "www.bilibili.com",
        "大语言模型真强",
        "AI应用太方便了",
        "2333333",
        "i0.hdslb.com/xxxx.png",
        "🔥🔥🔥",
        "代码生成好厉害",
        "<svg path...>",
    ]

    valid_danmu = processor.clean_danmu(samples)
    print("清洗后弹幕:", valid_danmu)

    keyword_freq = processor.extract_keywords(valid_danmu)
    top_keywords = keyword_freq.most_common(10)
    print(top_keywords)
|