You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

84 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import jieba
from collections import Counter
class DataProcessor:
    """Clean danmu (bullet-comment) text and extract keyword frequencies.

    The class holds no per-instance state. The compiled patterns below are
    class attributes so they are built once instead of once per comment.
    """

    # Anything shaped like an HTML/XML tag.
    _TAG_RE = re.compile(r'<[^>]+>')
    # Text made only of non-word characters / underscores (symbols, emoji).
    _SYMBOLS_ONLY_RE = re.compile(r'[\W_]+')
    # Text made only of ASCII letters and digits.
    _ASCII_ALNUM_ONLY_RE = re.compile(r'[A-Za-z0-9]+')
    # Fragments that suggest a URL or a media file name.
    _URL_OR_FILE_RE = re.compile(
        r'(http|www|\.com|\.cn|\.jpg|\.png|\.gif|\.svg)', re.I
    )
    # A run of 4 or more ASCII letters/digits, treated as gibberish.
    _LONG_ASCII_RUN_RE = re.compile(r'[A-Za-z0-9]{4,}')
    # Whitelist of characters a kept comment may contain.
    _ALLOWED_TEXT_RE = re.compile(
        r'[\u4e00-\u9fa5A-Za-z0-9、,.!?()【】“”‘’\s]+'
    )
    # At least one CJK ideograph in the U+4E00..U+9FA5 range.
    _HAS_CJK_RE = re.compile(r'[\u4e00-\u9fa5]')

    def __init__(self):
        """Announce that the processor is ready; no state is stored."""
        print("🧠 数据处理器初始化完成")

    def clean_danmu(self, danmu_list):
        """Return the comments that survive every noise filter, in order.

        Each entry has its markup tags removed and surrounding whitespace
        trimmed, and is then dropped when it:

        * is not a string, or is shorter than 2 characters after cleaning;
        * consists only of symbols/underscores, or only of ASCII
          letters/digits;
        * contains a URL-like or image-file-like fragment;
        * contains a run of 4 or more ASCII letters/digits;
        * is a single character repeated more than 3 times;
        * contains any character outside the allowed whitelist.

        Args:
            danmu_list: Sized collection of raw comment strings.

        Returns:
            A new list with the cleaned comments that passed all filters.
        """
        print(f"🧹 清洗弹幕数据,共 {len(danmu_list)}")
        cleaned = []
        for dm in danmu_list:
            # Non-string entries (e.g. None) cannot be cleaned; skip them
            # instead of failing on .strip().
            if not isinstance(dm, str):
                continue
            # Remove tags *before* measuring length, and trim afterwards,
            # so a payload such as "<b>x</b>" is judged by its visible text.
            dm = self._TAG_RE.sub('', dm).strip()
            # Empty or too short to carry meaning.
            if len(dm) < 2:
                continue
            # Pure symbols or emoji.
            if self._SYMBOLS_ONLY_RE.fullmatch(dm):
                continue
            # Pure ASCII letters or digits.
            if self._ASCII_ALNUM_ONLY_RE.fullmatch(dm):
                continue
            # Looks like a URL or a file name.
            if self._URL_OR_FILE_RE.search(dm):
                continue
            # Contains a long ASCII letter/digit run (4 or more).
            if self._LONG_ASCII_RUN_RE.search(dm):
                continue
            # One character repeated more than 3 times.
            if len(dm) > 3 and len(set(dm)) == 1:
                continue
            # Any character outside the whitelist rejects the whole comment.
            if not self._ALLOWED_TEXT_RE.fullmatch(dm):
                continue
            cleaned.append(dm)
        print(f"✅ 清洗后剩余有效弹幕 {len(cleaned)}")
        return cleaned

    def extract_keywords(self, danmu_list):
        """Segment the comments with jieba and count the resulting words.

        A word is counted only when it is longer than one character and
        contains at least one CJK ideograph. No stop-word list is applied.

        Args:
            danmu_list: Iterable of comment strings; they are joined with
                single spaces before segmentation.

        Returns:
            A ``Counter`` mapping each kept word to its occurrence count.
        """
        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list)
        freq = Counter(
            word
            for word in jieba.cut(all_text)
            if len(word) > 1 and self._HAS_CJK_RE.search(word)
        )
        print(f"📊 共提取 {len(freq)} 个关键词")
        return freq
if __name__ == "__main__":
    # Manual smoke run: push a handful of noisy and valid comments through
    # the cleaning stage, then show the ten most frequent keywords.
    dp = DataProcessor()
    samples = [
        "asbhfhbsdfhbsdfusdf",
        "哈哈哈哈哈哈哈哈",
        "www.bilibili.com",
        "大语言模型真强",
        "AI应用太方便了",
        "2333333",
        "i0.hdslb.com/xxxx.png",
        "🔥🔥🔥",
        "代码生成好厉害",
        "<svg path...>",
    ]
    kept = dp.clean_danmu(samples)
    print("清洗后弹幕:", kept)
    keyword_freq = dp.extract_keywords(kept)
    top_ten = keyword_freq.most_common(10)
    print(top_ten)