|
|
|
|
# -*- coding: utf-8 -*-
"""
processor.py

Danmu (bullet-comment) data cleaning and keyword extraction module.

- Filters invalid danmu: links, HTML tags, emotes, and symbol noise
- Reports the number of danmu before and after cleaning
- Keeps the interface used by the main program unchanged
"""
|
|
|
|
|
|
|
|
|
|
import re
from collections import Counter

import jieba
|
|
|
|
|
|
|
|
|
|
class DataProcessor:
    """Clean danmu (bullet-comment) text and count keyword frequencies.

    Attributes:
        stop_phrases: Lower-case substrings; a danmu containing any of them
            is discarded by ``clean_danmu``.
        stopwords: Tokens that ``extract_keywords`` never counts.
    """

    # Patterns are compiled once at class level so the per-danmu loops do not
    # repeat the pattern lookup on every iteration.
    _LINK_HINT_RE = re.compile(r"http|www|bilibili|\.com|\.cn", re.I)
    _NOISE_ONLY_RE = re.compile(r"[\d\W_]+")
    _DIGITS_RE = re.compile(r"\d+")

    def __init__(self):
        """Announce initialisation and set up the two filter vocabularies."""
        print("🧹 数据处理器初始化")

        # Filler / advertising phrases and URL fragments. All entries are
        # lower-case because clean_danmu compares against lower-cased text.
        self.stop_phrases = {
            "666", "哈哈", "哈哈哈", "前排", "来了", "好家伙", "可以的", "赞", "顶",
            "nb", "牛逼", "太强了", "强啊", "来了来了", "关注我", "收藏", "up",
            "bilibili", "http", "www", "com", "cn", "png", "jpg", "color", "i0", "hdslb",
        }

        # Common function words and pronouns excluded from keyword counts.
        self.stopwords = {
            "的", "了", "是", "我", "你", "他", "她", "它", "也", "和", "就", "都", "在",
            "啊", "吧", "吗", "呢", "哦", "呀", "这", "那", "一个", "不会", "真的",
        }

    def _clean_text(self, text: str) -> str:
        """Strip noise from a single danmu and return what is left.

        This helper rewrites the text instead of rejecting it. It is not
        called by ``clean_danmu``, which applies drop-style filtering.

        Args:
            text: Raw danmu; any non-string value yields ``""``.

        Returns:
            The sanitised text, or ``""`` when nothing meaningful remains
            (only digits/symbols, or fewer than two characters).
        """
        if not isinstance(text, str):
            return ""

        # 1. Remove URLs, host names and image-file fragments.
        text = re.sub(
            r'http\S+|www\S+|bilibili\S+|hdslb\S+|\.com|\.cn|\.net|png|jpg|jpeg|gif',
            '',
            text,
            flags=re.IGNORECASE,
        )

        # 2. Remove HTML tags and bracketed emotes such as [doge].
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'\[[^\]]+\]', '', text)

        # 3. Reject content made only of digits, symbols or underscores.
        if re.fullmatch(r'[\d\W_]+', text):
            return ""

        # 4. Collapse a character repeated four or more times into one.
        text = re.sub(r'(.)\1{3,}', r'\1', text)

        # 5. Trim surrounding whitespace.
        text = text.strip()

        # 6. Reject text that is too short to carry meaning.
        if len(text) < 2:
            return ""

        return text

    def clean_danmu(self, danmu_list):
        """Return the danmu that survive all validity filters.

        A danmu is dropped when it is not a string, is shorter than two
        characters after stripping, looks like a link, consists only of
        digits/symbols, or contains any entry of ``self.stop_phrases``
        (case-insensitive substring match).

        Args:
            danmu_list: Sized collection of raw danmu strings.

        Returns:
            A new list with the stripped text of every accepted danmu, in
            input order.
        """
        print(f"清洗弹幕数据,共 {len(danmu_list)} 条")

        cleaned = []
        for dm in danmu_list:
            # Non-string entries (e.g. None) cannot be stripped; skip them
            # instead of raising AttributeError.
            if not isinstance(dm, str):
                continue
            text = dm.strip()
            if len(text) < 2:
                continue
            if self._LINK_HINT_RE.search(text):
                continue
            if self._NOISE_ONLY_RE.fullmatch(text):
                continue
            # Lower-case once, not once per stop phrase.
            lowered = text.lower()
            if any(phrase in lowered for phrase in self.stop_phrases):
                continue

            cleaned.append(text)

        print(f"✅ 清洗后剩余 {len(cleaned)} 条有效弹幕")
        return cleaned

    def extract_keywords(self, danmu_list):
        """Segment the danmu with jieba and count the meaningful tokens.

        Tokens of length one, tokens listed in ``self.stopwords`` and purely
        numeric tokens are ignored.

        Args:
            danmu_list: Iterable of (already cleaned) danmu strings.

        Returns:
            A ``collections.Counter`` mapping each keyword to its frequency.
        """
        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list)
        words = jieba.cut(all_text)
        keywords = [
            w for w in words
            if len(w) > 1
            and w not in self.stopwords
            and not self._DIGITS_RE.fullmatch(w)
        ]
        freq = Counter(keywords)
        print(f"✅ 提取关键词 {len(freq)} 个")
        return freq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the cleaning and keyword pipeline on a few
    # representative danmu and print the results.
    processor = DataProcessor()
    test_data = [
        "哈哈哈哈哈",
        "http://www.bilibili.com/video/BVxxx",
        "[doge][笑哭]这个模型太强了!",
        "大语言模型从入门到精通",
        "23333333",
        "<d p='1,2,3'>学习AI真有趣</d>",
        # The trailing comma matters: without it this entry is silently
        # concatenated with the next string literal.
        "i0.hdslb.com/bfs/face.png",
        "哈哈哈",
        "666",
        "大语言模型真的好强",
        "关注我",
        "这也太牛了",
        "AI生成内容真有趣",
        "http://bilibili.com",
    ]
    cleaned = processor.clean_danmu(test_data)
    print("清洗后:", cleaned)
    freq = processor.extract_keywords(cleaned)
    print("关键词:", freq.most_common(5))
|
|
|
|
|
|