Update processor.py

main
fzu102301618 6 months ago
parent 67ae8b55ca
commit 3ad42cf5ac

@ -2,10 +2,7 @@
# -*- coding: utf-8 -*-
"""
processor.py
弹幕数据清洗与关键词提取模块优化版
- 增强无效弹幕过滤去除链接HTML标签表情符号噪声
- 自动统计清洗前后数量
- 保持与主程序接口一致
弹幕数据清洗与关键词提取升级版
"""
import re
import jieba
from collections import Counter
class DataProcessor:
    """Danmu (bullet-comment) cleaning and keyword extraction.

    Interface used by the main program:
      - clean_danmu(danmu_list)      -> list[str] of valid danmu
      - extract_keywords(danmu_list) -> collections.Counter of keyword freqs
    """

    def __init__(self):
        print("🧹 数据处理器初始化")
        # Phrases that mark a danmu as noise: filler/hype words, ad phrases,
        # and URL/asset fragments commonly seen in bilibili danmu dumps.
        # BUG FIX: the original set contained empty-string entries ("" is a
        # substring of EVERY string), which made clean_danmu reject all input.
        self.stop_phrases = {
            "666", "哈哈", "哈哈哈", "前排", "来了", "好家伙", "可以的",
            "nb", "牛逼", "太强了", "强啊", "来了来了", "关注我", "收藏", "up",
            "bilibili", "http", "www", "com", "cn", "png", "jpg", "color",
            "i0", "hdslb",
        }
        # Stopwords dropped during keyword extraction (function words etc.).
        # Garbled empty entries removed for the same reason as above; only the
        # entries that survived in the source are kept.
        self.stopwords = {"一个", "不会", "真的"}

    def clean_danmu(self, danmu_list):
        """Filter raw danmu strings, keeping only meaningful ones.

        A danmu is dropped when it is not a string, is shorter than 2 chars
        after stripping, contains a URL/domain fragment, consists only of
        digits/punctuation, or contains any noise phrase.

        Args:
            danmu_list: iterable of raw danmu strings.
        Returns:
            list[str]: the surviving danmu, original order preserved.
        """
        print(f"清洗弹幕数据,共 {len(danmu_list)} 条")
        cleaned = []
        for dm in danmu_list:
            # Robustness: tolerate non-string items (the pre-refactor
            # _clean_text had this guard; the rewrite lost it).
            if not isinstance(dm, str):
                continue
            text = dm.strip()
            if not text or len(text) < 2:
                continue
            # Drop anything containing a URL or domain fragment.
            if re.search(r"http|www|bilibili|\.com|\.cn", text, re.I):
                continue
            # Drop strings made only of digits/symbols ("23333", "。。。").
            if re.fullmatch(r"[\d\W_]+", text):
                continue
            # Drop danmu containing any noise phrase (case-insensitive).
            if any(p in text.lower() for p in self.stop_phrases):
                continue
            cleaned.append(text)
        print(f"✅ 清洗后剩余 {len(cleaned)} 条有效弹幕")
        return cleaned

    def extract_keywords(self, danmu_list):
        """Segment danmu text with jieba and count keyword frequencies.

        Words of length <= 1, pure digits, and stopwords are discarded.

        Args:
            danmu_list: iterable of (already cleaned) danmu strings.
        Returns:
            collections.Counter mapping keyword -> frequency.
        """
        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list)
        words = jieba.cut(all_text)
        keywords = [
            w for w in words
            if len(w) > 1 and w not in self.stopwords and not re.fullmatch(r"\d+", w)
        ]
        freq = Counter(keywords)
        print(f"✅ 提取关键词 {len(freq)} 个")
        return freq
if __name__ == "__main__":
# 测试示例
processor = DataProcessor()
test_data = [
"哈哈哈哈哈",
"http://www.bilibili.com/video/BVxxx",
"[doge][笑哭]这个模型太强了!",
"大语言模型从入门到精通",
"23333333",
"<d p='1,2,3'>学习AI真有趣</d>",
"i0.hdslb.com/bfs/face.png"
"哈哈哈", "666", "大语言模型真的好强", "关注我", "这也太牛了", "AI生成内容真有趣", "http://bilibili.com"
]
cleaned = processor.clean_danmu(test_data)
print(f"🧩 清洗后数据: {cleaned}")
print("清洗后:", cleaned)
freq = processor.extract_keywords(cleaned)
print("🔥 高频词:", freq.most_common(5))
print("关键词:", freq.most_common(5))

Loading…
Cancel
Save