Update processor.py

6 months ago · 3ad42cf5ac
parent 67ae8b55ca
commit 3ad42cf5ac
1 changed file with 41 additions and 53 deletions
--- a/processor.py
+++ b/processor.py
@@ -2,10 +2,7 @@
 # -*- coding: utf-8 -*-
 """
 processor.py
-弹幕数据清洗与关键词提取模块（优化版）
- 增强无效弹幕过滤：去除链接、HTML标签、表情、符号噪声
- 自动统计清洗前后数量
- 保持与主程序接口一致
+弹幕数据清洗与关键词提取（升级版）
 """

 import re
@@ -13,76 +10,67 @@ import jieba
 from collections import Counter

 class DataProcessor:
-    """数据处理类"""
+    """数据处理与清洗类"""

    def __init__(self):
-        print("✅ 数据处理器初始化完成")
+        print("🧹 数据处理器初始化")

-    def _clean_text(self, text: str) -> str:
-        """内部函数：清洗单条弹幕"""
-        if not isinstance(text, str):
-            return ""
+        # 无效弹幕关键词（常见口水词、广告词）
+        self.stop_phrases = set([
+            "666", "哈哈", "哈哈哈", "前排", "来了", "好家伙", "可以的", "赞", "顶", 
+            "nb", "牛逼", "太强了", "强啊", "来了来了", "关注我", "收藏", "up", 
+            "bilibili", "http", "www", "com", "cn", "png", "jpg", "color", "i0", "hdslb"
+        ])

-        # 1️⃣ 去掉 URL、文件名、域名等
-        text = re.sub(r'http\S+|www\S+|bilibili\S+|hdslb\S+|\.com|\.cn|\.net|png|jpg|jpeg|gif', '', text, flags=re.IGNORECASE)
-
-        # 2️⃣ 去掉 HTML 标签与表情
-        text = re.sub(r'<[^>]+>', '', text)
-        text = re.sub(r'\[[^\]]+\]', '', text)  # 去掉 [doge]、[笑哭] 等表情
-
-        # 3️⃣ 去掉仅由数字、符号组成的内容
-        if re.fullmatch(r'[\d\W_]+', text):
-            return ""
-
-        # 4️⃣ 去掉无意义重复字符（如“哈哈哈哈哈”、“。。。”）
-        text = re.sub(r'(.)\1{3,}', r'\1', text)
-
-        # 5️⃣ 去掉多余空白
-        text = text.strip()
-
-        # 6️⃣ 删除过短文本
-        if len(text) < 2:
-            return ""
-
-        return text
+        # 停用词（常见虚词、代词）
+        self.stopwords = set([
+            "的", "了", "是", "我", "你", "他", "她", "它", "也", "和", "就", "都", "在",
+            "啊", "吧", "吗", "呢", "哦", "呀", "这", "那", "一个", "不会", "真的"
+        ])

    def clean_danmu(self, danmu_list):
        """清洗弹幕数据"""
-        total = len(danmu_list)
-        print(f"🧹 开始清洗弹幕数据，共 {total} 条")
+        print(f"清洗弹幕数据，共 {len(danmu_list)} 条")

        cleaned = []
        for dm in danmu_list:
-            t = self._clean_text(dm)
-            if t:
-                cleaned.append(t)
-
-        print(f"✅ 清洗完成，有效弹幕: {len(cleaned)} 条，占比 {len(cleaned) / total * 100:.1f}%")
+            text = dm.strip()
+            if not text or len(text) < 2:
+                continue
+            # 去除网址、表情符号、特殊符号
+            if re.search(r"http|www|bilibili|\.com|\.cn", text, re.I):
+                continue
+            if re.fullmatch(r"[\d\W_]+", text):
+                continue
+            if any(p in text.lower() for p in self.stop_phrases):
+                continue
+
+            cleaned.append(text)
+
+        print(f"✅ 清洗后剩余 {len(cleaned)} 条有效弹幕")
        return cleaned

    def extract_keywords(self, danmu_list):
        """提取关键词"""
-        print("🔍 开始提取关键词...")
+        print("🔍 开始提取关键词")
        all_text = " ".join(danmu_list)
        words = jieba.cut(all_text)
-        # 过滤短词和常见无意义词
-        stopwords = {"一个", "什么", "就是", "这个", "那个", "我们", "他们", "哈哈", "真的"}
-        keywords = [w for w in words if len(w) > 1 and w not in stopwords]
-        return Counter(keywords)
+        keywords = [
+            w for w in words
+            if len(w) > 1 and w not in self.stopwords and not re.fullmatch(r"\d+", w)
+        ]
+        freq = Counter(keywords)
+        print(f"✅ 提取关键词 {len(freq)} 个")
+        return freq


 if __name__ == "__main__":
+    # 测试示例
    processor = DataProcessor()
    test_data = [
-        "哈哈哈哈哈", 
-        "http://www.bilibili.com/video/BVxxx", 
-        "[doge][笑哭]这个模型太强了！", 
-        "大语言模型从入门到精通", 
-        "23333333", 
-        "<d p='1,2,3'>学习AI真有趣</d>", 
-        "i0.hdslb.com/bfs/face.png"
+        "哈哈哈", "666", "大语言模型真的好强", "关注我", "这也太牛了", "AI生成内容真有趣", "http://bilibili.com"
    ]
    cleaned = processor.clean_danmu(test_data)
-    print(f"🧩 清洗后数据: {cleaned}")
+    print("清洗后:", cleaned)
    freq = processor.extract_keywords(cleaned)
-    print("🔥 高频词:", freq.most_common(5))
+    print("关键词:", freq.most_common(5))