feat: use external stop words file and improve text cleaning

4 months ago · 7c21261350
parent 1e02d18dc4
commit 7c21261350
3 changed files with 263 additions and 7 deletions
--- a/data/stopwords.txt
+++ b/data/stopwords.txt
@@ -0,0 +1,242 @@
+的
+了
+在
+是
+我
+有
+和
+就
+不
+人
+都
+一
+一个
+上
+也
+很
+到
+说
+要
+去
+你
+会
+着
+没有
+看
+好
+自己
+这
+那
+这个
+那个
+啊
+吧
+呢
+吗
+但是
+还是
+就是
+可以
+觉得
+因为
+所以
+如果
+比如
+其实
+这样
+那样
+怎么
+什么
+这就
+只是
+正在
+能够
+针对
+为了
+我们
+你们
+他们
+它们
+她们
+大家
+各位
+那些
+这些
+这里
+那里
+哪里
+时候
+现在
+之后
+之前
+然后
+于是
+而且
+或者
+虽然
+即使
+既然
+只要
+除非
+无论
+不论
+关于
+对于
+根据
+按照
+作为
+通过
+经过
+除了
+除了
+以及
+并且
+而
+且
+或
+与
+及
+跟
+向
+往
+从
+自
+由
+当
+当着
+沿着
+顺着
+随着
+为了
+为
+因
+由于
+因为
+以
+以便
+以免
+防止
+避免
+像
+如
+同
+好像
+如同
+似乎
+似的
+等于
+不如
+不及
+尽管
+虽然
+但是
+可是
+然而
+偏偏
+只是
+不过
+至于
+不光
+不仅
+不但
+而且
+并
+并且
+或者
+或是
+要么
+还是
+宁可
+宁愿
+与其
+不如
+只有
+只要
+除非
+不仅
+不但
+不光
+不单
+不独
+不只
+不至于
+不致
+不致于
+不免
+不若
+不比
+不过
+不特
+不独
+不尽
+不拘
+不限
+不问
+不管
+不顾
+不料
+不禁
+不觉
+不胜
+不住
+不由
+不平
+不忿
+不测
+不料
+不单
+不独
+不只
+不至于
+不致
+不致于
+不免
+不若
+不比
+不过
+不特
+不独
+不尽
+不拘
+不限
+不问
+不管
+不顾
+不料
+不禁
+不觉
+不胜
+不住
+不由
+不平
+不忿
+不测
+不料
+不单
+不独
+不只
+不至于
+不致
+不致于
+不免
+不若
+不比
+不过
+不特
+不独
+不尽
+不拘
+不限
+不问
+不管
+不顾
+不料
+不禁
+不觉
+不胜
+不住
+不由
+不平
+不忿
+不测
+不料
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -7,15 +7,25 @@ from typing import List, Dict, Tuple
 import os

 class DataAnalyzer:
-    def __init__(self, stop_words_path: str = None):
+    def __init__(self, stop_words_path: str = "data/stopwords.txt"):
        self.stop_words = set()
+        
+        # Load external stop words if file exists
        if stop_words_path and os.path.exists(stop_words_path):
-            with open(stop_words_path, 'r', encoding='utf-8') as f:
-                self.stop_words = set(line.strip() for line in f)
+            try:
+                with open(stop_words_path, 'r', encoding='utf-8') as f:
+                    self.stop_words = set(line.strip() for line in f)
+                logging.info(f"Loaded {len(self.stop_words)} stop words from {stop_words_path}")
+            except Exception as e:
+                logging.error(f"Failed to load stop words from {stop_words_path}: {e}")
        
-        # Add default stop words if none provided or to supplement
-        default_stop_words = {'的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '这个', '那个', '啊', '吧', '呢', '吗', '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明'}
-        self.stop_words.update(default_stop_words)
+        # Add danmaku specific stop words
+        danmaku_stop_words = {
+            '666', '哈哈', '哈哈哈', '打卡', '第一', '前排', '火钳刘明', '牛', '牛逼', '卧槽', '确实', '233', 'www', '厉害', '这就', 
+            'up', 'UP', 'Up', '视频', '弹幕', '硬币', '投币', '点赞', '关注', '收藏', '三连', '下次', '一定', '加油', '支持', '谢谢', '感谢', '辛苦', '老师', '教程', '讲得', '不错', '听懂', '学废', '眼睛', '脑子', '手', '学会', '学到',
+            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'
+        }
+        self.stop_words.update(danmaku_stop_words)

    def clean_text(self, text: str) -> str:
        """
@@ -25,6 +35,9 @@ class DataAnalyzer:
            return ""
        # Remove special characters, keep only Chinese, English, numbers
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
+        # Remove pure numbers
+        if text.strip().isdigit():
+            return ""
        return text.strip()

    def segment_and_count(self, danmaku_list: List[str], top_n: int = 100) -> Tuple[List[Tuple[str, int]], List[str]]:
--- a/src/main.py
+++ b/src/main.py
@@ -21,6 +21,7 @@ def main():
    parser.add_argument("--keyword", type=str, default="大语言模型", help="Keyword to search for")
    parser.add_argument("--limit", type=int, default=300, help="Number of videos to analyze")
    parser.add_argument("--db", type=str, default="data/data.db", help="Path to SQLite database")
+    parser.add_argument("--stopwords", type=str, default="data/stopwords.txt", help="Path to stop words file")
    parser.add_argument("--output", type=str, default="data/output.xlsx", help="Path to output Excel file")
    parser.add_argument("--wordcloud", type=str, default="data/wordcloud.png", help="Path to output wordcloud image")
    
@@ -31,7 +32,7 @@ def main():
    # Initialize modules
    crawler = BilibiliCrawler()
    storage = StorageManager(args.db)
-    analyzer = DataAnalyzer()
+    analyzer = DataAnalyzer(args.stopwords)
    visualizer = Visualizer()
    
    try: