You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

52 lines
1.5 KiB

import jieba
import re
# 🚫 Common noise danmaku patterns (extend as needed).
# NOTE: each entry must be a non-empty pattern — an empty string here would
# become an empty alternative in the joined regex, which matches EVERY input
# and makes is_noise() flag all danmakus as noise.
NOISE_PATTERNS = [
    r"6{2,}",   # 666, 6666
    r"牛逼+",   # "niubi" and repeats
    r"牛+",     # standalone "牛" (was r"" — empty-alternative bug, matched everything)
    r"好耶+",   # "hooray"
    r"哈哈+",   # laughter
    r"h{2,}",   # hhh, hhhh
    r"赞+",     # "like/praise"
    r"厉害+",   # "awesome"
    r"好看+",   # "looks good"
    r"不错+",   # "not bad"
    r"加油+",   # "go for it"
    r"顶+",     # "bump"
    r"膜拜+",   # "worship"
    r"哭+",     # "crying"
]

# 🔍 Pre-compile the combined alternation once for performance.
NOISE_REGEX = re.compile("|".join(NOISE_PATTERNS))
def is_noise(text: str) -> bool:
    """Return True when *text* is a noise danmaku.

    A danmaku counts as noise when it is empty/None, shorter than two
    characters after stripping whitespace, or matches NOISE_REGEX.
    """
    stripped = text.strip() if text else ""
    # Guard clause: anything shorter than 2 chars is noise by definition.
    if len(stripped) < 2:
        return True
    return NOISE_REGEX.search(stripped) is not None
def filter_and_tokenize(danmakus):
    """Filter out noise danmakus and segment the rest with jieba.

    Iterates *danmakus*, skips entries classified by is_noise(), and
    collects jieba tokens longer than one character (after stripping)
    into a single flat list, which is returned.
    """
    tokens = []
    for line in danmakus:
        if is_noise(line):
            continue
        # Segment with jieba, keeping only tokens longer than one char.
        tokens.extend(tok for tok in jieba.cut(line) if len(tok.strip()) > 1)
    return tokens
# ✅ Debug example (safe to delete).
if __name__ == "__main__":
    sample = ["666", "牛逼了", "哈哈哈", "ChatGPT太强了", "OpenAI发布了新模型"]
    print(filter_and_tokenize(sample))