You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
52 lines
1.5 KiB
52 lines
1.5 KiB
# Standard library
import re

# Third-party: Chinese word segmentation
import jieba
# 🚫 Common noise danmaku patterns (extend as needed).
# NOTE: is_noise() applies these with re.search() on the stripped message,
# so an unanchored pattern matches the noise phrase ANYWHERE in the text.
NOISE_PATTERNS = [
    r"6{2,}",    # "666", "6666" (Chinese slang for "awesome")
    r"牛逼+",    # "牛逼", "牛逼了" ("badass")
    # BUGFIX: was r"牛", which flagged any message merely CONTAINING 牛
    # (e.g. 牛顿 "Newton"); the comment intent was the standalone filler,
    # so anchor it to match only messages that are entirely 牛 characters.
    r"^牛+$",    # standalone "牛" / "牛牛牛"
    r"好耶+",    # "好耶~" ("yay")
    r"哈哈+",    # "哈哈", "哈哈哈" (laughter)
    r"h{2,}",    # "hhh", "hhhh" (romanized laughter)
    r"赞+",      # "赞", "赞赞" ("like/praise")
    r"厉害+",    # "厉害了" ("impressive")
    r"好看+",    # "好看" ("looks good")
    r"不错+",    # "不错" ("not bad")
    r"加油+",    # "加油" ("keep it up")
    r"顶+",      # "顶" ("bump")
    r"膜拜+",    # "膜拜" ("worship")
    r"哭+",      # "哭", "哭了" ("crying")
]

# 🔍 Pre-compile the alternation once for performance.
# No re.MULTILINE flag, so ^/$ anchor to the whole (stripped) message.
NOISE_REGEX = re.compile("|".join(NOISE_PATTERNS))
|
|
|
|
def is_noise(text: str) -> bool:
    """Return True if a danmaku message is noise.

    A message is noise when it is empty/falsy, shorter than two
    characters after stripping whitespace, or matches any compiled
    noise pattern in NOISE_REGEX.
    """
    stripped = text.strip() if text else ""
    # Too-short messages are noise; otherwise defer to the pattern scan.
    return len(stripped) < 2 or NOISE_REGEX.search(stripped) is not None
|
|
|
|
def filter_and_tokenize(danmakus):
    """Filter out noise danmakus and tokenize the rest with jieba.

    Returns a flat list of tokens, keeping only tokens longer than
    one character after stripping whitespace.
    """
    tokens = []
    for message in danmakus:
        if not is_noise(message):
            # Segment with jieba and drop single-character tokens.
            tokens.extend(w for w in jieba.cut(message) if len(w.strip()) > 1)
    return tokens
|
|
|
|
|
|
# ✅ Debug example (safe to delete)
if __name__ == "__main__":
    samples = ["666", "牛逼了", "哈哈哈", "ChatGPT太强了", "OpenAI发布了新模型"]
    print(filter_and_tokenize(samples))
|