From bcfdf17876c80359b9b8bfe9e0f8ea477390e2b5 Mon Sep 17 00:00:00 2001 From: pufahrcyp <1195744232@qq.com> Date: Wed, 18 Sep 2024 22:06:27 +0800 Subject: [PATCH] =?UTF-8?q?ADD=20=E5=BC=B9=E5=B9=95=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bullet_comment_analysis.py | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 bullet_comment_analysis.py diff --git a/bullet_comment_analysis.py b/bullet_comment_analysis.py new file mode 100644 index 0000000..4c5a260 --- /dev/null +++ b/bullet_comment_analysis.py @@ -0,0 +1,96 @@ +import pandas as pd +import re +from collections import defaultdict, Counter + + +def comments_analysis(search_word, keywords_list): + print("正在处理弹幕数据...") + + # 读取弹幕文件 + def read_comments(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + comments = f.readlines() + return [comment.strip() for comment in comments] + + # 统计与 AI 技术应用相关的弹幕数量,并记录匹配的弹幕 + def count_comments(comments, keywords): + count = Counter() + matched_comments = defaultdict(list) # 存储每个关键词对应的匹配弹幕 + # 提前将关键词转化为小写 + keywords_lower = {keyword.lower() for keyword in keywords} + for comment in comments: + comment_lower = comment.lower() # 将弹幕转换为小写 + for keyword in keywords_lower: + if keyword == 'ai': + # 使用正则表达式判断 "AI" 前后是否有英文字母 + if re.search(r'(?