ADD 弹幕数据分析

11 months ago · bcfdf17876
parent c48db50d58
commit bcfdf17876
1 changed files with 96 additions and 0 deletions
--- a/bullet_comment_analysis.py
+++ b/bullet_comment_analysis.py
@@ -0,0 +1,96 @@
+import pandas as pd
+import re
+from collections import defaultdict, Counter
+
+
+def comments_analysis(search_word, keywords_list):
+    """Count keyword hits in a bullet-comment file and export the results.
+
+    Reads ``{search_word}弹幕.txt`` (one comment per line, UTF-8), matches each
+    comment case-insensitively against ``keywords_list``, then writes
+    ``{search_word}统计弹幕结果.xlsx`` (Sheet1: top 8 keywords by hit count,
+    Sheet2: every keyword/comment match) and ``{search_word}统计弹幕.txt``
+    (the matched comments, one per line).
+
+    Args:
+        search_word: Prefix shared by the input and output file names.
+        keywords_list: Iterable of keyword strings to look for.
+
+    Raises:
+        FileNotFoundError: If the input comment file does not exist.
+    """
+    print("正在处理弹幕数据...")
+    # "ai" must not be embedded in a longer ASCII word (e.g. "rain").
+    ai_pattern = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])')
+
+    def read_comments(file_path):
+        """Return the stripped lines of the comment file."""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return [line.strip() for line in f]
+
+    def count_comments(comments, keywords):
+        """Return (Counter of hits per keyword, dict keyword -> matched comments)."""
+        count = Counter()
+        matched_comments = defaultdict(list)
+        # Lower-case and de-duplicate while keeping the caller's order, so the
+        # output is reproducible (a set would iterate in hash order); empty
+        # keywords are dropped because '' is a substring of every comment.
+        keywords_lower = list(dict.fromkeys(k.lower() for k in keywords if k))
+        for comment in comments:
+            comment_lower = comment.lower()
+            for keyword in keywords_lower:
+                if keyword == 'ai':
+                    hit = ai_pattern.search(comment_lower) is not None
+                else:
+                    hit = keyword in comment_lower
+                if hit:
+                    count[keyword] += 1
+                    matched_comments[keyword].append(comment)
+        return count, matched_comments
+
+    def process_data(count):
+        """Build the top-8 table; explicit columns keep an empty result valid."""
+        return pd.DataFrame(count.most_common(8), columns=['弹幕', '数量'])
+
+    def process_matched_comments(matched_comments):
+        """Flatten the keyword -> comments mapping into one row per match."""
+        rows = [(keyword, comment)
+                for keyword, comments in matched_comments.items()
+                for comment in comments]
+        return pd.DataFrame(rows, columns=['关键词', '匹配弹幕'])
+
+    def write_to_excel(count, matched_comments, output_file):
+        """Write both tables to the workbook and the matches to a text file."""
+        df_top8 = process_data(count)
+        df_matched = process_matched_comments(matched_comments)
+        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
+            df_top8.to_excel(writer, sheet_name='Sheet1', index=False)
+            df_matched.to_excel(writer, sheet_name='Sheet2', index=False)
+        txt_output_file = f'{search_word}统计弹幕.txt'
+        with open(txt_output_file, mode='w', encoding='utf-8') as f:
+            f.writelines(comment + '\n' for comment in df_matched['匹配弹幕'])
+
+    comments_file = f'{search_word}弹幕.txt'
+    output_file = f'{search_word}统计弹幕结果.xlsx'
+    comments = read_comments(comments_file)
+    comment_count, matched_comments = count_comments(comments, keywords_list)
+    write_to_excel(comment_count, matched_comments, output_file)
+    print(f"统计结果已写入 {output_file}")
+
+# # 以上函数性能测试
+# if __name__ == "__main__":
+#     import cProfile
+
+#     # 设置测试关键词列表
+#     keywords_list = ['AI', '人工智能', '智能', 'GPT', '计算', '数据分析', '识别', 'gtp', '合成', '监测']
+    
+#     # 创建 cProfile 对象
+#     pr = cProfile.Profile()
+#     pr.enable()  # 开始性能分析
+
+#     # 调用功能
+#     comments_analysis("2024巴黎奥运会", keywords_list)
+
+#     pr.disable()  # 停止性能分析
+#     pr.dump_stats('output.prof')  # 保存性能数据到文件
+#     print("性能分析结果已保存到 output.prof")