You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
102201601/bullet_comment_analysis.py

97 lines
4.3 KiB

import pandas as pd
import re
from collections import defaultdict, Counter
def comments_analysis(search_word, keywords_list):
    """Count keyword hits in a bullet-comment file and export the results.

    Reads ``{search_word}弹幕.txt`` (UTF-8, one comment per line), counts for
    every keyword how many comments contain it (case-insensitively), and
    writes two outputs next to the input file:

    * ``{search_word}统计弹幕结果.xlsx`` -- ``Sheet1`` holds the (at most)
      eight most frequent keywords with their counts; ``Sheet2`` holds every
      (keyword, matching comment) pair.
    * ``{search_word}统计弹幕.txt`` -- the matching comments, one per line.
      A comment that matches several keywords is listed once per keyword.

    Keywords are lower-cased before matching, so the keyword column of the
    outputs contains the lower-cased form.

    Args:
        search_word: Prefix shared by the input and output file names.
        keywords_list: Keywords to look for, e.g.
            ``['AI', '人工智能', '智能', 'GPT']``.

    Raises:
        OSError: If the input file cannot be opened or an output file cannot
            be written (``FileNotFoundError`` when the input is missing).
    """
    print("正在处理弹幕数据...")

    # "ai" only counts when it is not embedded in a longer run of ASCII
    # letters (so "main" or "said" do not match). Compiled once, outside the
    # per-comment loop.
    ai_pattern = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])')

    def read_comments(file_path):
        """Return the lines of *file_path* with surrounding whitespace removed."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def count_comments(comments, keywords):
        """Count, per keyword, the comments containing it and collect them.

        Returns a ``(Counter, defaultdict(list))`` pair, both keyed by the
        lower-cased keyword.
        """
        count = Counter()
        matched_comments = defaultdict(list)  # keyword -> matching comments

        # Lower-case and de-duplicate while keeping the caller's order, so the
        # output order is reproducible between runs (a set would iterate in
        # hash order). Empty keywords are dropped: '' is a substring of every
        # comment and would match everything.
        lowered = (keyword.lower() for keyword in keywords)
        keywords_lower = list(dict.fromkeys(k for k in lowered if k))

        for comment in comments:
            comment_lower = comment.lower()
            for keyword in keywords_lower:
                if keyword == 'ai':
                    hit = ai_pattern.search(comment_lower) is not None
                else:
                    hit = keyword in comment_lower
                if hit:
                    count[keyword] += 1
                    matched_comments[keyword].append(comment)
        return count, matched_comments

    def process_data(count):
        """Return the eight most frequent keywords as a DataFrame."""
        data = [{'弹幕': keyword, '数量': num} for keyword, num in count.items()]
        # Explicit columns keep the frame well-formed when nothing matched;
        # without them sort_values() raises KeyError on the missing column.
        df = pd.DataFrame(data, columns=['弹幕', '数量'])
        return df.sort_values(by='数量', ascending=False).head(8)

    def process_matched_comments(matched_comments):
        """Flatten the keyword -> comments mapping into one row per match."""
        matched_data = [
            {'关键词': keyword, '匹配弹幕': comment}
            for keyword, comments in matched_comments.items()
            for comment in comments
        ]
        # Explicit columns so the '匹配弹幕' lookup below works with no matches.
        return pd.DataFrame(matched_data, columns=['关键词', '匹配弹幕'])

    def write_to_excel(count, matched_comments, output_file):
        """Write the statistics workbook and the matched-comment text file."""
        df_top8 = process_data(count)
        df_matched = process_matched_comments(matched_comments)

        # Sheet1: top keywords and counts; Sheet2: every matched comment.
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df_top8.to_excel(writer, sheet_name='Sheet1', index=False)
            df_matched.to_excel(writer, sheet_name='Sheet2', index=False)

        # Also dump the matched comments as plain text.
        txt_output_file = f'{search_word}统计弹幕.txt'
        with open(txt_output_file, mode='w', encoding='utf-8') as f:
            f.writelines(comment + '\n' for comment in df_matched['匹配弹幕'])

    comments_file = f'{search_word}弹幕.txt'  # input: comments to analyse
    output_file = f'{search_word}统计弹幕结果.xlsx'  # output: statistics workbook

    comments = read_comments(comments_file)
    comment_count, matched_comments = count_comments(comments, keywords_list)
    write_to_excel(comment_count, matched_comments, output_file)
    print(f"统计结果已写入 {output_file}")
# # 以上函数性能测试
# if __name__ == "__main__":
# import cProfile
# # 设置测试关键词列表
# keywords_list = ['AI', '人工智能', '智能', 'GPT', '计算', '数据分析', '识别', 'gtp', '合成', '监测']
# # 创建 cProfile 对象
# pr = cProfile.Profile()
# pr.enable() # 开始性能分析
# # 调用功能
# comments_analysis("2024巴黎奥运会", keywords_list)
# pr.disable() # 停止性能分析
# pr.dump_stats('output.prof') # 保存性能数据到文件
# print("性能分析结果已保存到 output.prof")