You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
4.3 KiB
97 lines
4.3 KiB
import pandas as pd
|
|
import re
|
|
from collections import defaultdict, Counter
|
|
|
|
|
|
def comments_analysis(search_word, keywords_list):
    """Count keyword hits in a bullet-comment (danmaku) file and export them.

    Reads ``{search_word}弹幕.txt`` (UTF-8, one comment per line) relative to
    the current working directory, counts how many comments mention each
    keyword, and writes two files next to it:

    * ``{search_word}统计弹幕结果.xlsx`` -- ``Sheet1`` lists the eight most
      frequent keywords with their counts; ``Sheet2`` lists every
      (keyword, matching comment) pair.
    * ``{search_word}统计弹幕.txt`` -- the matching comments, one per line.
      A comment that matches several keywords is written once per keyword.

    Matching is a case-insensitive substring test, except for the keyword
    ``ai``, which only matches when it is not directly adjacent to another
    ASCII letter (so "main" or "air" do not count as "AI").

    Args:
        search_word: Prefix shared by the input and output file names.
        keywords_list: Iterable of keyword strings, e.g.
            ``['AI', '人工智能', '智能', 'GPT', '计算', '数据分析']``.
            Keywords are lower-cased; duplicates after lower-casing are
            counted once.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        FileNotFoundError: If ``{search_word}弹幕.txt`` does not exist.
    """
    print("正在处理弹幕数据...")

    def read_comments(file_path):
        """Return the lines of *file_path* with surrounding whitespace removed."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def count_comments(comments, keywords):
        """Count comments per keyword and collect the comments that matched.

        Returns a ``(Counter, defaultdict(list))`` pair, both keyed by the
        lower-cased keyword.
        """
        count = Counter()
        matched_comments = defaultdict(list)  # keyword -> matching comments

        # Lower-case once and de-duplicate while keeping the caller's order,
        # so the output row order is reproducible between runs.
        keywords_lower = list(dict.fromkeys(keyword.lower() for keyword in keywords))

        # "ai" must not be glued to other ASCII letters; compile once,
        # outside the per-comment loop.
        ai_pattern = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])')

        for comment in comments:
            comment_lower = comment.lower()
            for keyword in keywords_lower:
                if keyword == 'ai':
                    hit = ai_pattern.search(comment_lower) is not None
                else:
                    hit = keyword in comment_lower
                if hit:
                    count[keyword] += 1
                    matched_comments[keyword].append(comment)

        return count, matched_comments

    def process_data(count):
        """Build the top-8 keyword table, most frequent first.

        Columns are given explicitly so that an empty ``count`` still yields
        a well-formed (zero-row) table instead of a column-less frame.
        """
        return pd.DataFrame(count.most_common(8), columns=['弹幕', '数量'])

    def process_matched_comments(matched_comments):
        """Flatten the keyword -> comments mapping into one row per match."""
        matched_data = []
        for keyword, comments in matched_comments.items():
            matched_data.extend(
                {'关键词': keyword, '匹配弹幕': comment} for comment in comments
            )
        # Explicit columns keep the '匹配弹幕' column present when nothing matched.
        return pd.DataFrame(matched_data, columns=['关键词', '匹配弹幕'])

    def write_to_excel(count, matched_comments, output_file):
        """Write both tables to *output_file* and the matches to a text file."""
        df_top8 = process_data(count)
        df_matched = process_matched_comments(matched_comments)

        # One worksheet per table.
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df_top8.to_excel(writer, sheet_name='Sheet1', index=False)  # top-8 keywords and counts
            df_matched.to_excel(writer, sheet_name='Sheet2', index=False)  # matching comments

        # Also dump the matching comments as plain text, one per line.
        txt_output_file = f'{search_word}统计弹幕.txt'
        with open(txt_output_file, mode='w', encoding='utf-8') as f:
            f.writelines(comment + '\n' for comment in df_matched['匹配弹幕'])

    comments_file = f'{search_word}弹幕.txt'  # input: comments to analyse
    output_file = f'{search_word}统计弹幕结果.xlsx'  # output: Excel workbook
    keywords = keywords_list

    comments = read_comments(comments_file)
    comment_count, matched_comments = count_comments(comments, keywords)
    write_to_excel(comment_count, matched_comments, output_file)
    print(f"统计结果已写入 {output_file}")
|
|
|
|
# # Performance test for the function above
# if __name__ == "__main__":
#     import cProfile
#
#     # Keyword list used for the test run
#     keywords_list = ['AI', '人工智能', '智能', 'GPT', '计算', '数据分析', '识别', 'gtp', '合成', '监测']
#
#     # Create the cProfile object
#     pr = cProfile.Profile()
#     pr.enable()  # start profiling
#
#     # Run the analysis
#     comments_analysis("2024巴黎奥运会", keywords_list)
#
#     pr.disable()  # stop profiling
#     pr.dump_stats('output.prof')  # save the profiling data to a file
#     print("性能分析结果已保存到 output.prof")
|