parent
c48db50d58
commit
bcfdf17876
@ -0,0 +1,96 @@
|
||||
import pandas as pd
|
||||
import re
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
|
||||
# Matches a standalone "ai" token: no ASCII letter directly before or after it,
# so words such as "fair" or "main" are not counted. It is applied to text that
# has already been lowercased.
_STANDALONE_AI_PATTERN = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])')

# Column headers of the two result sheets. Passing them explicitly to
# pd.DataFrame keeps the columns present even when there are no rows.
_KEYWORD_COLUMNS = ['弹幕', '数量']
_MATCH_COLUMNS = ['关键词', '匹配弹幕']


def _read_comments(file_path):
    """Return every line of the UTF-8 file at *file_path*, stripped of surrounding whitespace."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def _count_keyword_matches(comments, keywords):
    """Count, per keyword, how many comments contain it (case-insensitively).

    Args:
        comments: Iterable of comment strings.
        keywords: Iterable of keyword strings; compared in lowercase.

    Returns:
        A ``(count, matched_comments)`` tuple. ``count`` is a Counter mapping
        each lowercased keyword to its number of matching comments, and
        ``matched_comments`` maps each lowercased keyword to the list of
        original (not lowercased) comments that matched it.
    """
    # dict.fromkeys removes duplicates while keeping the caller's keyword
    # order, so the output order is reproducible from run to run (a set of
    # strings iterates in a hash-dependent order).
    keywords_lower = list(dict.fromkeys(keyword.lower() for keyword in keywords))

    count = Counter()
    matched_comments = defaultdict(list)
    for comment in comments:
        comment_lower = comment.lower()
        for keyword in keywords_lower:
            if keyword == 'ai':
                # "ai" is only counted as a standalone token, never as part
                # of a longer English word.
                is_match = _STANDALONE_AI_PATTERN.search(comment_lower) is not None
            else:
                is_match = keyword in comment_lower
            if is_match:
                count[keyword] += 1
                matched_comments[keyword].append(comment)
    return count, matched_comments


def _top_keywords_frame(count, limit=8):
    """Return a DataFrame of the *limit* most frequent keywords, most frequent first."""
    return pd.DataFrame(count.most_common(limit), columns=_KEYWORD_COLUMNS)


def _matched_comments_frame(matched_comments):
    """Return a DataFrame with one (keyword, matched comment) row per match."""
    rows = []
    for keyword, comments in matched_comments.items():
        for comment in comments:
            rows.append((keyword, comment))
    return pd.DataFrame(rows, columns=_MATCH_COLUMNS)


def comments_analysis(search_word, keywords_list):
    """Count keyword hits in a bullet-comment file and save the results.

    Reads ``{search_word}弹幕.txt`` (UTF-8, one comment per line) from the
    current working directory, counts how many comments contain each keyword
    (case-insensitively; the keyword "AI" only matches as a standalone
    token), and writes:

    * ``{search_word}统计弹幕结果.xlsx`` - ``Sheet1`` holds the 8 most frequent
      keywords with their counts, ``Sheet2`` holds every
      (keyword, matching comment) pair;
    * ``{search_word}统计弹幕.txt`` - every matching comment, one per line.

    When nothing matches, both sheets contain only their header row and the
    text file is empty.

    Args:
        search_word: Prefix shared by the input and output file names.
        keywords_list: Keywords to look for, e.g. ``['AI', '人工智能', 'GPT']``.

    Raises:
        FileNotFoundError: If the input comment file does not exist.
    """
    print("正在处理弹幕数据...")

    comments_file = f'{search_word}弹幕.txt'  # input: comments to analyse
    output_file = f'{search_word}统计弹幕结果.xlsx'  # output: Excel report
    txt_output_file = f'{search_word}统计弹幕.txt'  # output: matched comments

    comments = _read_comments(comments_file)
    comment_count, matched_comments = _count_keyword_matches(comments, keywords_list)

    df_top8 = _top_keywords_frame(comment_count)
    df_matched = _matched_comments_frame(matched_comments)

    # Two sheets in one workbook: the keyword ranking and the raw matches.
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df_top8.to_excel(writer, sheet_name='Sheet1', index=False)
        df_matched.to_excel(writer, sheet_name='Sheet2', index=False)

    with open(txt_output_file, mode='w', encoding='utf-8') as f:
        f.writelines(comment + '\n' for comment in df_matched['匹配弹幕'])

    print(f"统计结果已写入 {output_file}")
|
||||
|
||||
# # 以上函数性能测试
|
||||
# if __name__ == "__main__":
|
||||
# import cProfile
|
||||
|
||||
# # 设置测试关键词列表
|
||||
# keywords_list = ['AI', '人工智能', '智能', 'GPT', '计算', '数据分析', '识别', 'gtp', '合成', '监测']
|
||||
|
||||
# # 创建 cProfile 对象
|
||||
# pr = cProfile.Profile()
|
||||
# pr.enable() # 开始性能分析
|
||||
|
||||
# # 调用功能
|
||||
# comments_analysis("2024巴黎奥运会", keywords_list)
|
||||
|
||||
# pr.disable() # 停止性能分析
|
||||
# pr.dump_stats('output.prof') # 保存性能数据到文件
|
||||
# print("性能分析结果已保存到 output.prof")
|
Loading…
Reference in new issue