You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

51 lines
2.1 KiB

import re
from collections import Counter

import pandas as pd
# 读取TXT文件并统计AI相关关键词
def _compile_keyword_matcher(keyword):
    """Return a predicate telling whether a lower-cased line mentions *keyword*."""
    needle = keyword.lower()
    if keyword.isascii() and keyword.isupper():
        # All-caps ASCII keywords are acronyms such as "AI". A bare substring
        # test would also fire inside unrelated words ("said", "wait",
        # "kawaii"), so require that no ASCII letter touches either side.
        # Digits, punctuation and CJK characters still act as boundaries, so
        # "AI技术" and "AI-powered" keep matching.
        pattern = re.compile(rf"(?<![a-z]){re.escape(needle)}(?![a-z])")
        return lambda text: pattern.search(text) is not None
    return lambda text: needle in text


def count_ai_keywords(file_path, output_excel, top_n=8):
    """Count AI-related keywords in a danmaku text file and export the top hits.

    Each line of ``file_path`` is treated as one danmaku. A line counts once
    for every keyword it mentions (case-insensitively). The ``top_n`` most
    frequent keywords are printed with their counts, and the stripped lines
    mentioning each of them are written to ``output_excel`` with one column
    per keyword.

    Args:
        file_path: Path of the UTF-8 encoded text file to scan.
        output_excel: Path of the Excel workbook to write.
        top_n: How many of the most frequent keywords to report and export.

    Returns:
        A ``Counter`` mapping every matched keyword to the number of lines
        that mention it.

    Raises:
        Nothing is caught here: errors raised by ``open`` or by
        ``DataFrame.to_excel`` propagate to the caller.
    """
    # Keywords (Chinese and English) considered related to AI technology.
    ai_keywords = [
        "AI", "人工智能", "Machine learning", "机器学习", "Deep learning", "深度学习",
        "Neural network", "神经网络", "自然语言处理", "Natural language processing",
        "计算机视觉", "Computer vision", "Robotics", "机器人", "自动化", "Automation",
        "人脸识别", "Face recognition", "大数据", "数据挖掘", "智能系统", "自动驾驶", "无人驾驶"
    ]
    # Build each keyword's matcher once instead of lower-casing it per line.
    matchers = [(keyword, _compile_keyword_matcher(keyword)) for keyword in ai_keywords]

    keyword_count = Counter()
    # Danmaku lines that mention each keyword, in file order.
    keyword_danmakus = {keyword: [] for keyword in ai_keywords}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Lower-case and strip once per line, not once per keyword.
            lowered = line.lower()
            danmaku = line.strip()
            for keyword, mentions in matchers:
                if mentions(lowered):
                    keyword_count[keyword] += 1
                    keyword_danmakus[keyword].append(danmaku)

    # Rank once and reuse the result for both the console report and the export.
    ranking = keyword_count.most_common(top_n)
    print(f"AI 技术相关的前{top_n}条弹幕关键词统计")
    for keyword, count in ranking:
        print(f"{keyword}: {count} 条弹幕")

    # Wrapping each list in a Series lets pandas pad the shorter columns, so
    # keywords with different numbers of danmaku can share one sheet.
    columns = {keyword: pd.Series(keyword_danmakus[keyword]) for keyword, _ in ranking}
    df = pd.DataFrame(columns)
    df.to_excel(output_excel, index=False)
    print(f"弹幕已保存至 {output_excel}")
    return keyword_count
# Input text file to scan and the Excel workbook to produce.
file_path = "danmakus_2024_olympics.txt"
output_excel = "top_ai_danmakus.xlsx"

# Run the keyword statistics and save the matching danmaku to Excel.
count_ai_keywords(file_path=file_path, output_excel=output_excel)