You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

51 lines
2.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
from collections import Counter

import pandas as pd
# Read a TXT file of danmaku and count the AI-related keywords it mentions.
def count_ai_keywords(file_path, output_excel):
    """Count AI-related keywords in a danmaku file and export the top matches.

    Every line of ``file_path`` is treated as one danmaku (bullet comment).
    A line is counted once for each keyword it mentions, ignoring case. The
    eight most frequent keywords are printed with their counts, and the
    danmaku that mention them are written to ``output_excel`` with one column
    per keyword.

    Args:
        file_path: Path of a UTF-8 text file holding one danmaku per line.
        output_excel: Path of the Excel workbook to write.

    Returns:
        None. Results are printed and saved to ``output_excel``.
    """
    # Keywords (Chinese and English) that mark a danmaku as AI-related.
    ai_keywords = [
        "AI", "人工智能", "Machine learning", "机器学习", "Deep learning", "深度学习",
        "Neural network", "神经网络", "自然语言处理", "Natural language processing",
        "计算机视觉", "Computer vision", "Robotics", "机器人", "自动化", "Automation",
        "人脸识别", "Face recognition", "大数据", "数据挖掘", "智能系统", "自动驾驶", "无人驾驶"
    ]

    # English keywords must not sit inside a longer ASCII word; a plain
    # substring test would count "AI" in "Paris" or "rain". Only ASCII letters
    # act as boundaries, so "AI技术" and "AI2024" still match, and an optional
    # trailing "s" keeps plurals such as "neural networks" matching.
    # Chinese keywords remain plain substring matches.
    keyword_patterns = []
    for keyword in ai_keywords:
        needle = keyword.lower()
        if re.fullmatch(r"[a-z ]+", needle):
            regex = rf"(?<![a-z]){re.escape(needle)}s?(?![a-z])"
        else:
            regex = re.escape(needle)
        keyword_patterns.append((keyword, re.compile(regex)))

    # Number of danmaku mentioning each keyword.
    keyword_count = Counter()
    # Danmaku text grouped by the keyword it mentions.
    keyword_danmakus = {keyword: [] for keyword in ai_keywords}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Normalise once per line instead of once per keyword.
            lowered = line.lower()
            danmaku = line.strip()
            for keyword, pattern in keyword_patterns:
                if pattern.search(lowered):
                    keyword_count[keyword] += 1
                    keyword_danmakus[keyword].append(danmaku)

    # The eight most frequent keywords, most frequent first.
    top_counts = keyword_count.most_common(8)
    print("AI 技术相关的前8条弹幕关键词统计")
    for keyword, count in top_counts:
        print(f"{keyword}: {count} 条弹幕")

    # One column per top keyword; wrapping each list in a Series lets columns
    # of different lengths share a single DataFrame (shorter ones are padded).
    df = pd.DataFrame(
        {keyword: pd.Series(keyword_danmakus[keyword]) for keyword, _ in top_counts}
    )
    df.to_excel(output_excel, index=False)
    print(f"弹幕已保存至 {output_excel}")
# Run the analysis only when this file is executed as a script, so importing
# the module does not read the input file or write the workbook.
if __name__ == "__main__":
    # Input danmaku text file and output Excel workbook.
    file_path = "danmakus_2024_olympics.txt"
    output_excel = "top_ai_danmakus.xlsx"
    # Count the keywords and save the matching danmaku to Excel.
    count_ai_keywords(file_path, output_excel)