You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

41 lines
1.5 KiB

from collections import Counter
import pandas as pd
# Load the danmaku spreadsheet (.xlsx) into a DataFrame via openpyxl.
df = pd.read_excel("danmaku_content.xlsx", engine="openpyxl")
# The danmaku text is read from the column named 'content'.
danmaku_column = df["content"]
def filter_ai_danmaku(danmaku_series, keywords):
    """Return the danmaku entries that contain at least one keyword.

    Every entry that is not missing (according to ``pd.notna``) is converted
    with ``str`` and kept when any keyword occurs in it as a case-sensitive
    substring.

    Args:
        danmaku_series: Iterable of danmaku values, e.g. a pandas Series.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the matching entries as strings, in input order, with
        duplicates preserved.
    """
    # Materialize once so a one-shot iterable (e.g. a generator) is not
    # exhausted after the first entry has been checked.
    keywords = tuple(keywords)
    candidates = (str(danmaku) for danmaku in danmaku_series if pd.notna(danmaku))
    return [text for text in candidates if any(keyword in text for keyword in keywords)]
# Substring keywords that mark a danmaku as AI/technology related.
ai_keywords = [
    "AI", "AI技术", "机器学习", "深度学习", "智能", "VR/AR", "全景直播",
    "360度", "3D", "追踪", "虚拟", "数字", "人工智能", "面部识别", "云技术",
    "安保", "检测", "监测", "福州大学",
]
# Number of most frequent danmaku to report and export.
TOP_N = 15

# Keep only the danmaku that mention at least one of the keywords.
filtered_danmaku = filter_ai_danmaku(danmaku_column, ai_keywords)
# Count how many times each distinct danmaku text occurs.
danmaku_count = Counter(filtered_danmaku)
# (text, count) pairs for the TOP_N most frequent danmaku, most frequent first.
sorted_danmaku = danmaku_count.most_common(TOP_N)
# Print the ranking, numbered from 1.
for i, (danmaku, count) in enumerate(sorted_danmaku, 1):
    print(f"排名 {i}: {danmaku} - 数量: {count}")
# Put the ranking into a two-column DataFrame.
danmaku_df = pd.DataFrame(sorted_danmaku, columns=['弹幕内容', '数量'])
# Export the ranking to an Excel file, without the index column.
danmaku_df.to_excel('ai_danmaku_statistics.xlsx', index=False, engine='openpyxl')