You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
41 lines
1.5 KiB
41 lines
1.5 KiB
from collections import Counter
|
|
import pandas as pd
|
|
|
|
# Load the exported danmaku (bullet comments) from the Excel workbook.
df = pd.read_excel('danmaku_content.xlsx', engine='openpyxl')

# The danmaku text is taken from the 'content' column of that sheet.
danmaku_column = df['content']
|
|
|
|
def filter_ai_danmaku(danmaku_series, keywords):
    """Return the danmaku that contain at least one of the given keywords.

    Args:
        danmaku_series: Iterable of danmaku values (e.g. a pandas Series).
            Missing values (as judged by ``pd.notna``) are skipped; every
            other value is converted with ``str()`` before matching.
        keywords: Iterable of substrings to look for. Matching is a plain,
            case-sensitive ``in`` test.

    Returns:
        A list of the matching danmaku as strings, in their original order.
        Duplicates are kept so that callers can count occurrences.
    """
    # Materialise once: a one-shot iterator would otherwise be exhausted
    # after the first danmaku and silently match nothing afterwards.
    keywords = tuple(keywords)

    return [
        text
        for text in (str(item) for item in danmaku_series if pd.notna(item))
        if any(keyword in text for keyword in keywords)
    ]
|
|
|
|
# Keywords that mark a danmaku as AI/technology related. Matching is done by
# substring, so some entries overlap (e.g. "AI技术" is already covered by "AI").
ai_keywords = ["AI", "AI技术", "机器学习", "深度学习", "智能", "VR/AR", "全景直播", "360度", "3D", "追踪" ,"虚拟", "数字", "人工智能", "面部识别", "云技术", "安保", "检测", "监测", "福州大学"]

# Number of most frequent danmaku to report and export.
TOP_N = 15

# Keep only the danmaku that mention at least one keyword.
filtered_danmaku = filter_ai_danmaku(danmaku_column, ai_keywords)

# Count how many times each distinct danmaku text occurs.
danmaku_count = Counter(filtered_danmaku)

# Take the TOP_N most frequent danmaku as (text, count) pairs.
sorted_danmaku = danmaku_count.most_common(TOP_N)

# Print the ranking, starting from rank 1.
for i, (danmaku, count) in enumerate(sorted_danmaku, 1):
    print(f"排名 {i}: {danmaku} - 数量: {count}")

# Convert the ranking into a two-column DataFrame.
danmaku_df = pd.DataFrame(sorted_danmaku, columns=['弹幕内容', '数量'])

# Export the ranking to an Excel file (no index column).
danmaku_df.to_excel('ai_danmaku_statistics.xlsx', index=False, engine='openpyxl')