You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
102201401/2024巴黎奥运会弹幕前八.py

36 lines
1021 B

import pandas as pd
from openpyxl import Workbook
import re
def normalize_bullet_comment(commet: str) -> str:
    """Collapse laughter-style bullet comments into one canonical form.

    Any comment containing a run of two or more consecutive '哈' characters
    is replaced by the fixed string '哈哈哈', so laughter of different
    lengths is counted as the same comment. Every other comment is returned
    unchanged.

    Args:
        commet: The bullet comment text. (The parameter name is kept as
            spelled in the original signature.)

    Returns:
        '哈哈哈' if the comment contains at least two consecutive '哈',
        otherwise the input string itself.
    """
    # The previous pattern r'{2,}' had no character in front of the
    # quantifier, which makes `re` raise "nothing to repeat" on every call.
    # '哈' is the character implied by the canonical return value below.
    if re.search(r'哈{2,}', commet):
        return '哈哈哈'
    return commet
def is_airelated(comment: str) -> bool:
    """Return whether a bullet comment mentions an AI-related keyword.

    The check is a plain substring test against a fixed keyword list. It is
    case-insensitive so that 'AI', 'Ai' and 'ai' are all recognised.

    Args:
        comment: The bullet comment text to inspect.

    Returns:
        True if at least one keyword occurs in the comment, else False.
    """
    ai_keywords = ('ai', '科技', '机器', '个性化', '人机')
    # Lowercase once up front: the keywords are already lowercase, and the
    # Chinese keywords are unaffected by case conversion.
    lowered = comment.lower()
    return any(keyword in lowered for keyword in ai_keywords)
# Load the raw bullet comments; every line of the text file is one comment.
with open('巴黎弹幕.txt', 'r', encoding='utf-8') as file:
    raw_lines = list(file)

# Trim surrounding whitespace, then merge near-duplicate comments.
bullet_comments = [
    normalize_bullet_comment(raw_line.strip()) for raw_line in raw_lines
]

# Keep only the comments that mention an AI-related keyword.
ai_related_comments = list(filter(is_airelated, bullet_comments))

# Count identical comments and keep the eight most frequent ones.
count_series = pd.Series(ai_related_comments).value_counts()
top_8 = count_series.head(8)

# Build a two-column table (comment text, occurrence count) and write it
# to an Excel workbook without the index column.
columns = {'弹幕': top_8.index, '数量': top_8.values}
df = pd.DataFrame(columns)
df.to_excel(
    '弹幕_aiTop8.xlsx',
    index=False,
    sheet_name='Top 8 Bullet Comments',
)