You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
36 lines
1021 B
36 lines
1021 B
3 months ago
|
import pandas as pd
|
||
|
from openpyxl import Workbook
|
||
|
import re
|
||
|
|
||
|
def normalize_bullet_comment(commet):
    """Collapse laughter-style bullet comments into one canonical form.

    Args:
        commet: The bullet-comment text to normalize.

    Returns:
        The fixed string '哈哈哈' when the text contains two or more
        consecutive '哈' characters anywhere; otherwise the text unchanged.
    """
    # Normalize similar bullet comments so they are counted as one entry.
    has_laughter = re.search(r'哈{2,}', commet) is not None
    return '哈哈哈' if has_laughter else commet
|
||
|
|
||
|
def is_airelated(comment):
    """Report whether a bullet comment mentions an AI-related keyword.

    Matching is a case-insensitive substring test, so 'AI', 'Ai' and 'ai'
    are all recognized.

    Args:
        comment: The bullet-comment text to inspect.

    Returns:
        True if any keyword occurs in the comment, otherwise False.
    """
    ai_keywords = ('ai', '科技', '机器', '个性化', '人机')
    # Lower-case once so the Latin keyword 'ai' also matches 'AI' / 'Ai';
    # the Chinese keywords are unaffected by case conversion.
    lowered = comment.lower()
    return any(keyword in lowered for keyword in ai_keywords)
|
||
|
|
||
|
# Read the bullet comments, one per line of the input file.
with open('巴黎弹幕.txt', mode='r', encoding='utf-8') as file:
    bullet_comments = list(file)

# Process the comments: trim surrounding whitespace, then normalize each one.
bullet_comments = list(map(normalize_bullet_comment, map(str.strip, bullet_comments)))

# Count the comments that contain an AI-related keyword and keep the
# eight most frequent ones (value_counts sorts by descending frequency).
ai_related_comments = list(filter(is_airelated, bullet_comments))
count_series = pd.Series(ai_related_comments).value_counts()
top_8 = count_series.iloc[:8]

# Build a two-column DataFrame (comment text, occurrence count) and save it to Excel.
df = pd.DataFrame({'弹幕': top_8.index, '数量': top_8.values})
df.to_excel('弹幕_aiTop8.xlsx', sheet_name='Top 8 Bullet Comments', index=False)
|
||
|
|
||
|
|