parent
1bc34bff47
commit
685ee40a1e
@ -0,0 +1,35 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import Workbook
|
||||||
|
import re
|
||||||
|
|
||||||
|
def normalize_bullet_comment(commet):
    """Normalize similar bullet comments into one canonical form.

    Any comment containing two or more consecutive "哈" characters is treated
    as laughter and replaced, as a whole, by the canonical string "哈哈哈".
    Every other comment is returned unchanged.

    Args:
        commet: The bullet-comment text (the parameter name is kept as-is
            for backward compatibility with existing callers).

    Returns:
        "哈哈哈" for laughter comments, otherwise the original text.
    """
    # NOTE(review): the whole comment is replaced, not just the laughter run,
    # so any other text in a laughing comment (keywords included) is dropped.
    if re.search(r'哈{2,}', commet):
        return '哈哈哈'
    return commet
|
||||||
|
|
||||||
|
def is_airelated(comment):
    """Return True when *comment* mentions an AI-related keyword.

    Upper-case "AI" and the Chinese terms are matched as plain substrings.
    Any other casing of "ai" ("ai", "Ai", "aI") only counts when it is not
    directly adjacent to another ASCII letter, so Latin words that merely
    contain the letters (e.g. "kawaii", "wait") are not counted.

    Args:
        comment: The bullet-comment text to inspect.

    Returns:
        True if the comment is AI related, False otherwise.
    """
    # These terms are distinctive enough for a plain substring test; this
    # also keeps compounds such as "OpenAI" or "AI绘画" matching.
    substring_keywords = ('AI', '人工智能', '机器学习', '深度学习', '神经网络')
    if any(keyword in comment for keyword in substring_keywords):
        return True

    # Explicit ASCII lookarounds are used instead of \b: for str patterns \b
    # treats CJK characters as word characters, which would reject "ai绘画".
    return re.search(r'(?<![A-Za-z])[aA][iI](?![A-Za-z])', comment) is not None
|
||||||
|
|
||||||
|
# Read the bullet comments (one per line) and normalize each stripped line.
with open('弹幕.txt', 'r', encoding='utf-8') as file:
    bullet_comments = [normalize_bullet_comment(line.strip()) for line in file]

# Keep only the comments that contain an AI-related keyword.
ai_related_comments = [comment for comment in bullet_comments if is_airelated(comment)]

# value_counts() orders by descending frequency, so head(8) is the top eight.
# dtype=object avoids relying on dtype inference when nothing matched.
count_series = pd.Series(ai_related_comments, dtype=object).value_counts()
top_8 = count_series.head(8)

# Build a two-column table (comment text, occurrence count) and save to Excel.
df = pd.DataFrame({
    '弹幕': top_8.index,
    '数量': top_8.values,
})

df.to_excel('aiTop8.xlsx', index=False, sheet_name='Top 8 Bullet Comments')
|
||||||
|
|
||||||
|
|
Loading…
Reference in new issue