You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import os
import re
from collections import Counter
import pandas as pd
# Extract the danmaku (bullet comments) from the txt files and store them in a list.
def read_danmu_files(directory: str) -> list:
    """Collect every danmaku line from the matching text files in *directory*.

    Only files whose name ends with ``paris_olympics_danmak.txt`` are read.
    Files are decoded as UTF-8 and processed in sorted filename order so the
    result does not depend on the order in which the OS lists the directory.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of the raw lines of all matching files. Line endings are kept,
        exactly as ``file.readlines()`` would return them.
    """
    danmu_data = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('paris_olympics_danmak.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                # Iterating the file yields the same lines as readlines().
                danmu_data.extend(file)
    return danmu_data
def count_danmu():
    """Export the eight most frequent AI-related danmaku to an Excel file.

    Loads all danmaku lines from the current directory via
    ``read_danmu_files('./')``, keeps the ones that mention "AI" (not embedded
    in a longer run of Latin letters) or "人工智能", counts how often each
    distinct comment occurs, and writes the text of the eight most frequent
    ones to ``2025巴黎奥运AI弹幕统计.xlsx`` in a single column named
    ``弹幕内容``.

    Returns:
        None. The result is the Excel file written to the working directory.
    """
    # Match "AI" only when it is not part of a longer Latin word
    # (e.g. "MAIN" or "AIR"), or the Chinese term for artificial intelligence.
    ai_pattern = re.compile(r'(?<![a-zA-Z])((?:AI|人工智能))(?![a-zA-Z])')

    danmu_data = read_danmu_files('./')  # current directory

    # Drop line endings first: otherwise the same comment with and without a
    # trailing newline is counted as two different comments, and the newline
    # ends up inside the Excel cell.
    ai_danmaku = [
        text
        for text in (line.rstrip('\r\n') for line in danmu_data)
        if ai_pattern.search(text)
    ]

    # Counter tallies in a single pass (list.count per unique item is
    # quadratic); most_common sorts by count descending and keeps
    # first-seen order for ties, so the output is deterministic.
    top_8_danmaku = Counter(ai_danmaku).most_common(8)

    # Write the data to an Excel sheet.
    data = {'弹幕内容': [danmaku for danmaku, _ in top_8_danmaku]}
    df = pd.DataFrame(data)
    df.to_excel('2025巴黎奥运AI弹幕统计.xlsx', index=False)