You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
#2.2 数据统计
import collections
import re
# Keywords that mark a danmaku (bullet-comment) line as AI-related.
# Matching is case-insensitive, so 'AI' and 'ai' are equivalent entries.
keywords = ['AI', "人工智能", 'ai']

# NOTE(review): the file names and format strings below were reconstructed
# from a whitespace-mangled copy of this script -- confirm them against the
# real files before running.
INPUT_PATH = 'MID、BVID、CID及弹幕.txt'
SUMMARY_PATH = 'AI弹幕.txt'
WORDCLOUD_PATH = 'AI弹幕(生成词云图用).txt'
TOP_N = 20

# The Latin keyword "ai" only counts when a CJK ideograph sits directly on
# both sides of it, so words that merely contain the letters ("said",
# "wait") are ignored.  Compiled once instead of once per line.
# NOTE(review): this also skips "AI" at the very start/end of a line or next
# to punctuation -- confirm that is intended.
_AI_BETWEEN_CJK = re.compile(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]')

# Lower-cased keywords with duplicates removed ('AI' and 'ai' collapse).
_NORMALIZED_KEYWORDS = tuple(dict.fromkeys(word.lower() for word in keywords))


def is_ai_related(line):
    """Return True if *line* mentions one of the configured keywords.

    The comparison is done on the lower-cased line.  The keyword ``ai`` must
    be surrounded by CJK ideographs; every other keyword matches as a plain
    substring.
    """
    lowered = line.lower()
    for word in _NORMALIZED_KEYWORDS:
        if word == 'ai':
            if _AI_BETWEEN_CJK.search(lowered):
                return True
        elif word in lowered:
            return True
    return False


def count_ai_lines(lines):
    """Count how often each AI-related line occurs in *lines*.

    Args:
        lines: iterable of text lines (trailing newlines are allowed).

    Returns:
        A ``collections.Counter`` keyed by the whitespace-stripped line.
        Each input line contributes at most one count, no matter how many
        keywords it contains.
    """
    return collections.Counter(
        line.strip() for line in lines if is_ai_related(line)
    )


def main():
    """Read the danmaku dump and write the most frequent AI-related lines.

    Two files are written: a summary with ``line: count`` pairs (also echoed
    to stdout) and a plain list of the same lines for word-cloud generation.
    """
    # Iterate the file lazily; there is no need to hold every line in memory.
    with open(INPUT_PATH, 'r', encoding='utf-8') as file:
        counter = count_ai_lines(file)

    most_common_lines = counter.most_common(TOP_N)

    with open(SUMMARY_PATH, 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            f.write(f'{line}: {count}次\n')
            print(f"{line}: {count}次")

    with open(WORDCLOUD_PATH, 'w', encoding='utf-8') as f:
        for line, _count in most_common_lines:
            f.write(f'{line}\n')


if __name__ == "__main__":
    main()