# 2.2 Data statistics
import collections
import re

# Keywords that mark a line as AI-related. Matching is case-insensitive, so
# 'AI' and 'ai' describe the same rule; both are kept for compatibility.
keywords = ['AI', "人工智能", 'ai']

# The Latin keyword "ai" only counts when it sits directly between two CJK
# ideographs, so English words such as "said" or "rain" are not matched.
# The pattern is applied to the lower-cased line.
# NOTE(review): an "AI" at the very start or end of a line (no ideograph on
# one side) is NOT matched -- this mirrors the original rule; confirm that
# this is the intended behaviour.
_AI_BETWEEN_IDEOGRAPHS = re.compile(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]')


def mentions_keyword(line, terms=None):
    """Return True if *line* mentions one of the keywords.

    Args:
        line: One line of text (a trailing newline is fine).
        terms: Keywords to look for; defaults to the module-level
            ``keywords`` list.

    Returns:
        True when the lower-cased line contains "ai" directly between two
        CJK ideographs (for the keyword "ai" in any letter case), or contains
        any other keyword as a case-insensitive substring.
    """
    lowered = line.lower()
    for term in keywords if terms is None else terms:
        needle = term.lower()
        if needle == 'ai':
            # Plain substring search would hit unrelated English words.
            if _AI_BETWEEN_IDEOGRAPHS.search(lowered):
                return True
        elif needle in lowered:
            return True
    return False


def count_keyword_lines(lines, terms=None):
    """Count how often each keyword-bearing line occurs.

    Args:
        lines: Iterable of text lines.
        terms: Keywords to look for; defaults to the module-level
            ``keywords`` list.

    Returns:
        A ``collections.Counter`` mapping each matching line (stripped of
        surrounding whitespace) to its number of occurrences. A line is
        counted once per occurrence regardless of how many keywords it holds.
    """
    tally = collections.Counter()
    for line in lines:
        if mentions_keyword(line, terms):
            tally[line.strip()] += 1
    return tally


if __name__ == "__main__":
    # Read the raw text file.
    with open('MID、BVID、CID及弹幕.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()

    counter = count_keyword_lines(lines)

    # The twenty most frequent matching lines.
    most_common_lines = counter.most_common(20)

    # Human-readable report: "<line>: <count> 次", echoed to stdout.
    with open('AI弹幕.txt', 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            f.write(f'{line}: {count} 次\n')
            print(f"{line}: {count} 次")

    # Bare lines only, used as input for word-cloud generation.
    with open('AI弹幕·生成词云图用.txt', 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            f.write(f'{line}\n')