# web_crawler/关键词检测.py

#2.2 数据统计
import collections
import re

# Keywords that mark a line as AI-related. Matching is case-insensitive, so
# 'AI' and 'ai' collapse into a single keyword once normalized.
keywords = ['AI', "人工智能", 'ai']

# Default locations of the input dump and of the two generated reports.
INPUT_PATH = 'MID、BVID、CID及弹幕.txt'
RANKED_OUTPUT_PATH = 'AI弹幕.txt'
WORDCLOUD_OUTPUT_PATH = 'AI弹幕（生成词云图用）.txt'

# Lowercased keywords with duplicates removed (first-seen order kept).
_NORMALIZED_KEYWORDS = tuple(dict.fromkeys(k.lower() for k in keywords))

# 'ai' is only accepted when a CJK ideograph (U+4E00..U+9FFF) sits directly on
# both sides, so Latin text that merely contains the letters "ai" is ignored.
# The pattern is applied to an already lowercased line.
_AI_IN_CJK_CONTEXT = re.compile(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]')


def contains_keyword(line: str) -> bool:
    """Return True if *line* mentions one of the configured keywords.

    The line is lowercased first, so matching is case-insensitive. The
    keyword 'ai' must be enclosed by CJK ideographs on both sides; every
    other keyword is matched as a plain substring.

    NOTE(review): because of the two-sided CJK requirement, 'AI' at the very
    start or end of a line, or next to punctuation/whitespace, is not
    counted. This mirrors the original matching rule -- confirm it is
    intended.
    """
    lowered = line.lower()
    for keyword in _NORMALIZED_KEYWORDS:
        if keyword == 'ai':
            if _AI_IN_CJK_CONTEXT.search(lowered):
                return True
        elif keyword in lowered:
            return True
    return False


def count_keyword_lines(lines) -> collections.Counter:
    """Count how often each keyword-bearing line occurs in *lines*.

    *lines* is any iterable of strings (an open text file works). Lines are
    compared after stripping surrounding whitespace, and each input line is
    counted at most once no matter how many keywords it contains.
    """
    return collections.Counter(
        line.strip() for line in lines if contains_keyword(line)
    )


def main(input_path=INPUT_PATH, ranked_path=RANKED_OUTPUT_PATH,
         wordcloud_path=WORDCLOUD_OUTPUT_PATH, top_n=20):
    """Find the most frequent keyword lines and write both reports.

    Reads *input_path* as UTF-8, keeps the *top_n* most common matching
    lines, writes "<line>: <count> 次" entries to *ranked_path* (echoing each
    entry to stdout) and the bare lines to *wordcloud_path*.

    Returns the list of ``(line, count)`` pairs that was written.
    Raises OSError (e.g. FileNotFoundError) if a file cannot be opened.
    """
    # Iterate the file lazily instead of loading every line into a list.
    with open(input_path, 'r', encoding='utf-8') as source:
        counter = count_keyword_lines(source)

    most_common_lines = counter.most_common(top_n)

    # Ranked report: one "<line>: <count> 次" entry per line, also printed.
    with open(ranked_path, 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            entry = f'{line}: {count} 次'
            f.write(entry + '\n')
            print(entry)

    # Word-cloud input: the same lines without their counts.
    with open(wordcloud_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line, _ in most_common_lines)

    return most_common_lines


# Run the analysis only when executed as a script, so the helpers above can
# be imported without touching the filesystem.
if __name__ == '__main__':
    main()