Update 关键词检测.py

main
ptlwmhjs9 2 months ago
parent abbc2a9906
commit 7543605720

@ -1,46 +1,46 @@
# 2.2 Data statistics
"""Count danmaku lines that mention AI and export the 20 most frequent ones.

A line is counted when it contains the standalone token "ai" (any letter
case) sandwiched between two CJK characters, or the phrase "人工智能".
"""
import collections
import re

# Keyword list. Matching is case-insensitive, so 'AI' and 'ai' are equivalent;
# both are kept so the list stays compatible with earlier versions.
keywords = ['AI', "人工智能", 'ai']

# Input file and the two output files.
INPUT_PATH = 'MID、BVID、CID及弹幕.txt'
RANKED_OUTPUT_PATH = 'AI弹幕.txt'
WORDCLOUD_OUTPUT_PATH = 'AI弹幕·生成词云图用.txt'

# "ai" with a CJK character directly on each side. Applied to the lowercased
# line, so it also covers "AI", "Ai" and "aI". Requiring CJK neighbours keeps
# the letters "ai" inside longer Latin/alphanumeric runs from matching.
# NOTE(review): an "AI" at the very start or end of a line does not match
# either; this mirrors the previous behaviour -- confirm that is intended.
AI_IN_CJK_CONTEXT = re.compile(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]')


def line_mentions_keyword(line: str, terms=None) -> bool:
    """Return True if *line* contains any of the keywords.

    Args:
        line: One raw line of text.
        terms: Keywords to look for; defaults to the module-level
            ``keywords`` list. The keyword "ai" (in any case) is matched
            only between CJK characters; every other keyword is matched as
            a case-insensitive substring.
    """
    if terms is None:
        terms = keywords
    lowered = line.lower()
    for term in terms:
        needle = term.lower()
        if needle == 'ai':
            # The earlier version compared the lowercased keyword with 'AI',
            # which could never be true; only this lowercase path ever ran.
            if AI_IN_CJK_CONTEXT.search(lowered):
                return True
        elif needle in lowered:
            return True
    return False


def count_matching_lines(lines, terms=None) -> collections.Counter:
    """Count how often each keyword-bearing line occurs.

    Args:
        lines: Iterable of raw text lines (for example an open text file).
        terms: Optional keyword override, passed to
            ``line_mentions_keyword``.

    Returns:
        A Counter keyed by the whitespace-stripped line. Each input line
        contributes at most one count, however many keywords it contains.
    """
    counter = collections.Counter()
    for line in lines:
        if line_mentions_keyword(line, terms):
            counter[line.strip()] += 1
    return counter


def main():
    """Read the danmaku file, rank matching lines, and write both outputs."""
    # Stream the file line by line instead of loading it all into memory.
    with open(INPUT_PATH, 'r', encoding='utf-8') as source:
        counter = count_matching_lines(source)

    # The twenty most frequent matching lines.
    most_common_lines = counter.most_common(20)

    # Ranked output with counts, echoed to stdout.
    with open(RANKED_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            f.write(f'{line}: {count}\n')
            print(f"{line}: {count}")

    # Text-only output, used as input for word-cloud generation.
    with open(WORDCLOUD_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        for line, _count in most_common_lines:
            f.write(f'{line}\n')


if __name__ == "__main__":
    main()

Loading…
Cancel
Save