From 75436057202f856380c0dcb2911fab73e6a2f5dd Mon Sep 17 00:00:00 2001 From: ptlwmhjs9 <2869950691@qq.com> Date: Wed, 18 Sep 2024 19:16:56 +0800 Subject: [PATCH] =?UTF-8?q?Update=20=E5=85=B3=E9=94=AE=E8=AF=8D=E6=A3=80?= =?UTF-8?q?=E6=B5=8B.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 关键词检测.py | 92 +++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/关键词检测.py b/关键词检测.py index 43c3f3a..9136f51 100644 --- a/关键词检测.py +++ b/关键词检测.py @@ -1,46 +1,46 @@ -#2.2 数据统计 -import collections -import re - -# 定义关键词列表 -keywords = ['AI', "人工智能", 'ai'] - -# 读取文本文件 -with open('MID、BVID、CID及弹幕.txt', 'r', encoding='utf-8') as file: - lines = file.readlines() - -# 初始化一个Counter对象 -counter = collections.Counter() - -# 遍历每一行,检测关键词并计数 -for line in lines: - line_lower = line.lower() # 将行转换为小写以进行不区分大小写的匹配 - for keyword in keywords: - # 使用正则表达式匹配独立的关键词 - if keyword.lower() == 'AI': - # 只在中文字符的上下文中匹配独立的“AI”单词 - if re.search(r'[\u4e00-\u9fff]AI[\u4e00-\u9fff]', line_lower): - counter[line.strip()] += 1 - break # 避免同一行多次计数 - else: - if keyword.lower() == 'ai': - # 只在中文字符的上下文中匹配独立的“ai”单词 - if re.search(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]', line_lower): - counter[line.strip()] += 1 - break # 避免同一行多次计数 - else: - if keyword.lower() in line_lower: - counter[line.strip()] += 1 - break # 避免同一行多次计数 - -# 获取出现次数最多的前二十个句子 -most_common_lines = counter.most_common(20) - -# 输出结果到文件 -with open('AI弹幕.txt', 'w', encoding='utf-8') as f: - for line, count in most_common_lines: - f.write(f'{line}: {count} 次\n') - print(f"{line}: {count} 次") -with open('AI弹幕(生成词云图用).txt', 'w', encoding='utf-8') as f: - for line, count in most_common_lines: - f.write(f'{line}\n') +#2.2 数据统计 +import collections +import re + +# 定义关键词列表 +keywords = ['AI', "人工智能", 'ai'] + +# 读取文本文件 +with open('MID、BVID、CID及弹幕.txt', 'r', encoding='utf-8') as file: + lines = file.readlines() + +# 初始化一个Counter对象 +counter = collections.Counter() + +# 遍历每一行,检测关键词并计数 +for line in lines: + line_lower = line.lower() # 将行转换为小写以进行不区分大小写的匹配 + for keyword in keywords: + # 使用正则表达式匹配独立的关键词 + if keyword.lower() == 'AI': + # 只在中文字符的上下文中匹配独立的“AI”单词 + if re.search(r'[\u4e00-\u9fff]AI[\u4e00-\u9fff]', line_lower): + counter[line.strip()] += 1 + break # 避免同一行多次计数 + else: + if keyword.lower() == 'ai': + # 只在中文字符的上下文中匹配独立的“ai”单词 + if re.search(r'[\u4e00-\u9fff]ai[\u4e00-\u9fff]', line_lower): + counter[line.strip()] += 1 + break # 避免同一行多次计数 + else: + if keyword.lower() in line_lower: + counter[line.strip()] += 1 + break # 避免同一行多次计数 + +# 获取出现次数最多的前二十个句子 +most_common_lines = counter.most_common(20) + +# 输出结果到文件 +with open('AI弹幕.txt', 'w', encoding='utf-8') as f: + for line, count in most_common_lines: + f.write(f'{line}: {count} 次\n') + print(f"{line}: {count} 次") +with open('AI弹幕·生成词云图用.txt', 'w', encoding='utf-8') as f: + for line, count in most_common_lines: + f.write(f'{line}\n')