parent
abbc2a9906
commit
7543605720
@ -1,46 +1,46 @@
|
|||||||
# 2.2 Data statistics
import collections
import re

# Keywords that mark a danmaku line as AI-related. Matching is
# case-insensitive, so 'AI' and 'ai' are equivalent entries.
keywords = ['AI', "人工智能", 'ai']

# Input file with one danmaku line per row, and the two report files.
INPUT_PATH = 'MID、BVID、CID及弹幕.txt'
RANKING_PATH = 'AI弹幕.txt'
WORDCLOUD_PATH = 'AI弹幕·生成词云图用.txt'

# Number of most frequent lines kept in the reports.
TOP_N = 20

# "ai" as an independent token: not glued to an ASCII letter on either side.
# This rejects "said" or "again" but accepts "AI" next to CJK characters,
# whitespace, punctuation, digits, or at the start/end of the line.
# The pattern is lowercase because it is applied to lowercased text.
_STANDALONE_AI = re.compile(r'(?<![a-z])ai(?![a-z])')


def mentions_keyword(line):
    """Return True if *line* mentions any entry of ``keywords``.

    The comparison is case-insensitive. The Latin keyword "ai" only counts
    when it stands alone (see ``_STANDALONE_AI``); every other keyword is a
    plain substring match.
    """
    lowered = line.lower()
    # Lowercasing collapses 'AI' and 'ai' into one needle; the set avoids
    # running the same check twice per line.
    for needle in {keyword.lower() for keyword in keywords}:
        if needle == 'ai':
            if _STANDALONE_AI.search(lowered):
                return True
        elif needle in lowered:
            return True
    return False


def count_keyword_lines(lines):
    """Count how often each keyword-bearing line occurs in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A ``collections.Counter`` keyed by the whitespace-stripped line.
        Each input line contributes at most one count, no matter how many
        keywords it contains.
    """
    counter = collections.Counter()
    for line in lines:
        if mentions_keyword(line):
            counter[line.strip()] += 1
    return counter


def main():
    """Read the danmaku file and write the top ``TOP_N`` AI-related lines."""
    # Stream the file line by line instead of loading it all with readlines().
    with open(INPUT_PATH, 'r', encoding='utf-8') as file:
        counter = count_keyword_lines(file)

    # The most frequent lines, as (line, count) pairs.
    most_common_lines = counter.most_common(TOP_N)

    # Ranking report: "<line>: <count> 次", echoed to stdout as well.
    with open(RANKING_PATH, 'w', encoding='utf-8') as f:
        for line, count in most_common_lines:
            f.write(f'{line}: {count} 次\n')
            print(f"{line}: {count} 次")

    # Bare lines without counts, used as input for the word cloud.
    with open(WORDCLOUD_PATH, 'w', encoding='utf-8') as f:
        for line, _count in most_common_lines:
            f.write(f'{line}\n')


if __name__ == '__main__':
    main()
|
||||||
|
Loading…
Reference in new issue