|
|
|
@ -1,50 +0,0 @@
|
|
|
|
|
"""Software-engineering individual assignment: keyword detection to Excel.

Reads danmaku (bullet comments) from a text file, keeps the lines that
mention one of the configured keywords, counts how often each distinct line
occurs, and writes the 20 most frequent lines (with a dense rank) to an
Excel workbook.
"""

import collections
import re

import pandas as pd

# Keywords to detect. Matching is case-insensitive, so 'AI' and 'ai' are
# equivalent; duplicates are collapsed when the pattern is built.
keywords = ['AI', "人工智能", 'ai']

INPUT_PATH = 'all_danmaku.txt'
OUTPUT_PATH = '关键词检测结果.xlsx'
TOP_N = 20


def _build_keyword_pattern(words):
    """Compile a single regex that matches any of *words* in lowercased text.

    Keywords made only of ASCII letters (e.g. "ai") must appear as a
    standalone word: they may not be directly preceded or followed by another
    ASCII letter, so "said" or "wait" do not count, while "AI", "AI真厉害",
    "这是ai" and "用 ai 画" do. Any other keyword (e.g. "人工智能") is matched
    as a plain substring.
    """
    alternatives = []
    # dict.fromkeys de-duplicates while keeping the configured order.
    for word in dict.fromkeys(w.lower() for w in words):
        escaped = re.escape(word)
        if re.fullmatch(r'[a-z]+', word):
            # Lookarounds instead of requiring a CJK neighbour on both sides:
            # the keyword may sit at the start/end of a line or next to
            # punctuation, digits or whitespace.
            alternatives.append(r'(?<![a-z])' + escaped + r'(?![a-z])')
        else:
            alternatives.append(escaped)
    if not alternatives:
        # An empty alternation would match every line; match nothing instead.
        return re.compile(r'(?!)')
    return re.compile('|'.join(alternatives))


def count_keyword_lines(lines, words=None):
    """Count occurrences of each line that mentions a keyword.

    Args:
        lines: iterable of text lines (trailing newlines are allowed).
        words: keywords to look for; defaults to the module-level
            ``keywords`` list.

    Returns:
        A ``collections.Counter`` mapping each matching line (stripped of
        surrounding whitespace) to the number of times it occurs. A line is
        counted once per occurrence, no matter how many keywords it contains.
    """
    pattern = _build_keyword_pattern(keywords if words is None else words)
    # Lowercase the line so the comparison is case-insensitive.
    return collections.Counter(
        line.strip() for line in lines if pattern.search(line.lower())
    )


def build_ranking(counter, top_n=TOP_N):
    """Return a DataFrame of the *top_n* most frequent lines with their rank.

    Columns are, in order: rank ('排名'), sentence ('句子') and number of
    occurrences ('出现次数'). Ranks are dense, so lines with the same count
    share a rank and no rank numbers are skipped.
    """
    df = pd.DataFrame(counter.most_common(top_n), columns=['句子', '出现次数'])
    df['排名'] = df['出现次数'].rank(method='dense', ascending=False).astype(int)
    return df[['排名', '句子', '出现次数']]


def main():
    """Run the pipeline: read the danmaku file, rank lines, write the Excel file."""
    with open(INPUT_PATH, 'r', encoding='utf-8') as file:
        counter = count_keyword_lines(file)

    df = build_ranking(counter)
    df.to_excel(OUTPUT_PATH, index=False)

    print("Excel文件已生成:关键词检测结果.xlsx")


if __name__ == "__main__":
    main()
|