You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.9 KiB

import os
from collections import Counter
import pandas as pd
# Keywords that mark a danmu as related to AI technology.
# Matching is a case-sensitive substring test, hence both "AI" and "ai".
ai_keywords = [
    "AI",
    "人工智能",
    "深度学习",
    "机器学习",
    "神经网络",
    "算法",
    "大模型",
    "ai",
]
def read_danmu(folder_path):
    """Read every ``.txt`` file in *folder_path* and collect its danmu lines.

    Each non-blank line (after stripping surrounding whitespace) becomes one
    entry. The file name without its ``.txt`` suffix is recorded as the
    source; the script assumes this stem is the video's BV number.

    Args:
        folder_path: Directory containing the danmu text files.

    Returns:
        A list of ``(danmu_text, bv_number)`` tuples, ordered by file name
        and then by line order within each file.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    all_danmu = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any tie-breaking in later frequency counts) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue
        # Strip the ".txt" suffix to obtain the source identifier.
        bv_number = os.path.splitext(filename)[0]
        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
        # str.strip() would otherwise leave glued to the first danmu.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                text = line.strip()
                if text:
                    all_danmu.append((text, bv_number))
    return all_danmu
def filter_ai_related_danmu(danmu, keywords):
    """Select the danmu whose text contains at least one keyword.

    Args:
        danmu: Iterable of ``(text, source)`` pairs.
        keywords: Iterable of substrings to look for (case-sensitive).

    Returns:
        A list of ``(text, source)`` tuples, in input order, for every pair
        whose text contains any of the keywords.
    """
    matched = []
    for text, source in danmu:
        for keyword in keywords:
            if keyword in text:
                matched.append((text, source))
                break  # one hit is enough; do not add the pair twice
    return matched
def count_danmu(danmu_list):
    """Count how many times each danmu text occurs.

    Args:
        danmu_list: Iterable of ``(text, source)`` pairs; the source is ignored.

    Returns:
        A ``Counter`` mapping each danmu text to its number of occurrences.
    """
    tally = Counter()
    for text, _ in danmu_list:
        tally[text] += 1
    return tally
def save(danmu_list, output_file):
    """Write per-danmu statistics to an Excel file.

    One row is produced per distinct danmu text, in order of first
    appearance, with three columns: the text, its occurrence count, and a
    comma-separated list of the sources it appeared in.

    Args:
        danmu_list: Iterable of ``(text, source)`` pairs.
        output_file: Destination passed to ``DataFrame.to_excel``.

    Returns:
        None. The file is written and a confirmation line is printed.
    """
    # Gather counts and sources in a single pass instead of rescanning the
    # whole list once per distinct danmu.
    counts = Counter()
    sources_by_danmu = {}
    for danmu, source in danmu_list:
        counts[danmu] += 1
        sources_by_danmu.setdefault(danmu, set()).add(source)

    # Sort the sources so the joined text is identical from run to run
    # (set iteration order for strings is not stable across processes).
    data = [
        [danmu, count, ", ".join(sorted(sources_by_danmu[danmu]))]
        for danmu, count in counts.items()
    ]

    df = pd.DataFrame(data, columns=['弹幕内容', '出现次数', '来源BV号'])
    df.to_excel(output_file, index=False)
    print(f"统计结果已保存至 {output_file}")
def print_top_n_danmu(danmu_list, top_n=8):
    """Print the most frequent danmu texts together with their counts.

    Args:
        danmu_list: Iterable of ``(text, source)`` pairs.
        top_n: How many of the most common danmu to print (default 8).
    """
    # Counter.most_common returns (text, count) pairs, highest count first.
    ranking = count_danmu(danmu_list).most_common(top_n)
    print(f"数量排名前 {top_n} 的弹幕:")
    for text, occurrences in ranking:
        print(f"弹幕内容: {text}, 出现次数: {occurrences}")
if __name__ == '__main__':
    # Load every danmu from the collection folder, then keep the AI-related ones.
    folder_path = 'E:/前端/软件工程/弹幕收集按序/'
    all_danmu = read_danmu(folder_path)
    ai_related_danmu = filter_ai_related_danmu(all_danmu, ai_keywords)

    if ai_related_danmu:
        # Show the 8 most frequent AI-related danmu on the console.
        print_top_n_danmu(ai_related_danmu, top_n=8)
        # Export every AI-related danmu with its count and sources to Excel.
        output_file = 'E:/前端/软件工程/AI技术弹幕统计结果8_with_BV号.xlsx'
        save(ai_related_danmu, output_file)
    else:
        print("没有找到与AI技术应用相关的弹幕。")