You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
from collections import Counter
import pandas as pd
# Keywords that mark a danmu (bullet comment) as related to AI technology.
# Both "AI" and "ai" are listed because matching is a case-sensitive
# substring test.
ai_keywords = [
    "AI",
    "人工智能",
    "深度学习",
    "机器学习",
    "神经网络",
    "算法",
    "大模型",
    "ai",
]
def read_danmu(folder_path):
    """Read every ``.txt`` file in a folder and collect its danmu lines.

    Each non-blank line of each file is treated as one danmu. The file name
    without its ``.txt`` suffix is used as the source identifier (the BV
    number of the video the danmu came from).

    Args:
        folder_path: Directory containing the danmu ``.txt`` files.

    Returns:
        A list of ``(danmu_text, bv_number)`` tuples, with surrounding
        whitespace stripped from the text. Files are visited in sorted
        name order and lines keep their order within each file.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    all_danmu = []
    # os.listdir returns entries in arbitrary order; sort so that the result
    # (and everything derived from it) is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory whose name happens to end in ".txt" cannot be opened.
        if not os.path.isfile(file_path):
            continue
        # The file name minus the ".txt" suffix is taken to be the BV number.
        bv_number = os.path.splitext(filename)[0]
        # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading
        # BOM, which str.strip() would otherwise leave glued to the first
        # danmu of the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                text = line.strip()
                if text:
                    all_danmu.append((text, bv_number))
    return all_danmu
def filter_ai_related_danmu(danmu, keywords):
    """Keep only the danmu whose text contains at least one keyword.

    Matching is a case-sensitive substring test.

    Args:
        danmu: Iterable of ``(danmu_text, source)`` pairs.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the ``(danmu_text, source)`` pairs that matched, in their
        original order. Empty if nothing matches or no keywords are given.
    """
    # Materialise once so a one-shot iterator of keywords is not exhausted
    # after the first danmu has been checked.
    keywords = tuple(keywords)
    return [
        (text, source)
        for text, source in danmu
        if any(keyword in text for keyword in keywords)
    ]
def count_danmu(danmu_list):
    """Count how many times each danmu text occurs.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` pairs; the source
            is ignored.

    Returns:
        A ``collections.Counter`` mapping each danmu text to its number of
        occurrences.
    """
    # A generator feeds Counter directly, avoiding an intermediate list.
    return Counter(text for text, _ in danmu_list)
def save(danmu_list, output_file):
    """Write per-danmu counts and their source BV numbers to an Excel file.

    The sheet has one row per distinct danmu text with three columns: the
    text, how many times it occurs, and the comma-separated BV numbers of
    the files it came from.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` pairs, where
            ``source`` is a string.
        output_file: Destination path passed to ``DataFrame.to_excel``.

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        line is printed.
    """
    # Gather counts and sources in a single pass instead of rescanning the
    # whole list once per distinct danmu. Both mappings keep first-seen
    # order, so rows appear in order of first occurrence.
    danmu_counter = Counter()
    sources_by_danmu = {}
    for danmu, source in danmu_list:
        danmu_counter[danmu] += 1
        sources_by_danmu.setdefault(danmu, set()).add(source)

    # Sort the sources so the joined cell is identical from run to run
    # (iterating a set directly has no guaranteed order).
    data = [
        [danmu, count, ", ".join(sorted(sources_by_danmu[danmu]))]
        for danmu, count in danmu_counter.items()
    ]

    df = pd.DataFrame(data, columns=['弹幕内容', '出现次数', '来源BV号'])
    df.to_excel(output_file, index=False)
    print(f"统计结果已保存至 {output_file}")
def print_top_n_danmu(danmu_list, top_n=8):
    """Print the most frequent danmu texts together with their counts.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` pairs.
        top_n: How many of the most frequent danmu to print. Defaults to 8.

    Returns:
        None. Output goes to standard output.
    """
    danmu_counter = count_danmu(danmu_list)
    # most_common returns (text, count) pairs ordered from most to least
    # frequent.
    most_common_danmu = danmu_counter.most_common(top_n)
    print(f"数量排名前 {top_n} 的弹幕:")
    for danmu, count in most_common_danmu:
        print(f"弹幕内容: {danmu}, 出现次数: {count}")
# Script entry point: read all danmu, keep the AI-related ones, print the
# most frequent and export the full statistics to Excel.
if __name__ == '__main__':
    folder_path = 'E:/前端/软件工程/弹幕收集按序/'
    all_danmu = read_danmu(folder_path)
    ai_related_danmu = filter_ai_related_danmu(all_danmu, ai_keywords)
    if not ai_related_danmu:
        print("没有找到与AI技术应用相关的弹幕。")
    else:
        # Print the 8 most frequent AI-related danmu.
        print_top_n_danmu(ai_related_danmu, top_n=8)
        # Save every AI-related danmu with its count and source BV numbers
        # to an Excel file.
        output_file = 'E:/前端/软件工程/AI技术弹幕统计结果8_with_BV号.xlsx'
        save(ai_related_danmu, output_file)