|
|
|
|
import os
|
|
|
|
|
from collections import Counter
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
# Keywords related to AI technology
|
|
|
|
|
ai_keywords = [
    "AI",
    "人工智能",
    "深度学习",
    "机器学习",
    "神经网络",
    "算法",
    "大模型",
    "ai",  # lowercase spelling of "AI"
]
|
|
|
|
|
|
|
|
|
|
def read_danmu(folder_path):
    """Read every ``.txt`` file in a folder and collect its danmu lines.

    Each non-blank line of a file is treated as one danmu. The file name
    without its extension is used as the source identifier (the file name is
    assumed to carry the BV id).

    Args:
        folder_path: Directory containing the danmu ``.txt`` files.

    Returns:
        A list of ``(danmu_text, bv_number)`` tuples, with surrounding
        whitespace removed from each danmu, ordered by file name and then by
        line order inside each file.
    """
    all_danmu = []
    # os.listdir() gives entries in arbitrary order; sorting keeps the result
    # order identical from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Drop the ".txt" suffix to obtain the source identifier.
        bv_number = os.path.splitext(filename)[0]
        # "utf-8-sig" reads plain UTF-8 as well, but also discards a leading
        # BOM; str.strip() does not remove it, so it would otherwise stay
        # attached to the first danmu of the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                danmu = line.strip()
                if danmu:
                    all_danmu.append((danmu, bv_number))
    return all_danmu
|
|
|
|
|
|
|
|
|
|
def filter_ai_related_danmu(danmu, keywords):
    """Keep only the danmu that mention at least one keyword.

    Matching is a case-insensitive substring test, so a single keyword such
    as "AI" also finds "ai" and "Ai".

    Args:
        danmu: Iterable of ``(danmu_text, source)`` tuples.
        keywords: Iterable of keyword strings.

    Returns:
        A list of the ``(danmu_text, source)`` tuples whose text contains any
        keyword, in their original order and with the text unchanged.
    """
    # Normalize the keywords once instead of once per danmu; the set also
    # drops keywords that differ only by case.
    folded_keywords = {keyword.casefold() for keyword in keywords}

    ai_related_danmu = []
    for text, source in danmu:
        folded_text = text.casefold()
        if any(keyword in folded_text for keyword in folded_keywords):
            ai_related_danmu.append((text, source))
    return ai_related_danmu
|
|
|
|
|
|
|
|
|
|
def count_danmu(danmu_list):
    """Count how many times each danmu text occurs.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` tuples; the source
            is ignored.

    Returns:
        A ``Counter`` mapping each danmu text to its number of occurrences.
    """
    tally = Counter()
    for text, _source in danmu_list:
        tally[text] += 1
    return tally
|
|
|
|
|
|
|
|
|
|
def save(danmu_list, output_file):
    """Write per-danmu counts and their source files to an Excel file.

    One row is written per distinct danmu text, in order of first
    appearance, with the columns: danmu text, number of occurrences, and the
    comma-separated source identifiers it came from.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` tuples.
        output_file: Destination passed to ``DataFrame.to_excel``.
    """
    # Gather counts and sources in a single pass; rescanning the whole list
    # for every distinct danmu is quadratic on large inputs.
    danmu_counter = Counter()
    sources_by_danmu = {}
    for danmu, source in danmu_list:
        danmu_counter[danmu] += 1
        sources_by_danmu.setdefault(danmu, set()).add(source)

    # Sets have no stable order, so sort the sources to keep the cell content
    # identical between runs when a danmu comes from several files.
    data = [
        [danmu, count, ", ".join(sorted(sources_by_danmu[danmu]))]
        for danmu, count in danmu_counter.items()
    ]

    df = pd.DataFrame(data, columns=['弹幕内容', '出现次数', '来源BV号'])
    df.to_excel(output_file, index=False)
    print(f"统计结果已保存至 {output_file}")
|
|
|
|
|
|
|
|
|
|
def print_top_n_danmu(danmu_list, top_n=8):
    """Print the most frequent danmu together with their counts.

    Args:
        danmu_list: Iterable of ``(danmu_text, source)`` tuples.
        top_n: How many of the most frequent danmu to print.
    """
    # Rank first, then print the header followed by one line per danmu.
    ranking = count_danmu(danmu_list).most_common(top_n)
    print(f"数量排名前 {top_n} 的弹幕:")
    for text, occurrences in ranking:
        print(f"弹幕内容: {text}, 出现次数: {occurrences}")
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Load every danmu from the input folder and keep the AI-related ones.
    folder_path = 'E:/前端/软件工程/弹幕收集按序/'
    all_danmu = read_danmu(folder_path)
    ai_related_danmu = filter_ai_related_danmu(all_danmu, ai_keywords)

    if ai_related_danmu:
        # Show the eight most frequent AI-related danmu.
        print_top_n_danmu(ai_related_danmu, top_n=8)

        # Save every AI-related danmu with its count and source BV ids.
        output_file = 'E:/前端/软件工程/AI技术弹幕统计结果8_with_BV号.xlsx'
        save(ai_related_danmu, output_file)
    else:
        print("没有找到与AI技术应用相关的弹幕。")
|