diff --git a/2.2.py b/2.2.py
deleted file mode 100644
index 433e01d..0000000
--- a/2.2.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import pandas as pd
-from collections import Counter
-
-# Read the TXT file and count AI-related keywords
-def count_ai_keywords(file_path, output_excel):
-    # Define the list of AI-technology-related keywords (English and Chinese)
-    ai_keywords = [
-        "AI", "人工智能", "Machine learning", "机器学习", "Deep learning", "深度学习",
-        "Neural network", "神经网络", "自然语言处理", "Natural language processing",
-        "计算机视觉", "Computer vision", "Robotics", "机器人", "自动化", "Automation",
-        "人脸识别", "Face recognition", "大数据", "数据挖掘", "智能系统", "自动驾驶", "无人驾驶"
-    ]
-
-    # Counter for per-keyword hit counts
-    keyword_count = Counter()
-    keyword_danmakus = {keyword: [] for keyword in ai_keywords}  # danmaku lines containing each keyword
-
-    # Read the file line by line
-    with open(file_path, 'r', encoding='utf-8') as file:
-        for line in file:
-            # For each keyword, count the danmaku lines that contain it and record them
-            for keyword in ai_keywords:
-                if keyword.lower() in line.lower():  # match keywords case-insensitively
-                    keyword_count[keyword] += 1
-                    keyword_danmakus[keyword].append(line.strip())  # add the danmaku to that keyword's list
-
-    # Take the top 8 keywords by count
-    top_keywords = [keyword for keyword, _ in keyword_count.most_common(8)]
-
-    print("Top 8 AI-related keyword counts in the danmaku:")
-    for keyword, count in keyword_count.most_common(8):
-        print(f"{keyword}: {count} danmaku")
-
-    # Build a DataFrame storing the danmaku of the top 8 keywords as columns
-    df_dict = {}
-    for keyword in top_keywords:
-        df_dict[keyword] = keyword_danmakus[keyword]
-
-    # Write the danmaku to Excel, one column per keyword
-    df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_dict.items()]))
-    df.to_excel(output_excel, index=False)
-
-    print(f"Danmaku saved to {output_excel}")
-
-# File paths
-file_path = "danmakus_2024_olympics.txt"
-output_excel = "top_ai_danmakus.xlsx"
-
-# Run the count and save the results to Excel
-count_ai_keywords(file_path, output_excel)