"""Scrape Bilibili danmaku (bullet comments) for LLM-related videos and analyze them.

Pipeline:
  1. ``BilibiliDanmakuSpider`` searches Bilibili for keyword-matched videos,
     resolves each video's cid, and downloads its danmaku XML.
  2. ``DanmakuAnalyzer`` counts AI-related danmaku, exports the counts to
     Excel, renders a word cloud, and buckets comments into opinion categories.

NOTE(review): relies on undocumented Bilibili endpoints which may change or
require login cookies; the random sleeps are a best-effort anti-ban measure.
"""
import requests
import re
import json
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import time
import random
from openpyxl import Workbook

# Fonts capable of rendering CJK characters in matplotlib output.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False  # keep minus signs legible with CJK fonts

# Seconds before an HTTP request is abandoned (a hung socket would otherwise
# stall the entire crawl).
REQUEST_TIMEOUT = 10


class BilibiliDanmakuSpider:
    """Collects danmaku text from Bilibili search results."""

    def __init__(self):
        # A browser-like UA and Referer are required or the API rejects requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # Accumulates raw danmaku strings across all crawled videos.
        self.danmaku_list = []

    def get_video_ids(self, keyword, page_count=36):
        """Return up to 360 unique video aids for a search keyword.

        Each search page yields roughly 10 videos, so 36 pages target ~360.

        Args:
            keyword: search term passed to the Bilibili search API.
            page_count: number of result pages to fetch.

        Returns:
            De-duplicated list of integer video aids (at most 360).
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                url = (
                    "https://api.bilibili.com/x/web-interface/search/type"
                    f"?keyword={keyword}&search_type=video&page={page}"
                )
                response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
                data = response.json()

                if data["code"] == 0 and data["data"]["result"]:
                    for item in data["data"]["result"]:
                        video_ids.append(item["aid"])

                # Random delay to reduce the chance of anti-crawler blocking.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best-effort: a failed page is logged and skipped, not fatal.
                print(f"获取第{page}页视频ID失败: {e}")

        return list(set(video_ids))[:360]  # de-duplicate and cap at 360

    def get_danmakus(self, aid):
        """Download and store all danmaku for one video.

        Args:
            aid: Bilibili video aid.

        Returns:
            True on success, False if any request/parse step failed.
        """
        try:
            # Resolve the video's cid, which keys the danmaku XML endpoint.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
            response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
            cid = response.json()["data"]["cid"]

            # Fetch the danmaku XML document for this cid.
            danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(danmaku_url, headers=self.headers, timeout=REQUEST_TIMEOUT)
            response.encoding = "utf-8"

            # Each danmaku is a <d p="...">text</d> element in the XML.
            danmakus = re.findall(r'<d p="[^"]*">(.*?)</d>', response.text)
            self.danmaku_list.extend(danmakus)

            print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
            time.sleep(random.uniform(0.5, 1.5))
            return True

        except Exception as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False

    def run(self, keywords=None):
        """Crawl danmaku for every keyword and return the combined list.

        Args:
            keywords: search terms; defaults to LLM-related Chinese keywords.
                (``None`` sentinel avoids a shared mutable default list.)

        Returns:
            The accumulated list of danmaku strings.
        """
        if keywords is None:
            keywords = ["大语言模型", "大模型", "LLM"]

        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))

        # De-duplicate across keywords and cap the total at 360 videos.
        unique_video_ids = list(set(all_video_ids))[:360]
        print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...")

        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)

        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list


class DanmakuAnalyzer:
    """Counts, exports, and categorizes a list of danmaku strings."""

    def __init__(self, danmakus):
        # Drop empty / whitespace-only entries up front.
        self.danmakus = [d.strip() for d in danmakus if d.strip()]
        self.stopwords = self.load_stopwords()

    def load_stopwords(self):
        """Load stopwords from ``stopwords.txt``; fall back to a built-in set.

        Returns:
            A set of stopword strings used to filter word-cloud tokens.
        """
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                return {line.strip() for line in f}
        except OSError:
            # File missing/unreadable: use a minimal default Chinese stopword set.
            return set(["的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"])

    def count_danmakus(self):
        """Count all danmaku and the AI-related subset.

        Returns:
            Tuple ``(all_counter, ai_counter)`` of ``collections.Counter``
            mapping danmaku text to occurrence count.
        """
        # Keep only danmaku that mention at least one AI-related term.
        ai_related_words = ["AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据", "应用", "成本", "效率", "伦理", "隐私", "安全", "未来", "发展", "编程", "创作", "教育"]
        ai_danmakus = [d for d in self.danmakus if any(word in d for word in ai_related_words)]

        all_counter = Counter(self.danmakus)
        ai_counter = Counter(ai_danmakus)

        return all_counter, ai_counter

    def save_to_excel(self, ai_counter, filename="弹幕统计.xlsx"):
        """Write the AI-related danmaku counts to an Excel file.

        Args:
            ai_counter: Counter of danmaku text -> count.
            filename: output path (requires openpyxl as pandas' Excel engine).

        Returns:
            The DataFrame that was written.
        """
        df = pd.DataFrame(ai_counter.most_common(), columns=["弹幕内容", "出现次数"])
        df.to_excel(filename, index=False)
        print(f"统计结果已保存到{filename}")
        return df

    def generate_wordcloud(self, filename="弹幕词云.png"):
        """Segment all danmaku with jieba and render a word-cloud PNG.

        Args:
            filename: output image path.
        """
        # Tokenize, then drop stopwords and single-character tokens.
        all_text = " ".join(self.danmakus)
        words = jieba.cut(all_text)
        words = [word for word in words
                 if word not in self.stopwords and len(word) > 1]
        text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # replace with a Chinese font path on your system
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated bigrams in the cloud
        ).generate(text)

        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(filename, dpi=300)
        plt.show()
        print(f"词云图已保存到{filename}")

    def get_main_views(self, ai_counter):
        """Bucket danmaku counts into opinion categories by keyword matching.

        Each danmaku is assigned to the FIRST category (in declaration order)
        whose keyword list matches; unmatched danmaku fall into "其他观点".

        Args:
            ai_counter: Counter of danmaku text -> count.

        Returns:
            Dict mapping category name -> total matched danmaku count.
        """
        # Ordered (category, keywords) pairs — order defines match priority.
        category_keywords = [
            ("应用成本", ["成本", "价格", "费用", "免费", "付费"]),
            ("应用领域", ["应用", "使用", "场景", "领域", "行业", "教育", "医疗", "工作", "创作", "编程"]),
            ("不利影响", ["风险", "危险", "伦理", "隐私", "安全", "失业", "替代", "问题"]),
            ("积极影响", ["方便", "高效", "厉害", "强大", "有用", "帮助", "进步"]),
        ]

        categories = {name: 0 for name, _ in category_keywords}
        categories["其他观点"] = 0

        for danmaku, count in ai_counter.items():
            for name, keywords in category_keywords:
                if any(word in danmaku for word in keywords):
                    categories[name] += count
                    break
            else:
                categories["其他观点"] += count

        return categories


if __name__ == "__main__":
    # 1. Crawl danmaku.
    spider = BilibiliDanmakuSpider()
    danmakus = spider.run()

    # 2. Count all / AI-related danmaku.
    analyzer = DanmakuAnalyzer(danmakus)
    all_counter, ai_counter = analyzer.count_danmakus()

    # 3. Print the top-8 AI-related danmaku.
    print("\nAI技术应用相关弹幕数量排名前8:")
    top8 = ai_counter.most_common(8)
    for i, (danmaku, count) in enumerate(top8, 1):
        print(f"{i}. {danmaku}: {count}次")

    # 4. Export counts to Excel.
    df = analyzer.save_to_excel(ai_counter)

    # 5. Render the word cloud.
    analyzer.generate_wordcloud()

    # 6. Categorize and print mainstream opinions.
    main_views = analyzer.get_main_views(ai_counter)
    print("\nB站用户对大语言模型技术的主流看法统计:")
    for view, count in main_views.items():
        print(f"{view}: {count}条相关弹幕")

    # 7. Bar chart of the opinion distribution.
    plt.figure(figsize=(10, 6))
    plt.bar(main_views.keys(), main_views.values(), color=['#4CAF50', '#2196F3', '#f44336', '#FFC107', '#9E9E9E'])
    plt.title('用户对大语言模型的主要关注点分布')
    plt.ylabel('弹幕数量')
    plt.xticks(rotation=30)
    for i, v in enumerate(main_views.values()):
        plt.text(i, v + 5, str(v), ha='center')  # value label above each bar
    plt.tight_layout()
    plt.savefig('用户观点分布.png', dpi=300)
    plt.show()