import requests
import re
import json
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import time
import random
from bs4 import BeautifulSoup
import numpy as np
from PIL import Image

# Configure matplotlib with fonts that can render Chinese glyphs.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable with CJK fonts

# FIX: every network call now carries an explicit timeout so a stalled
# connection cannot hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 10


class BilibiliDanmakuAnalyzer:
    """Crawl danmaku (bullet comments) from Bilibili videos matching LLM-related
    keywords, persist them to disk, and run simple keyword-frequency, sentiment
    and word-cloud analyses over the collected text.
    """

    def __init__(self):
        # Browser-like request headers to lower the chance of being blocked.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive"
        }
        # Flat list of every danmaku text string collected so far.
        self.all_danmakus = []
        # One metadata dict per video (id, title, uploader, plays, danmaku count).
        self.video_info = []

    def get_video_ids(self, keyword, page_count=36):
        """Search Bilibili for *keyword* and collect video ids (aids).

        Roughly 10 videos per page, so the default 36 pages yields ~360 videos.
        Returns a de-duplicated list of aids and appends per-video metadata to
        ``self.video_info`` as a side effect.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                # Bilibili web search endpoint, restricted to video results.
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
                data = json.loads(response.text)

                if data.get("code") == 0 and data.get("data"):
                    results = data["data"]["result"]
                    for item in results:
                        video_id = item["aid"]
                        video_ids.append(video_id)
                        # Keep the video's metadata for the final Excel report.
                        self.video_info.append({
                            "video_id": video_id,
                            "title": item["title"],
                            "up主": item["author"],
                            "播放量": item["play"],
                            "弹幕数": item["video_review"]
                        })

                print(f"已获取第{page}页视频,累计{len(video_ids)}个视频ID")

                # Random pause to avoid tripping the anti-crawler throttle.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best-effort: a failed page is reported and skipped, not fatal.
                print(f"获取第{page}页视频ID失败: {str(e)}")
                continue

        return list(set(video_ids))  # de-duplicate across pages

    def get_danmakus(self, video_id):
        """Return the list of danmaku strings for one video (empty list on failure)."""
        try:
            # Step 1: resolve the video's cid (danmaku-pool id) from its aid.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={video_id}"
            response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
            data = json.loads(response.text)

            if data.get("code") == 0 and data.get("data"):
                cid = data["data"]["cid"]

                # Step 2: fetch the raw danmaku XML for that cid.
                danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
                response = requests.get(danmaku_url, headers=self.headers, timeout=REQUEST_TIMEOUT)
                response.encoding = "utf-8"

                # Each <d> element in the XML holds one danmaku's text.
                soup = BeautifulSoup(response.text, "xml")
                danmakus = soup.find_all("d")

                danmaku_texts = [danmaku.text.strip() for danmaku in danmakus]
                print(f"视频ID {video_id} 获取到 {len(danmaku_texts)} 条弹幕")

                return danmaku_texts
            else:
                print(f"获取视频 {video_id} 的cid失败")
                return []
        except Exception as e:
            # Best-effort: report and return empty so the crawl loop continues.
            print(f"获取视频 {video_id} 弹幕失败: {str(e)}")
            return []

    def crawl_all_danmakus(self, keywords=None, max_videos=360):
        """Crawl danmaku for every video found under *keywords*, capped at *max_videos*.

        Progress is checkpointed to disk every 10 videos, and the per-video
        metadata is written to 视频信息.xlsx at the end.
        Returns ``self.all_danmakus``.
        """
        # FIX: avoid a mutable default argument; the effective default is unchanged.
        if keywords is None:
            keywords = ["大语言模型", "大模型", "LLM"]

        all_video_ids = []

        # Gather candidate video ids from each search keyword.
        for keyword in keywords:
            print(f"开始搜索关键词: {keyword}")
            video_ids = self.get_video_ids(keyword)
            all_video_ids.extend(video_ids)
            time.sleep(2)

        # De-duplicate across keywords and cap the total workload.
        unique_video_ids = list(set(all_video_ids))[:max_videos]
        print(f"共获取到 {len(unique_video_ids)} 个不重复的视频ID,开始爬取弹幕...")

        for i, video_id in enumerate(unique_video_ids):
            danmakus = self.get_danmakus(video_id)
            self.all_danmakus.extend(danmakus)

            # Checkpoint every 10 videos so a crash loses at most 10 videos' data.
            if (i + 1) % 10 == 0:
                self.save_danmakus_to_file()
                print(f"已完成 {i + 1}/{len(unique_video_ids)} 个视频的弹幕爬取,累计弹幕数: {len(self.all_danmakus)}")

            # Random pause to avoid tripping the anti-crawler throttle.
            time.sleep(random.uniform(1, 2))

        # Final flush of the complete danmaku list.
        self.save_danmakus_to_file()
        print(f"所有视频弹幕爬取完成,共获取 {len(self.all_danmakus)} 条弹幕")

        # Persist per-video metadata for later inspection.
        df = pd.DataFrame(self.video_info)
        df.to_excel("视频信息.xlsx", index=False)

        return self.all_danmakus

    def save_danmakus_to_file(self, filename="弹幕数据.txt"):
        """Write all collected danmaku to *filename*, one per line (overwrites)."""
        with open(filename, "w", encoding="utf-8") as f:
            for danmaku in self.all_danmakus:
                f.write(danmaku + "\n")

    def load_danmakus_from_file(self, filename="弹幕数据.txt"):
        """Load danmaku from *filename* into ``self.all_danmakus``.

        Returns the loaded list, or an empty list if the file cannot be read
        (``self.all_danmakus`` is left untouched in that case).
        """
        try:
            with open(filename, "r", encoding="utf-8") as f:
                # Drop blank lines; keep one danmaku per non-empty line.
                self.all_danmakus = [line.strip() for line in f.readlines() if line.strip()]
            print(f"从文件加载了 {len(self.all_danmakus)} 条弹幕数据")
            return self.all_danmakus
        except Exception as e:
            print(f"加载弹幕数据失败: {str(e)}")
            return []

    def analyze_application_cases(self, top_n=8):
        """Count mentions of common LLM application domains in the danmaku.

        Returns the *top_n* (keyword, count) pairs sorted by count descending,
        and also writes them to LLM应用案例统计.xlsx.
        """
        # Curated list of common LLM application-domain keywords.
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "教育辅导", "翻译", "数据分析", "医疗诊断",
            "自动写作", "语音助手", "图像生成", "游戏开发",
            "推荐系统", "法律咨询", "金融分析", "市场营销"
        ]

        # Substring-match each danmaku against each domain keyword.
        application_counts = {keyword: 0 for keyword in application_keywords}
        for danmaku in self.all_danmakus:
            for keyword in application_keywords:
                if keyword in danmaku:
                    application_counts[keyword] += 1

        # Sort domains by mention count, highest first.
        sorted_applications = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = sorted_applications[:top_n]

        # Persist the ranking for the report.
        df = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
        df.to_excel("LLM应用案例统计.xlsx", index=False)

        return top_applications

    def generate_wordcloud(self, mask=None, filename="弹幕词云图.png"):
        """Build a word cloud from all danmaku, save it to *filename* and show it."""
        # Merge every danmaku into one text blob for segmentation.
        text = " ".join(self.all_danmakus)

        # Segment Chinese text with jieba; drop single-character tokens as noise.
        words = jieba.cut(text)
        words = [word for word in words if len(word) > 1]
        words_text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # requires a Chinese-capable font file on disk
            background_color="white",
            max_words=200,
            mask=mask,
            contour_width=1,
            contour_color="steelblue"
        )
        wc.generate(words_text)

        # Render the cloud with matplotlib.
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("B站大语言模型相关视频弹幕词云")
        plt.tight_layout(pad=0)

        wc.to_file(filename)
        # FIX: the original printed a literal placeholder instead of the
        # actual output path.
        print(f"词云图已保存为 {filename}")

        plt.show()

    def analyze_sentiment(self):
        """Bucket danmaku by theme (cost / application field / negative / positive).

        A danmaku counts toward a bucket if it contains any keyword of that
        bucket. Returns a dict mapping each theme to its share of all danmaku
        as a percentage string, or ``{}`` when there are no danmaku.
        """
        cost_keywords = ["贵", "便宜", "成本", "收费", "免费", "价格"]
        field_keywords = ["教育", "医疗", "工作", "学习", "娱乐", "创作", "办公"]
        negative_keywords = ["失业", "取代", "错误", "偏见", "隐私", "风险", "依赖"]
        positive_keywords = ["方便", "高效", "有用", "帮助", "创新", "进步", "强大"]

        cost_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in cost_keywords))
        field_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in field_keywords))
        negative_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in negative_keywords))
        positive_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in positive_keywords))

        total = len(self.all_danmakus)
        if total == 0:
            # Guard against division by zero when nothing was crawled/loaded.
            return {}

        result = {
            "成本相关讨论占比": f"{cost_count/total*100:.2f}%",
            "应用领域讨论占比": f"{field_count/total*100:.2f}%",
            "不利影响讨论占比": f"{negative_count/total*100:.2f}%",
            "积极影响讨论占比": f"{positive_count/total*100:.2f}%"
        }

        return result

    def predict_trend(self):
        """Return a static list of trend statements about LLM applications."""
        trends = [
            "1. 行业垂直化:大语言模型将更深入各个专业领域,如医疗、法律、教育等",
            "2. 个性化增强:模型将更加了解用户需求,提供个性化服务",
            "3. 多模态融合:文本、图像、语音等多模态能力将深度融合",
            "4. 边缘部署增加:更多模型将在边缘设备上运行,提升响应速度和隐私性",
            "5. 监管加强:随着应用广泛,相关法律法规将逐步完善",
            "6. 低代码/无代码结合:降低AI应用门槛,使更多人能使用LLM能力"
        ]
        return trends


def main():
    """Entry point: crawl or load danmaku, then run every analysis step."""
    analyzer = BilibiliDanmakuAnalyzer()

    # Let the user choose between a fresh crawl and a cached dataset.
    choice = input("请选择操作 (1: 爬取新数据, 2: 加载已有数据): ")

    if choice == "1":
        analyzer.crawl_all_danmakus()
    else:
        analyzer.load_danmakus_from_file()

    if not analyzer.all_danmakus:
        print("没有可用的弹幕数据,程序退出")
        return

    # Top-8 LLM application domains by danmaku mention count.
    print("\n===== LLM应用案例排名前8 =====")
    top_applications = analyzer.analyze_application_cases(8)
    for i, (app, count) in enumerate(top_applications, 1):
        print(f"{i}. {app}: {count}次")

    print("\n===== 生成词云图 =====")
    # Use a custom mask image when available; fall back to the default shape.
    # FIX: was a bare ``except:``, which also swallows KeyboardInterrupt/SystemExit.
    try:
        mask = np.array(Image.open("cloud_mask.png"))
        analyzer.generate_wordcloud(mask)
    except Exception:
        analyzer.generate_wordcloud()

    print("\n===== 用户观点分析 =====")
    sentiment = analyzer.analyze_sentiment()
    for key, value in sentiment.items():
        print(f"{key}: {value}")

    print("\n===== 分析结论 =====")
    print("1. 从弹幕讨论来看,B站用户最关注的大语言模型应用领域是:" +
          ", ".join([app for app, _ in top_applications[:3]]))

    # Compare positive vs. negative discussion share (strings like "12.34%").
    if float(sentiment["积极影响讨论占比"].rstrip('%')) > float(sentiment["不利影响讨论占比"].rstrip('%')):
        print("2. 整体来看,用户对大语言模型的评价偏向积极,更多讨论其带来的便利和效率提升")
    else:
        print("2. 整体来看,用户对大语言模型存在较多担忧,主要集中在其可能带来的负面影响")

    print("3. 应用领域的讨论最为广泛,说明用户普遍关注大语言模型的实际落地场景")

    print("\n===== 大语言模型应用发展趋势预测 =====")
    trends = analyzer.predict_trend()
    for trend in trends:
        print(trend)


if __name__ == "__main__":
    main()