"""Bilibili danmaku (bullet-comment) spider and analyzer.

Searches Bilibili for videos matching LLM-related keywords, downloads each
video's danmaku, then produces keyword statistics, a word cloud, an Excel
workbook and a console analysis report.

NOTE(review): the original file hard-coded a personal SESSDATA/bili_jct
cookie. Credentials are now read from the BILIBILI_COOKIE environment
variable instead of living in source control.
"""

import json
import os
import random
import time

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Make matplotlib render CJK glyphs and minus signs correctly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False


class BilibiliSpider:
    """Crawls Bilibili search results and aggregates per-video danmaku."""

    def __init__(self):
        # Browser-like headers to reduce anti-crawler friction.  The cookie
        # (SESSDATA / bili_jct) is account-specific and must be supplied via
        # the environment -- never commit credentials to source control.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Cookie": os.environ.get("BILIBILI_COOKIE", ""),
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Referer": "https://www.bilibili.com/",
            "Origin": "https://www.bilibili.com",
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []  # every danmaku string collected so far
        self.video_info = []    # metadata dicts for every crawled video

    def search_videos(self, keyword, page=1, pages=2):
        """Search videos for *keyword*, crawling *pages* pages of 30 results.

        Returns a de-duplicated list of dicts (bvid/title/play/author); each
        entry is also appended to ``self.video_info``.
        """
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        seen_bvids = set()  # O(1) dedupe (was an O(n) `any(...)` per video)
        for p in range(page, page + pages):
            try:
                url = (
                    "https://api.bilibili.com/x/web-interface/search/type"
                    f"?keyword={keyword}&page={p}&page_size=30&search_type=video"
                )
                response = self.session.get(url, timeout=15)
                response.raise_for_status()
                data = response.json()

                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')},消息: {data.get('message')}")
                    continue

                video_items = data.get("data", {}).get("result", [])
                if not video_items:
                    print(f"第{p}页未找到视频数据")
                    continue

                for video in video_items:
                    bvid = video.get("bvid")
                    if bvid in seen_bvids:
                        continue
                    seen_bvids.add(bvid)
                    entry = {
                        "bvid": bvid,
                        "title": video.get("title", "无标题"),
                        "play": video.get("play", "0"),
                        "author": video.get("author", "未知作者"),
                    }
                    all_videos.append(entry)
                    self.video_info.append(dict(entry))  # independent copy

                print(f"已获取第{p}页视频,累计{len(all_videos)}个")
                time.sleep(random.uniform(2, 4))  # polite crawl delay

            except Exception as e:
                # Best-effort crawl: log, back off, try the next page.
                print(f"搜索视频出错: {str(e)}")
                time.sleep(5)

        return all_videos

    def get_cid(self, bvid):
        """Return the danmaku cid for *bvid*, or None on any failure."""
        try:
            url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()

            if data.get("code") == 0:
                cid = data.get("data", {}).get("cid")
                if cid:
                    print(f"成功获取bvid={bvid}的cid: {cid}")
                    return cid
                print(f"bvid={bvid}未找到cid")
                return None
            print(f"获取cid失败,bvid: {bvid},错误: {data.get('message')}")
            return None

        except Exception as e:
            print(f"获取cid出错(bvid={bvid}): {str(e)}")
            return None

    def get_danmaku(self, cid):
        """Download the XML danmaku feed for *cid* and return its texts."""
        if not cid:
            return []
        try:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            response.encoding = "utf-8"

            soup = BeautifulSoup(response.text, "lxml-xml")
            result = []
            for node in soup.find_all("d"):
                text = node.text.strip()  # strip once (was computed twice)
                if text:
                    result.append(text)
            print(f"成功获取cid={cid}的{len(result)}条弹幕")
            return result

        except Exception as e:
            print(f"获取弹幕出错(cid={cid}): {str(e)}")
            return []

    def crawl_keyword(self, keyword):
        """Crawl all videos matching *keyword* and accumulate their danmaku."""
        videos = self.search_videos(keyword)
        print(f"关键词[{keyword}]找到{len(videos)}个视频")

        for i, video in enumerate(videos):
            print(f"\n正在处理第{i+1}/{len(videos)}个视频: {video['title'][:30]}...")
            cid = self.get_cid(video["bvid"])
            if cid:
                danmakus = self.get_danmaku(cid)
                self.danmaku_list.extend(danmakus)
                print(f"当前累计弹幕数: {len(self.danmaku_list)}")

            # Extra pause every 3 videos to lower the anti-crawler risk.
            if (i + 1) % 3 == 0:
                sleep_time = random.uniform(3, 6)
                print(f"已处理{i+1}个视频,休息{sleep_time:.2f}秒")
                time.sleep(sleep_time)

        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
        """Persist collected danmaku (one per line) and video metadata (JSON)."""
        with open(danmaku_filename, "w", encoding="utf-8") as f:
            for danmaku in self.danmaku_list:
                f.write(danmaku + "\n")
        print(f"弹幕已保存到{danmaku_filename}({len(self.danmaku_list)}条)")

        with open(video_filename, "w", encoding="utf-8") as f:
            json.dump(self.video_info, f, ensure_ascii=False, indent=2)
        print(f"视频信息已保存到{video_filename}({len(self.video_info)}条)")

    def load_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
        """Load previously saved danmaku and video metadata, if present."""
        if os.path.exists(danmaku_filename):
            with open(danmaku_filename, "r", encoding="utf-8") as f:
                self.danmaku_list = [line.strip() for line in f if line.strip()]
            print(f"从{danmaku_filename}加载了{len(self.danmaku_list)}条弹幕")

        if os.path.exists(video_filename):
            with open(video_filename, "r", encoding="utf-8") as f:
                self.video_info = json.load(f)
            print(f"从{video_filename}加载了{len(self.video_info)}条视频信息")

    def analyze_danmaku(self, top_n=8):
        """Count AI-application keyword mentions; return the top_n (kw, count) pairs."""
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []

        # AI application / concern keywords (duplicates removed; note some
        # entries overlap as substrings, e.g. "金融分析" also matches "金融",
        # so one danmaku can increment both -- kept as original behavior).
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成", "编程助手",
            "翻译", "教育", "医疗", "法律", "金融分析", "金融",
            "图像生成", "语音识别", "自动驾驶", "数据分析", "数据",
            "游戏", "推荐系统", "搜索引擎", "搜索", "写作",
            "成本", "价格", "便宜", "昂贵", "免费",
            "就业", "工作", "失业", "替代", "岗位",
            "安全", "隐私", "风险", "泄露", "道德",
            "学习", "学生", "老师", "学校",
            "企业", "商业", "公司", "盈利", "赚钱",
        ]

        application_counts = {kw: 0 for kw in application_keywords}
        for danmaku in self.danmaku_list:
            for kw in application_keywords:
                if kw in danmaku:
                    application_counts[kw] += 1

        # Drop unseen keywords, then rank by frequency (descending).
        sorted_applications = sorted(
            ((k, v) for k, v in application_counts.items() if v > 0),
            key=lambda x: x[1],
            reverse=True,
        )
        top_applications = sorted_applications[:top_n]

        print(f"\n出现频率最高的{top_n}项LLM应用相关关键词:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}次")

        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png"):
        """Segment the danmaku with jieba and render a word cloud to *filename*."""
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return

        text = " ".join(self.danmaku_list)
        words = jieba.cut(text)

        # Drop stop words and single-character tokens.
        stop_words = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那'}
        words = [word for word in words if len(word) > 1 and word not in stop_words]
        words_text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # assumes SimHei is available locally
            background_color="white",
            width=1600,
            height=1200,
            max_words=300,
            collocations=False,
            margin=2,
            random_state=42,  # deterministic layout
            colormap="viridis",
        ).generate(words_text)

        plt.figure(figsize=(16, 12))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("B站大语言模型相关视频弹幕词云", fontsize=20, pad=20)
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight", facecolor='white')
        plt.show()
        # BUG FIX: original printed the literal "(unknown)" instead of filename.
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Write keyword stats, video info and conclusions to an Excel workbook."""
        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                # Sheet 1: keyword/application frequency ranking.
                if top_applications:
                    df_apps = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
                    df_apps["排名"] = range(1, len(df_apps) + 1)
                else:
                    df_apps = pd.DataFrame([["无数据", 0]], columns=["应用案例", "出现次数"])
                    df_apps["排名"] = 1
                # Consistent column order in both branches (rank first).
                df_apps = df_apps[["排名", "应用案例", "出现次数"]]
                df_apps.to_excel(writer, sheet_name="应用案例统计", index=False)

                # Sheet 2: de-duplicated video metadata.
                if self.video_info:
                    unique_videos = []
                    seen_bvids = set()
                    for video in self.video_info:
                        if video["bvid"] not in seen_bvids:
                            seen_bvids.add(video["bvid"])
                            unique_videos.append(video)

                    df_videos = pd.DataFrame(unique_videos)
                    df_videos["序号"] = range(1, len(df_videos) + 1)
                    df_videos = df_videos[["序号", "title", "play", "author", "bvid"]]
                    df_videos.columns = ["序号", "视频标题", "播放量", "作者", "BV号"]
                else:
                    df_videos = pd.DataFrame([["无数据", 0, "无", "无"]],
                                             columns=["视频标题", "播放量", "作者", "BV号"])
                    df_videos["序号"] = 1
                    # Consistent column order with the populated branch.
                    df_videos = df_videos[["序号", "视频标题", "播放量", "作者", "BV号"]]

                df_videos.to_excel(writer, sheet_name="视频信息", index=False)

                # Sheet 3: canned analysis conclusions.
                conclusions = [
                    ["分析维度", "主要发现", "用户观点倾向"],
                    ["应用成本", "多数用户关注使用成本,提及'免费'、'便宜'较多", "希望降低使用门槛"],
                    ["应用领域", "教育、编程、内容创作是最受关注的领域", "积极看待技术应用"],
                    ["就业影响", "对就业替代效应存在担忧", "既有期待也有忧虑"],
                    ["技术成熟度", "普遍认为技术还有提升空间", "理性看待技术发展"],
                    ["数据安全", "对隐私和安全问题关注度较高", "期待规范发展"]
                ]
                df_conclusions = pd.DataFrame(conclusions[1:], columns=conclusions[0])
                df_conclusions.to_excel(writer, sheet_name="数据分析结论", index=False)

            print(f"✅ Excel数据已保存到{os.path.abspath(filename)}")
            print(f"✅ 包含工作表:应用案例统计、视频信息、数据分析结论")

        except Exception as e:
            print(f"❌ 保存Excel失败:{str(e)}")
            print("建议:1. 关闭已打开的同名Excel文件 2. 检查目录写入权限")

    def generate_analysis_report(self):
        """Print a console summary: counts, crude sentiment split, hot topics."""
        if not self.danmaku_list:
            print("没有数据可分析")
            return

        print("\n" + "="*50)
        print(" 大语言模型B站用户观点分析报告")
        print("="*50)

        total_danmaku = len(self.danmaku_list)
        total_videos = len(set(v["bvid"] for v in self.video_info))

        print(f"\n📊 数据概况:")
        print(f"  - 分析视频数量: {total_videos}个")
        print(f"  - 采集弹幕数量: {total_danmaku}条")

        # Naive lexicon-based sentiment: a danmaku counts once per polarity
        # if it contains any word from that list (duplicate "贵" removed).
        positive_words = ["好", "厉害", "强大", "方便", "实用", "惊喜", "期待", "进步"]
        negative_words = ["不好", "垃圾", "危险", "担心", "失业", "贵", "泄露"]

        positive_count = sum(1 for danmaku in self.danmaku_list
                             if any(word in danmaku for word in positive_words))
        negative_count = sum(1 for danmaku in self.danmaku_list
                             if any(word in danmaku for word in negative_words))

        # total_danmaku > 0 is guaranteed by the early return above.
        print(f"\n😊 情感倾向分析:")
        print(f"  - 积极评价: {positive_count}条 ({positive_count/total_danmaku*100:.1f}%)")
        print(f"  - 消极评价: {negative_count}条 ({negative_count/total_danmaku*100:.1f}%)")

        print(f"\n🔥 热门话题:")
        topics = {
            "教育学习": ["学习", "教育", "学生", "老师", "学校", "考试"],
            "工作就业": ["工作", "就业", "失业", "岗位", "替代", "职业"],
            "技术应用": ["编程", "代码", "写作", "翻译", "创作", "设计"],
            "商业价值": ["赚钱", "商业", "企业", "盈利", "成本", "价格"],
            "安全伦理": ["安全", "隐私", "道德", "风险", "泄露", "监管"]
        }

        for topic, keywords in topics.items():
            count = sum(1 for danmaku in self.danmaku_list
                        if any(keyword in danmaku for keyword in keywords))
            if count > 0:
                print(f"  - {topic}: {count}次提及")


def main():
    """Entry point: reuse cached data if available, else crawl, then analyze."""
    spider = BilibiliSpider()

    # Offer to reuse previously saved data to avoid re-crawling.
    use_existing = False
    if os.path.exists("danmaku.txt") and os.path.exists("video_info.json"):
        choice = input("发现已存在的弹幕和视频数据,是否直接使用? (y/n): ")
        if choice.lower() == "y":
            spider.load_data()
            use_existing = True

    if not use_existing:
        keywords = ["大语言模型", "大模型", "LLM"]
        print("开始爬取B站大语言模型相关视频...")

        for keyword in keywords:
            print(f"\n{'='*50}")
            print(f"正在爬取关键词: {keyword}")
            print(f"{'='*50}")
            spider.crawl_keyword(keyword)
            time.sleep(random.uniform(8, 12))  # long pause between keywords

        spider.save_data()

    print(f"\n{'='*50}")
    print("开始数据分析...")
    print(f"{'='*50}")

    top_applications = spider.analyze_danmaku(top_n=8)
    spider.generate_analysis_report()
    spider.generate_wordcloud("llm_wordcloud.png")
    spider.save_to_excel(top_applications, "llm_bilibili_analysis.xlsx")

    print(f"\n🎉 所有任务执行完毕!")
    print(f"📁 生成的文件:")
    print(f"  - llm_bilibili_analysis.xlsx (数据分析表格)")
    print(f"  - llm_wordcloud.png (词云图)")
    print(f"  - danmaku.txt (原始弹幕数据)")
    print(f"  - video_info.json (视频信息)")


if __name__ == "__main__":
    main()