"""Crawl Bilibili danmaku (bullet comments) for LLM-related videos and analyse them.

Workflow implemented by ``main``:

1. Search Bilibili for a set of keywords and collect basic video metadata.
2. Resolve each video's ``cid`` and download its danmaku.
3. Count how often a fixed list of LLM application keywords appears.
4. Render a word cloud and export the statistics to an Excel workbook.
"""

import json
import os
import random
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from wordcloud import WordCloud

# Configure fonts so that Chinese text renders in matplotlib figures.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False


class BilibiliSpider:
    """Collect video metadata and danmaku from Bilibili and analyse the text.

    Attributes:
        headers: HTTP headers applied to every request of the session.
        session: Shared ``requests.Session`` used for all API calls.
        danmaku_list: Every danmaku string collected (or loaded) so far.
        video_info: Metadata dicts (``bvid``, ``title``, ``play``, ``author``)
            for every video returned by ``search_videos``.
    """

    # Seconds to wait for a single HTTP response. ``requests`` applies no
    # timeout by default, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 10

    # Upper bound on videos returned per search and rows exported to Excel.
    MAX_VIDEOS = 360

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.bilibili.com/",
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []  # all collected danmaku
        self.video_info = []  # metadata of all videos found

    def search_videos(self, keyword, page=1, pages=36):
        """Search Bilibili for *keyword* and collect video metadata.

        Args:
            keyword: Search term.
            page: First result page to request.
            pages: Number of consecutive pages to request.

        Returns:
            A list of at most ``MAX_VIDEOS`` dicts with the keys ``bvid``,
            ``title``, ``play`` and ``author``. Every record found is also
            appended to ``self.video_info``.
        """
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        url = "https://api.bilibili.com/x/web-interface/search/all/v2"
        for p in range(page, page + pages):
            try:
                # ``params`` lets requests URL-encode the (often non-ASCII) keyword.
                response = self.session.get(
                    url,
                    params={"keyword": keyword, "page": p, "page_size": 10},
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')}")
                else:
                    # ``or`` guards against explicit nulls in the payload.
                    groups = (data.get("data") or {}).get("result") or []
                    for group in groups:
                        if group.get("result_type") != "video":
                            continue
                        for video in group.get("data") or []:
                            record = {
                                "bvid": video.get("bvid"),
                                "title": video.get("title"),
                                "play": video.get("play"),
                                "author": video.get("author"),
                            }
                            all_videos.append(record)
                            # Independent copy so the two lists never alias.
                            self.video_info.append(dict(record))
                    print(f"已获取第{p}页视频,累计{len(all_videos)}个")
                # Pause after every page, including rejected ones, so an API
                # error code is not followed by an immediate retry burst.
                time.sleep(random.uniform(1, 3))
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a response body that is not valid JSON.
                print(f"搜索视频出错: {e}")
                time.sleep(5)
        return all_videos[: self.MAX_VIDEOS]

    def get_cid(self, bvid):
        """Return the ``cid`` reported by the view API for *bvid*.

        Returns ``None`` when the request fails, the body is not JSON, or the
        API answers with a non-zero ``code``.
        """
        try:
            response = self.session.get(
                "https://api.bilibili.com/x/web-interface/view",
                params={"bvid": bvid},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"获取cid出错: {e}")
            return None
        if data.get("code") == 0:
            return (data.get("data") or {}).get("cid")
        print(f"获取cid失败,bvid: {bvid},错误代码: {data.get('code')}")
        return None

    def get_danmaku(self, cid):
        """Download the danmaku list for *cid*.

        Returns:
            The stripped text of every ``<d>`` element in the response, or an
            empty list when *cid* is falsy or anything goes wrong.
        """
        if not cid:
            return []
        try:
            response = self.session.get(
                "https://api.bilibili.com/x/v1/dm/list.so",
                params={"oid": cid},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            result = [node.text.strip() for node in soup.find_all("d")]
            print(f"成功获取{len(result)}条弹幕")
            return result
        except Exception as e:
            # Deliberately broad: parser failures depend on the installed
            # backend, and one bad video must not abort a long crawl.
            print(f"获取弹幕出错: {e}")
            return []

    def crawl_keyword(self, keyword):
        """Search for *keyword* and append the danmaku of every hit to ``danmaku_list``."""
        videos = self.search_videos(keyword)
        for i, video in enumerate(videos):
            print(f"正在处理第{i+1}/{len(videos)}个视频: {video['title']}")
            cid = self.get_cid(video["bvid"])
            if cid:
                self.danmaku_list.extend(self.get_danmaku(cid))
            # Longer pause after every fifth video to throttle the crawl.
            if (i + 1) % 5 == 0:
                time.sleep(random.uniform(3, 5))
        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_danmaku(self, filename="danmaku.txt"):
        """Write every collected danmaku to *filename*, one per line (UTF-8)."""
        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(f"{danmaku}\n" for danmaku in self.danmaku_list)
        print(f"弹幕已保存到{filename}")

    def load_danmaku(self, filename="danmaku.txt"):
        """Replace ``danmaku_list`` with the non-blank lines of *filename*.

        Does nothing when the file does not exist.
        """
        if not os.path.exists(filename):
            return
        with open(filename, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            self.danmaku_list = [line for line in stripped if line]
        print(f"从{filename}加载了{len(self.danmaku_list)}条弹幕")

    def analyze_danmaku(self, top_n=8):
        """Count LLM application keywords across all danmaku.

        Args:
            top_n: Number of highest-ranked keywords to report.

        Returns:
            A list of ``(keyword, count)`` tuples sorted by descending count,
            truncated to *top_n*; an empty list when no danmaku are loaded.
            A danmaku contributes at most one to each keyword it contains.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "翻译", "教育", "医疗", "法律", "金融分析",
            "图像生成", "语音识别", "自动驾驶", "数据分析",
            "游戏", "推荐系统", "搜索引擎",
        ]
        application_counts = {
            kw: sum(kw in danmaku for danmaku in self.danmaku_list)
            for kw in application_keywords
        }
        # sorted() is stable, so ties keep the order of the keyword list.
        sorted_applications = sorted(
            application_counts.items(), key=lambda x: x[1], reverse=True
        )
        top_applications = sorted_applications[:top_n]
        print(f"出现频率最高的{top_n}项LLM应用:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}次")
        # When nothing matched, still return the zero-count rows so the Excel
        # export has data to write.
        if all(count == 0 for _, count in top_applications):
            print("未匹配到任何LLM应用关键词,Excel将写入空数据")
        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png", font_path="simhei.ttf"):
        """Render a word cloud of all danmaku, save it to *filename* and show it.

        Args:
            filename: Output image path.
            font_path: Font file handed to ``WordCloud``; it must contain CJK
                glyphs for the Chinese tokens to render.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return
        text = " ".join(self.danmaku_list)
        # Keep only tokens longer than one character.
        tokens = [word for word in jieba.cut(text) if len(word) > 1]
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False,
        ).generate(" ".join(tokens))
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Export keyword statistics and video metadata to an Excel workbook.

        Args:
            top_applications: ``(keyword, count)`` tuples as returned by
                ``analyze_danmaku``; may be empty.
            filename: Output ``.xlsx`` path.

        Any error while building or saving the workbook is reported on stdout
        instead of being raised, so the caller is never interrupted.
        """
        try:
            wb = Workbook()

            # Sheet 1: keyword statistics (placeholder row when empty).
            ws_apps = wb.active
            ws_apps.title = "应用案例统计"
            ws_apps.append(["排名", "应用案例", "出现次数"])
            if top_applications:
                for i, (app, count) in enumerate(top_applications, 1):
                    ws_apps.append([i, app, count])
            else:
                ws_apps.append([1, "无匹配数据", 0])

            # Sheet 2: video metadata (placeholder row when empty).
            ws_videos = wb.create_sheet(title="视频信息")
            ws_videos.append(["序号", "视频标题", "播放量", "作者", "BV号"])
            if self.video_info:
                for i, video in enumerate(self.video_info[: self.MAX_VIDEOS], 1):
                    ws_videos.append(
                        [i, video["title"], video["play"], video["author"], video["bvid"]]
                    )
            else:
                ws_videos.append([1, "无视频数据", 0, "无", "无"])

            wb.save(filename)
            print(f"✅ Excel数据已成功保存到{os.path.abspath(filename)}")
        except Exception as e:
            # Deliberate best-effort boundary: report and carry on.
            print(f"❌ 保存Excel失败:{e}")
            print("建议:1. 关闭已打开的同名Excel文件 2. 检查当前目录是否有写入权限")


def main():
    """Run the full pipeline: collect (or load) danmaku, analyse, plot, export."""
    spider = BilibiliSpider()

    # Offer to reuse a previous crawl when its output file is present.
    use_cached = False
    if os.path.exists("danmaku.txt"):
        choice = input("发现已存在的弹幕数据,是否直接使用? (y/n): ")
        use_cached = choice.lower() == "y"

    if use_cached:
        spider.load_danmaku()
    else:
        for keyword in ["大语言模型", "大模型", "LLM"]:
            spider.crawl_keyword(keyword)
            time.sleep(random.uniform(5, 10))
        spider.save_danmaku()

    # An empty result is fine here; the export writes placeholder rows.
    top_applications = spider.analyze_danmaku(top_n=8)

    spider.generate_wordcloud()

    # save_to_excel reports its own failures, so no extra guard is needed.
    spider.save_to_excel(top_applications)

    print("所有任务执行完毕(含异常处理)!")


if __name__ == "__main__":
    main()