"""Crawl Bilibili danmaku for LLM-related videos and analyse them.

Reconstructed from the contents of ``code1.py``.  The script searches Bilibili
for a set of keywords, downloads the danmaku (bullet comments) of the matching
videos, counts how often a fixed list of LLM application keywords appears,
renders a word cloud and writes the statistics to an Excel workbook.
"""

import json
import os
import random
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from wordcloud import WordCloud

# Fonts able to render Chinese glyphs in matplotlib figures.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False

SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/all/v2"
VIEW_URL = "https://api.bilibili.com/x/web-interface/view"
DANMAKU_URL = "https://api.bilibili.com/x/v1/dm/list.so"

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10

# Upper bound on the number of videos processed per keyword / exported.
MAX_VIDEOS = 360

DEFAULT_KEYWORDS = ("大语言模型", "大模型", "LLM")

# Application keywords whose occurrences are counted in the danmaku text.
APPLICATION_KEYWORDS = (
    "聊天机器人", "智能客服", "内容创作", "代码生成",
    "翻译", "教育", "医疗", "法律", "金融分析",
    "图像生成", "语音识别", "自动驾驶", "数据分析",
    "游戏", "推荐系统", "搜索引擎",
)


class BilibiliSpider:
    """Collects video metadata and danmaku from Bilibili and analyses them.

    Attributes:
        headers: HTTP headers sent with every request.
        session: Shared ``requests.Session`` carrying ``headers``.
        danmaku_list: All danmaku strings collected or loaded so far.
        video_info: Metadata dicts (bvid/title/play/author) of found videos.
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.bilibili.com/",
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []
        self.video_info = []

    def _get(self, url, params=None):
        """Issue a GET through the shared session with a timeout.

        Query parameters are passed via ``params`` so that they are
        URL-encoded by the HTTP library instead of being spliced into the URL.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        response = self.session.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response

    def search_videos(self, keyword, page=1, pages=36):
        """Search videos for ``keyword`` over ``pages`` result pages.

        Every found video is also appended to ``self.video_info``.  Failures
        on a single page are reported and skipped (best effort).

        Args:
            keyword: Search term.
            page: First result page to request.
            pages: Number of consecutive pages to request.

        Returns:
            A list of at most ``MAX_VIDEOS`` dicts with the keys ``bvid``,
            ``title``, ``play`` and ``author``.
        """
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        for p in range(page, page + pages):
            try:
                query = {"keyword": keyword, "page": p, "page_size": 10}
                data = self._get(SEARCH_URL, params=query).json()
                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')}")
                else:
                    # ``or`` guards against explicit nulls in the payload,
                    # which ``dict.get(key, default)`` would pass through.
                    sections = (data.get("data") or {}).get("result") or []
                    for section in sections:
                        if section.get("result_type") != "video":
                            continue
                        for video in section.get("data") or []:
                            record = {
                                "bvid": video.get("bvid"),
                                "title": video.get("title"),
                                "play": video.get("play"),
                                "author": video.get("author"),
                            }
                            all_videos.append(record)
                            self.video_info.append(dict(record))
                    print(f"已获取第{p}页视频,累计{len(all_videos)}个")
                # Pause after every request, including rejected ones, so a
                # failing API is not hit again immediately.
                time.sleep(random.uniform(1, 3))
            except Exception as e:  # best effort: one bad page must not abort the crawl
                print(f"搜索视频出错: {e}")
                time.sleep(5)
        return all_videos[:MAX_VIDEOS]

    def get_cid(self, bvid):
        """Return the ``cid`` of the video ``bvid``, or ``None`` on failure."""
        try:
            data = self._get(VIEW_URL, params={"bvid": bvid}).json()
            if data.get("code") == 0:
                return (data.get("data") or {}).get("cid")
            print(f"获取cid失败,bvid: {bvid},错误代码: {data.get('code')}")
            return None
        except Exception as e:  # best effort: report and let the caller skip the video
            print(f"获取cid出错: {e}")
            return None

    def get_danmaku(self, cid):
        """Download the danmaku of ``cid``.

        Returns:
            The stripped text of every ``<d>`` element in the response, or an
            empty list when ``cid`` is falsy or the request fails.
        """
        if not cid:
            return []
        try:
            response = self._get(DANMAKU_URL, params={"oid": cid})
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            result = [node.text.strip() for node in soup.find_all("d")]
            print(f"成功获取{len(result)}条弹幕")
            return result
        except Exception as e:  # best effort: a failed video yields no danmaku
            print(f"获取弹幕出错: {e}")
            return []

    def crawl_keyword(self, keyword):
        """Search ``keyword`` and append the danmaku of every hit to ``danmaku_list``."""
        videos = self.search_videos(keyword)
        total = len(videos)
        for index, video in enumerate(videos, 1):
            print(f"正在处理第{index}/{total}个视频: {video['title']}")
            cid = self.get_cid(video["bvid"])
            if cid:
                self.danmaku_list.extend(self.get_danmaku(cid))
            # Longer pause after every fifth video to stay polite.
            if index % 5 == 0:
                time.sleep(random.uniform(3, 5))
        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_danmaku(self, filename="danmaku.txt"):
        """Write ``danmaku_list`` to ``filename`` as UTF-8, one entry per line."""
        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(f"{danmaku}\n" for danmaku in self.danmaku_list)
        print(f"弹幕已保存到{filename}")

    def load_danmaku(self, filename="danmaku.txt"):
        """Replace ``danmaku_list`` with the non-blank lines of ``filename``.

        Does nothing when the file does not exist.
        """
        if not os.path.exists(filename):
            return
        with open(filename, "r", encoding="utf-8") as f:
            self.danmaku_list = [line.strip() for line in f if line.strip()]
        print(f"从{filename}加载了{len(self.danmaku_list)}条弹幕")

    def analyze_danmaku(self, top_n=8):
        """Count in how many danmaku each application keyword occurs.

        Args:
            top_n: Number of highest-ranked keywords to report.

        Returns:
            Up to ``top_n`` ``(keyword, count)`` tuples sorted by descending
            count (ties keep the keyword definition order), or an empty list
            when there is no danmaku data.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []
        application_counts = {
            kw: sum(kw in danmaku for danmaku in self.danmaku_list)
            for kw in APPLICATION_KEYWORDS
        }
        ranked = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = ranked[:top_n]
        print(f"出现频率最高的{top_n}项LLM应用:")
        for rank, (app, count) in enumerate(top_applications, 1):
            print(f"{rank}. {app}: {count}次")
        # The all-zero ranking is still returned so the Excel export has rows.
        if all(count == 0 for _, count in top_applications):
            print("未匹配到任何LLM应用关键词,Excel将写入空数据")
        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png", font_path="simhei.ttf"):
        """Render a word cloud of the danmaku text, save it and display it.

        Args:
            filename: Output image path.
            font_path: Font file handed to ``WordCloud``; it must contain
                Chinese glyphs for the text to render.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return
        text = " ".join(self.danmaku_list)
        # Single-character tokens are dropped before rendering.
        words = [word for word in jieba.cut(text) if len(word) > 1]
        if not words:
            # WordCloud.generate() raises when it receives no words at all.
            print("没有弹幕数据可生成词云")
            return
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False,
        ).generate(" ".join(words))
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        plt.close()  # release the figure once it has been saved and shown
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Write the keyword ranking and the video metadata to a workbook.

        Empty inputs produce a single placeholder row per sheet.  Any error
        while building or saving the workbook is reported, not raised.

        Args:
            top_applications: ``(keyword, count)`` tuples to export.
            filename: Output ``.xlsx`` path.
        """
        try:
            wb = Workbook()

            ws_apps = wb.active
            ws_apps.title = "应用案例统计"
            ws_apps.append(["排名", "应用案例", "出现次数"])
            if top_applications:
                for rank, (app, count) in enumerate(top_applications, 1):
                    ws_apps.append([rank, app, count])
            else:
                ws_apps.append([1, "无匹配数据", 0])

            ws_videos = wb.create_sheet(title="视频信息")
            ws_videos.append(["序号", "视频标题", "播放量", "作者", "BV号"])
            if self.video_info:
                for number, video in enumerate(self.video_info[:MAX_VIDEOS], 1):
                    ws_videos.append(
                        [number, video["title"], video["play"], video["author"], video["bvid"]]
                    )
            else:
                ws_videos.append([1, "无视频数据", 0, "无", "无"])

            wb.save(filename)
            print(f"✅ Excel数据已成功保存到{os.path.abspath(filename)}")
        except Exception as e:  # reported, never propagated, so the run can finish
            print(f"❌ 保存Excel失败:{e}")
            print("建议:1. 关闭已打开的同名Excel文件 2. 检查当前目录是否有写入权限")


def _crawl_and_save(spider, keywords=DEFAULT_KEYWORDS):
    """Crawl every keyword with a pause in between, then persist the danmaku."""
    for keyword in keywords:
        spider.crawl_keyword(keyword)
        time.sleep(random.uniform(5, 10))
    spider.save_danmaku()


def main():
    """Run the pipeline: load or crawl danmaku, analyse, plot and export."""
    spider = BilibiliSpider()

    use_cached = False
    if os.path.exists("danmaku.txt"):
        choice = input("发现已存在的弹幕数据,是否直接使用? (y/n): ")
        use_cached = choice.strip().lower() == "y"

    if use_cached:
        spider.load_danmaku()
    else:
        _crawl_and_save(spider)

    # An empty ranking is still passed on so the export always runs.
    top_applications = spider.analyze_danmaku(top_n=8)

    spider.generate_wordcloud()

    try:
        spider.save_to_excel(top_applications)
    except Exception as e:
        print(f"Excel保存最终失败:{e}")

    print("所有任务执行完毕(含异常处理)!")


if __name__ == "__main__":
    main()