"""Crawl Bilibili danmaku for Paris-Olympics videos and analyse AI-related ones.

Pipeline: search videos -> download each video's danmaku -> save them to a
text file -> keep the danmaku that mention an AI keyword -> export the most
frequent ones to Excel -> render a word cloud from that Excel sheet.
"""

import time
from collections import Counter
from typing import Iterable, List

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

headers = {
    "cookie": "cookie",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

SEARCH_KEYWORD = '巴黎奥运会'
MAX_VIDEOS = 300  # stop crawling once this many videos were processed
MAX_PAGES = 21  # search pages 1..21 are requested at most
DANMUKU_FILE = '所有视频弹幕.txt'
EXCEL_FILE = 'AI_人工智能_弹幕统计.xlsx'
WORDCLOUD_FILE = '词云图.png'
TOP_N = 8  # number of most frequent danmaku exported to Excel

font_path = r'C:\Windows\Fonts\simhei.ttf'

keywords = ['AI配音', 'ai配音', '人工智能', 'ai画图', 'AI画图', 'AI识曲',
            'AI生成', '神经网络', '卷积神经网络', '循环神经网络', '智能家居',
            '自动驾驶', '智能推荐', '智能算法', '强化学习', '计算机视觉',
            'ai还原', 'ai合成']


def get_cid(bvid: str):
    """Return the ``cid`` of the first page of video *bvid*, or ``None``.

    ``None`` is returned (after printing the reason) when the request fails
    or when the response does not contain ``data[0]['cid']``.
    """
    url = "https://api.bilibili.com/x/player/pagelist"
    try:
        response = requests.get(url, params={"bvid": bvid}, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()['data'][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The body was not JSON or lacked the expected structure; skip this
        # video instead of aborting the whole crawl.
        print(f"解析失败: {e}")
    return None


def get_danmuku(cid) -> List[str]:
    """Return the text of every ``<d>`` element in the danmaku XML of *cid*.

    An empty list is returned when *cid* is ``None`` or the request fails.
    """
    if cid is None:
        return []
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as XML and
        # silently produce an empty result.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')
        return [node.text for node in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return []


def crawl_danmuku(max_videos: int = MAX_VIDEOS, max_pages: int = MAX_PAGES) -> List[str]:
    """Collect the danmaku of up to *max_videos* search results.

    Search pages ``1..max_pages`` are requested one by one with a one-second
    pause between pages. A page that cannot be fetched or parsed is reported
    and skipped.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/type'
    collected: List[str] = []
    video_count = 0
    for page in range(1, max_pages + 1):
        params = {'search_type': 'video', 'keyword': SEARCH_KEYWORD, 'page': page}
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            results = response.json()['data'].get('result') or []
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            results = []
        except (KeyError, TypeError, AttributeError, ValueError) as e:
            print(f"解析失败: {e}")
            results = []

        for result in results:
            bvid = result.get('bvid')
            if not bvid:
                continue
            collected.extend(get_danmuku(get_cid(bvid)))
            video_count += 1
            if video_count >= max_videos:
                return collected
        time.sleep(1)  # wait one second between pages to avoid being blocked
    return collected


def save_danmuku(danmuku_list: Iterable[str], path: str = DANMUKU_FILE) -> None:
    """Write *danmuku_list* to *path*, one danmaku per line (UTF-8)."""
    with open(path, 'w', encoding='utf-8') as file:
        # Collapse embedded line breaks so every danmaku stays on one line.
        file.writelines(' '.join(d.splitlines()) + '\n' for d in danmuku_list)


def load_danmuku(path: str = DANMUKU_FILE) -> List[str]:
    """Read *path* and return its non-blank lines without surrounding whitespace.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    with open(path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [line for line in stripped if line]


def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain at least one keyword, case-insensitively.

    The order of *danmuku_list* is preserved.
    """
    # A set removes keywords that only differ in case (e.g. 'AI配音'/'ai配音').
    keywords_lower = {keyword.lower() for keyword in keywords}
    return [d for d in danmuku_list
            if any(keyword in d.lower() for keyword in keywords_lower)]


def count_top_danmuku(danmuku_list: Iterable[str], top_n: int = TOP_N):
    """Return the *top_n* most frequent danmaku as ``(content, count)`` pairs.

    Entries are stripped before counting so that trailing newlines or spaces
    do not split identical danmaku into separate buckets; blank entries are
    ignored.
    """
    stripped = (d.strip() for d in danmuku_list)
    return Counter(d for d in stripped if d).most_common(top_n)


def export_statistics(most_common, path: str = EXCEL_FILE) -> None:
    """Write the ``(content, count)`` pairs to *path* as a two-column sheet."""
    df = pd.DataFrame(most_common, columns=['弹幕内容', '数量'])
    df.to_excel(path, index=False)


def draw_wordcloud(excel_path: str = EXCEL_FILE, image_path: str = WORDCLOUD_FILE) -> None:
    """Render a word cloud from the '弹幕内容' column of *excel_path*.

    The image is saved to *image_path* and then displayed. Failures are
    printed rather than raised.
    """
    try:
        df = pd.read_excel(excel_path)
        if '弹幕内容' not in df.columns:
            print("Excel 文件中没有找到 '弹幕内容' 列")
            return
        # astype(str): cells read back from Excel may be numeric, and
        # str.join only accepts strings.
        text = ' '.join(df['弹幕内容'].dropna().astype(str))
        wordcloud = WordCloud(font_path=font_path, width=800, height=400,
                              background_color='white').generate(text)
        # Save before showing so the image exists even if the display fails
        # or the window is closed abnormally.
        wordcloud.to_file(image_path)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    except FileNotFoundError:
        print("文件未找到,请检查文件路径")
    except ValueError as ve:
        print(ve)
    except Exception as e:  # last-resort boundary for this best-effort step
        print(f"发生错误: {e}")


def main() -> None:
    """Run the whole pipeline: crawl, save, filter, count, export, plot."""
    crawled = crawl_danmuku()
    if crawled:
        # Persist the crawl so the analysis below reads what was just fetched.
        save_danmuku(crawled)
    # An empty crawl leaves any previously saved file untouched; if no file
    # exists at all, load_danmuku raises FileNotFoundError.
    danmuku_all = load_danmuku()

    filtered_danmuku = filter_danmuku(danmuku_all, keywords)
    most_common = count_top_danmuku(filtered_danmuku)
    export_statistics(most_common)
    print(f"前{TOP_N}位弹幕统计已保存到 '{EXCEL_FILE}'.")

    draw_wordcloud()


if __name__ == "__main__":
    main()