import requests
from bs4 import BeautifulSoup
import time
import random
import openpyxl
from collections import Counter
import jieba
import wordcloud

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4083.0 Safari/537.36 Edg/82.0.458.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'en-US,en;q=0.9'
}


# Fetch the bvids of the first 300 hot videos and save them
def get_video_ids(api_urls):
    video_ids = []
    # The search API expects a logged-in session; replace the cookie with your own
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'cookie': 'BIDUPSID=B217BCFBC37D845BE7576B36283B7200; PSTM=1694936551; BAIDUID_BFESS=B217BCFBC37D845B41B0963C72424B9E:FG=1; ZFY=8a5uERFpWfnCU59:Bf7xfjug61O89yJEG1n4GF:BPaY1c:C; BDRCVFR[bPTzwF-RsLY]=mk3SLVN4HKm; H_PS_PSSID=60724_60360_60799; BD_HOME=1; BD_UPN=12314753; BA_HECTOR=058ka00hahalak0gak21a1akapkf6v1jelat81u'
    }
    # Counter for how many ids have been collected so far
    cnt = 0
    # Pull video entries from multiple result pages
    for page in range(1, 22):
        # Advance to the next page
        api_url = api_urls + str(page)
        # Fetch and decode the result payload
        response = requests.get(api_url, headers=headers)
        response.encoding = 'utf-8'
        Json = response.json()
        Datas = Json['data']['result']
        for Data in Datas:
            # Use try/except to skip malformed entries Bilibili mixes into the results
            try:
                bvids = Data['bvid']
                video_ids.append(bvids)
                with open('bv.txt', mode='a', encoding='utf-8') as f:
                    # Append the bvid to the file
                    f.write(bvids + '\n')
                cnt += 1
                if cnt >= 300:
                    break
            except (KeyError, TypeError):
                continue
        if cnt >= 300:
            break
    # print(bvids)
    return video_ids


# API URL (the page number is appended per request)
api_urls = 'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page='
video_ids = get_video_ids(api_urls)


# Resolve a single bvid to a cid
def get_cid_from_bv(bv_number, p_number=0):
    try:
        url = 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'
        response = requests.get(url.format(bv_number), headers=headers)
        if response.status_code == 200:
            data = response.json()['data']
            if p_number < len(data):
                return data[p_number]['cid']
            else:
                print(f'Error: Part number out of range for BV code {bv_number}.')
                return None
        else:
            print(f'Error: Failed to retrieve CID for BV code {bv_number}. Status code: {response.status_code}')
            return None
    except Exception as e:
        print(f'Error: {str(e)} for BV code {bv_number}.')
        return None
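# --- Hedged sketch, not part of the original script ---
# The requests above and below run with no failure handling, so one transient
# network error or HTTP 5xx aborts the whole crawl. A small retry wrapper such
# as this one could be slotted in around requests.get; the name fetch_json and
# the retries/backoff parameters are illustrative assumptions, not anything
# the Bilibili API requires.
def fetch_json(url, req_headers, retries=3, backoff=2.0):
    """Fetch a URL and decode JSON, retrying on transient failures."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=req_headers, timeout=10)
            resp.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
            return resp.json()
        except (requests.RequestException, ValueError):
            if attempt == retries - 1:
                raise  # out of retries; let the caller decide
            time.sleep(backoff * (attempt + 1))  # back off a little longer each try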
# Convert a list of bvids to cids
def get_cids_from_bv_list(video_ids):
    cid_list = []
    for bv_code in video_ids:
        # get_cid_from_bv already returns None on failure, so append either way
        cid_list.append(get_cid_from_bv(bv_code))
    return cid_list


cids = get_cids_from_bv_list(video_ids)
print(f'The corresponding CIDs for the provided BV codes are: {cids}')


# Fetch the danmaku of a single video through its cid (called per video below)
def get_danmu(cid):
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    danmu_list = []
    # Each <d> element of the returned XML wraps one danmaku's text
    for danmu in soup.find_all('d'):
        danmu_list.append(danmu.text)
    return danmu_list


# Crawl the danmaku of every video
with open('all_danmu.txt', 'w', encoding='utf-8') as f:
    for video_id in cids:
        if video_id is None:
            # The cid lookup failed for this video; skip it
            continue
        danmu_list = get_danmu(video_id)
        print(f'Danmaku count for video {video_id}: {len(danmu_list)}')
        # Write this video's danmaku to the file
        f.write(f'== Saving danmaku of video {video_id} ==\n')
        f.write('\n'.join(danmu_list) + '\n')
        # Sleep 1-3 seconds at random to avoid getting banned
        time.sleep(random.uniform(1, 3))

print('All danmaku have been saved to all_danmu.txt')

# 1. Read the danmaku file
filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    danmus = f.readlines()

# 2. Keep only the danmaku related to AI technology
ai_keywords = ['AI', '人工智能']
ai_related_danmus = [danmu for danmu in danmus if any(keyword in danmu for keyword in ai_keywords)]

# 3. Count how many times each danmaku occurs
danmu_counter = Counter(ai_related_danmus)

# 4. Take the 8 most frequent danmaku
top_danmus = danmu_counter.most_common(8)

# 5. Write the statistics to an Excel sheet
excel_filename = 'AI相关弹幕统计.xlsx'
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'AI相关弹幕统计'
sheet['A1'] = '弹幕内容'  # danmaku text
sheet['B1'] = '出现次数'  # occurrence count

# Write the top-8 danmaku and their counts
for idx, (danmu, count) in enumerate(top_danmus, start=2):
    sheet[f'A{idx}'] = danmu.strip()  # strip the trailing newline
    sheet[f'B{idx}'] = count

# Save the workbook
wb.save(excel_filename)
print(f'AI-related danmaku statistics saved to {excel_filename}')

# Read the danmaku file again for the word cloud
filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

# Tokenize with jieba
word_list = jieba.lcut(text)
text_str = ' '.join(word_list)

# Build the word cloud
wc = wordcloud.WordCloud(font_path='C:/Windows/Fonts/msyh.ttc',  # a font with Chinese glyphs (Microsoft YaHei)
                         width=800, height=600,  # canvas size
                         background_color='white',
                         stopwords={'哈哈', '哈哈哈', '的', '是', '了', '我', '和', '这', '也', '你', '啊', '吧', '就是', '这个', '吗', '他', '不是', '真的', '都', '在', '现在', '感觉', '看', '有', '不'},
                         contour_width=1, contour_color='blue')
wc.generate(text_str)

# Save the word cloud image to the current folder
wc.to_file('wordcloud.png')
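# --- Hedged sketch, not part of the original script ---
# For reference, get_danmu relies on the danmaku endpoint returning XML in
# which each <d> element wraps one comment's text. The payload below is
# made up (the p attribute values are illustrative, not real timestamps),
# but it shows exactly what the BeautifulSoup loop extracts:
sample_xml = ('<i><d p="1.2,1,25,16777215,0,0,0,0">demo danmu 1</d>'
              '<d p="3.4,1,25,16777215,0,0,0,0">demo danmu 2</d></i>')
sample_soup = BeautifulSoup(sample_xml, 'html.parser')
assert [d.text for d in sample_soup.find_all('d')] == ['demo danmu 1', 'demo danmu 2']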