import requests
from bs4 import BeautifulSoup
import re
import time
import random
import jieba
import wordcloud
import matplotlib.pyplot as plt
import pandas as pd
from pandas import ExcelWriter
from collections import Counter

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}

# AI-related keywords used to filter comments (kept in Chinese, since the danmaku are Chinese)
keywords = [
    'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自动', '算法',
    '数据科学', '自然语言', '计算机', '人工智能技术', '大数据', '预测分析',
    '机器视觉', '智能', '人工智能应用', '数据分析', '情感计算', 'ai',
]

videosnumber = 0

# Fetch the raw HTML of a search-results page
def get_search_page(search_url):
    response = requests.get(search_url, headers=headers)
    response.raise_for_status()  # make sure the request succeeded
    return response.text

# Extract the links of all videos on the page
def extract_video_links(page_content):
    soup = BeautifulSoup(page_content, 'html.parser')
    video_links = []
    for a_tag in soup.select(".video-list.row div.bili-video-card > div > a"):
        link = a_tag.get('href')
        video_links.append(link)
    return video_links

# Extract each video's BV id from its URL
def extract_bv(video_urls):
    links = []
    for video_url in video_urls:
        video_id_match = re.search(r'/video/([^/]+)', video_url)
        if video_id_match:
            links.append(video_id_match.group(1))
    return links

# Convert BV ids to cids via the video-detail API
def get_cid_from_bv(bv_ids):
    cids = []
    for bv_id in bv_ids:
        video_url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
        response = requests.get(video_url, headers=headers)
        response.raise_for_status()
        data = response.json()
        if data.get('code') == 0:
            cid = data.get('data', {}).get('cid')
            cids.append(cid)
    return cids

# Fetch the danmaku (bullet comments) of one video by cid
def get_danmu(cid):
    global videosnumber
    video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    response = requests.get(video_url, headers=headers)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')  # the endpoint returns XML; each <d> tag is one comment
    all_txt = soup.find_all("d")
    # each <d> tag also carries a "p" attribute with metadata (timestamp, mode, color,
    # sender hash, ...); the tag's text is the comment itself
    txtss = [d.string for d in all_txt]
    txtsss = [t.replace(' ', '') for t in txtss if t]  # strip spaces, drop empty comments
    videosnumber += 1
    print(f"Video #{videosnumber}: fetched {len(txtsss)} danmaku")
    time.sleep(random.randint(0, 2) + random.random())  # random delay to avoid hammering the API
    return txtsss

# Build the URL of the next search-results page for a keyword
def page(kword, num):
    return f'https://search.bilibili.com/video?keyword={kword}&page={num + 1}'

# Build a word cloud from all collected danmaku
def wcloud(alltxt):
    danmustr = ''.join(alltxt)                 # join all comments into one string
    words = list(jieba.cut(danmustr))          # segment into words with jieba
    words = [w for w in words if len(w) > 1]   # keep words longer than one char (drops ?, 哈, 啊, ...)
    wc = wordcloud.WordCloud(height=1000, width=1000, font_path='simsun.ttc')
    wc.generate(' '.join(words))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

# Count comments that contain any of the AI-related keywords
def sort(txt, keywords):
    comment_counter = Counter()
    for line in txt:
        if any(word in keywords for word in jieba.cut(line)):
            comment_counter[line] += 1
    return comment_counter

# Save the comments to an Excel file
def save_to_excel(danmu_data, filename='danmu_data.xlsx'):
    df = pd.DataFrame(danmu_data, columns=['弹幕'])
    with ExcelWriter(filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)

# Main routine: crawl search pages until `mubiao` videos have been processed
def main(kword, mubiao):
    alltxt = []  # accumulate across pages (must be initialized outside the loop)
    for i in range(100):
        search_url = page(kword, i)
        page_content = get_search_page(search_url)
        video_links = extract_video_links(page_content)
        bvs = extract_bv(video_links)
        cids = get_cid_from_bv(bvs)
        for cid in cids:
            if videosnumber >= mubiao:
                break
            alltxt += get_danmu(cid)
        if videosnumber >= mubiao:
            break
    return alltxt

keword = "2024巴黎奥运会"  # search keyword
flag = 10                  # number of videos to crawl
alltxt = main(keword, flag)
wcloud(alltxt)
save_to_excel(alltxt)
comment_counter = sort(alltxt, keywords)
top_comments = comment_counter.most_common(8)

# Print the top-8 AI-related comments
for comment, count in top_comments:
    print(f'danmaku: {comment}, count: {count}')
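
# Optional extension (a minimal sketch, not part of the main flow above): get_danmu
# only keeps the comment text, but each <d> tag's "p" attribute holds comma-separated
# metadata. To the best of my knowledge the leading fields are: offset in the video
# (seconds), display mode, font size, color (decimal RGB), and send time (Unix
# timestamp) — treat this layout as an assumption and verify it against a live response.
def parse_danmu_meta(d_tag):
    fields = d_tag.attrs['p'].split(',')
    return {
        'offset_sec': float(fields[0]),  # position within the video
        'mode': int(fields[1]),          # 1-3 scrolling, 4 bottom, 5 top (assumed)
        'font_size': int(fields[2]),
        'color': int(fields[3]),         # decimal RGB value
        'sent_at': int(fields[4]),       # Unix timestamp of when it was sent
    }

# Example (hypothetical): inside get_danmu, parse_danmu_meta(all_txt[0]) would return
# a dict of metadata for the first comment alongside its text.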