"""Scrape Bilibili danmaku (bullet comments) for a search keyword.

Pipeline: fetch search-result pages, collect the video BV ids, resolve them to
cids, download each video's danmaku, then render a word cloud, save the
danmaku to Excel and print the most frequent AI-related danmaku.
"""

import random
import re
import time
from collections import Counter
from urllib.parse import parse_qs, urlsplit

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
import wordcloud
from bs4 import BeautifulSoup
from pandas import ExcelWriter

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}

# Tokens that mark a danmaku as AI-related (matched against jieba tokens).
keywords = [
    'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自动化', '算法', '数据科学',
    '智能算法', '自然语言处理', '计算机视觉', '智能机器人', '智能系统', '人工智能技术',
    'AI技术', 'AI应用', '智能设备', '智能分析', 'AI模型', '大数据', '预测分析',
    '智能预测', '智慧城市', '智能制造', '机器视觉', '自动驾驶', '智能传感器',
    '智能控制', '智能推荐', '计算机科学', '人工智能应用', '人工智能发展', 'AI伦理',
    '人工智能安全', '智能算法应用', '数据分析', '智能化', '智能化技术', '算法优化',
    '机器智能', '情感计算', 'ai',
]

# Number of videos whose danmaku have been fetched so far (module-wide counter;
# it is NOT reset between calls to main()).
videosnumber = 0

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Upper bound on the number of search-result pages main() will walk through.
MAX_PAGES = 100

# Keyword used by page() when neither an explicit keyword nor the URL gives one.
DEFAULT_KEYWORD = '2024巴黎奥运会'

# Control characters that openpyxl refuses to write into a cell.
_ILLEGAL_XLSX_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def get_search_page(search_url):
    """Return the HTML text of a search-result page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on timeouts and connection failures.
    """
    response = requests.get(search_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail loudly on a non-2xx answer
    return response.text


def extract_video_links(page_content):
    """Return the href of every video card link found in *page_content*.

    Anchors without an ``href`` attribute are skipped so callers only ever
    receive strings.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    anchors = soup.select(".video-list.row div.bili-video-card > div > a")
    return [a_tag.get('href') for a_tag in anchors if a_tag.get('href')]


def extract__BV(video_urls):
    """Return the BV id of each URL in *video_urls* that contains ``/video/<id>``.

    Entries that are not strings or do not match are ignored. The id stops at
    the next ``/``, ``?`` or ``#`` so a query string is never captured.
    """
    bv_pattern = re.compile(r'/video/([^/?#]+)')
    bv_ids = []
    for video_url in video_urls:
        if not isinstance(video_url, str):
            continue
        match = bv_pattern.search(video_url)
        if match:
            bv_ids.append(match.group(1))
    return bv_ids


def get_cid_from_bv(bv_ids):
    """Resolve each BV id to its cid through the video-detail API.

    Ids for which the API reports a non-zero ``code`` or returns no cid are
    skipped.

    Raises:
        requests.HTTPError: if a request returns an error status.
    """
    cids = []
    for bv_id in bv_ids:
        # Video-detail API endpoint.
        video_url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
        response = requests.get(video_url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        if data.get('code') != 0:
            continue
        # 'data' may be null in the payload, so guard before reading 'cid'.
        cid = (data.get('data') or {}).get('cid')
        if cid is not None:
            cids.append(cid)
    return cids


def get_danmu(id):
    """Download the danmaku of the video with cid *id*.

    Returns the danmaku texts with all spaces removed. Increments the global
    ``videosnumber`` counter, prints a progress line and then sleeps for a
    random 0-3 seconds to throttle requests.

    The parameter keeps its original name ``id`` (which shadows the builtin)
    so existing callers are unaffected.
    """
    global videosnumber
    video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={id}'
    response = requests.get(video_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each <d> element carries one danmaku; its text is the comment itself.
    danmu_nodes = soup.find_all("d")
    # .string is None for empty elements, which would break .replace().
    danmu_texts = [
        node.string.replace(' ', '')
        for node in danmu_nodes
        if node.string is not None
    ]
    videosnumber = videosnumber + 1
    bulletnumber = len(danmu_texts)
    print(f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕")
    time.sleep(random.randint(0, 2) + random.random())
    return danmu_texts


def page(url, num, keyword=None):
    """Return the search URL for result page ``num + 1``.

    Args:
        url: a previous search URL; its ``keyword`` query parameter is reused
            when *keyword* is not given.
        num: zero-based page index.
        keyword: search keyword. When omitted, it is taken from *url*, and if
            the URL has none, ``DEFAULT_KEYWORD`` is used.
    """
    if keyword is None:
        found = parse_qs(urlsplit(url).query).get('keyword') if url else None
        keyword = found[0] if found else DEFAULT_KEYWORD
    num = num + 1
    return f'https://search.bilibili.com/video?keyword={keyword}&page={num}'


def chuli(alltxt):
    """Render and display a word cloud built from the danmaku in *alltxt*.

    The danmaku are concatenated, tokenised with jieba, and tokens of length 1
    are dropped (this removes punctuation and interjections). Nothing is drawn
    when no token is left, because WordCloud cannot be generated from an empty
    text.
    """
    danmustr = ''.join(alltxt)
    words = [token for token in jieba.cut(danmustr) if len(token) > 1]
    if not words:
        print('没有可用于生成词云的词语')
        return
    wc = wordcloud.WordCloud(height=1000, width=1000, font_path='simsun.ttc')
    wc.generate(' '.join(words))
    print(wc)
    plt.imshow(wc)
    plt.show()


def sort(txt, keywords):
    """Count the danmaku in *txt* that contain at least one keyword token.

    A danmaku matches when any of its jieba tokens equals an entry of
    *keywords*. Returns a ``Counter`` mapping each matching danmaku to the
    number of times it occurs.
    """
    keyword_set = set(keywords)  # O(1) membership tests inside the loop
    comment_counter = Counter()
    for line in txt:
        if any(word in keyword_set for word in jieba.cut(line)):
            comment_counter[line] += 1
    return comment_counter


def save_to_excel(danmu_data, filename='danmu_data.xlsx'):
    """Write *danmu_data* to *filename* as a single-column Excel sheet.

    Control characters that openpyxl rejects are stripped from string entries
    first so one odd danmaku cannot abort the whole export.
    """
    cleaned = [
        _ILLEGAL_XLSX_CHARS.sub('', item) if isinstance(item, str) else item
        for item in danmu_data
    ]
    df = pd.DataFrame(cleaned, columns=['弹幕'])
    with ExcelWriter(filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)


def main(kword, mubiao):
    """Collect danmaku from videos matching *kword*.

    Walks the search-result pages (at most ``MAX_PAGES``) until the global
    ``videosnumber`` counter reaches *mubiao* or a page yields no videos.

    Args:
        kword: search keyword.
        mubiao: target number of videos to fetch danmaku from.

    Returns:
        A list with the danmaku of all visited videos, across all pages.
    """
    search_url = f'https://search.bilibili.com/video?keyword={kword}'
    alltxt = []  # accumulated across pages, so it is created once up front
    for i in range(MAX_PAGES):
        if videosnumber >= mubiao:
            break
        search_url = page(search_url, i, kword)
        page_content = get_search_page(search_url)
        bvs = extract__BV(extract_video_links(page_content))
        if not bvs:
            # No results on this page: later pages will not have any either.
            break
        # Do not resolve more videos than are still needed.
        remaining = mubiao - videosnumber
        for cid in get_cid_from_bv(bvs[:remaining]):
            if videosnumber >= mubiao:
                break
            alltxt.extend(get_danmu(cid))
    return alltxt


keword = "2024巴黎奥运会"  # search keyword for the videos
flag = 300  # number of videos to scrape

if __name__ == "__main__":
    alltxt = main(keword, flag)
    chuli(alltxt)
    save_to_excel(alltxt)

    # The ranking needs the collected danmaku, so it runs after main().
    comment_counter = sort(alltxt, keywords)
    top_comments = comment_counter.most_common(8)

    # Print the eight most frequent AI-related danmaku.
    for comment, count in top_comments:
        print(f'弹幕: {comment}, 数量: {count}')