You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

131 lines
5.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import random
import re
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import imageio  # read a local image to shape the word cloud (pip install imageio)
import jieba  # Chinese word segmentation (pip install jieba)
import matplotlib.pyplot as plt
import requests
import wordcloud  # word-cloud rendering (pip install wordcloud)
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
keywords = [
'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自动化',
'算法', '数据科学', '智能算法', '自然语言处理', '计算机视觉',
'智能机器人', '智能系统', '人工智能技术', 'AI技术', 'AI应用',
'智能设备', '智能分析', 'AI模型', '大数据', '预测分析',
'模式识别', '语音识别', '图像识别', '机器人技术', '数据挖掘',
'智能决策', '虚拟助手', '增强现实', '计算智能', '自适应系统',
'智能网络', '知识图谱', '智能交互', 'AI解决方案', '计算机智能',
'自然语言生成', '深度神经网络', '强化学习', '迁移学习', '生成对抗网络',
'智能预测', '智慧城市', '智能制造', '机器视觉', '自动驾驶',
'智能传感器', '智能控制', '智能推荐', '计算机科学', '人工智能应用',
'人工智能发展', 'AI伦理', '人工智能安全', '智能算法应用', '数据分析',
'智能化', '智能化技术', '算法优化', '机器智能', '情感计算','ai'
]
videosnumber = 0
# 获取搜索结果页面的内容
def get_search_page(search_url):
response = requests.get(search_url, headers=headers)
response.raise_for_status() # 确保请求成功
return response.text
# 提取页面中所有视频的链接
def extract_video_links(page_content):
soup = BeautifulSoup(page_content, 'html.parser')
video_links = []
# 选择器根据实际网页结构可能需要调整
for a_tag in soup.select(".video-list.row div.bili-video-card > div > a"):
link = a_tag.get('href')
video_links.append(link)
return video_links
# 提取视频的BV号
def extract__BV(video_urls):
links=[]
for video_url in video_urls:
video_id_match = re.search(r'/video/([^/]+)', video_url)
if video_id_match:
links.append(video_id_match.group(1))
return links
def get_cid_from_bv(bv_ids):
cids=[]
for bv_id in bv_ids:
# 视频详情 API 地址
video_url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
# 发送请求
response = requests.get(video_url, headers=headers)
response.raise_for_status()
data = response.json()
# 提取 cid
if data.get('code') == 0:
cid = data.get('data', {}).get('cid')
cids.append(cid)
return cids
def get_danmu(id):
global videosnumber
video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={id}'
response = requests.get(video_url, headers=headers) #要爬取的网址
response.encoding='utf-8' #编码方式
html = response.text
soup = BeautifulSoup(html) #使用beautifulsoup库快速查找我们想要的信息
all_txt = soup.findAll("d") #寻找到所有包含d的行
txt=[all_txts.attrs ["p"]for all_txts in all_txt] #寻找到所有包含d的行中属性为p的值这里边包含了弹幕的虚拟id等
txtss=[all_txts.string for all_txts in all_txt] #寻找到所有包含d的行中的字符串数据即弹幕内容
txtsss=[txts.replace(' ','') for txts in txtss] #将字符串中的空格消除掉
videosnumber = videosnumber +1
bulletnumber = len(txtsss)
print( f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕")
time.sleep(random.randint(0,2)+random.random())
return(txtsss) ###打印便可看见一条条弹幕的属性和内容了。
def page(url,num):
num=num+1
url=f'https://search.bilibili.com/video?keyword=2024巴黎奥运会&page={num}'
return url
def chuli(alltxt):
danmustr=''.join(i for i in alltxt) #将所有弹幕拼接在一起
words=list(jieba.cut(danmustr)) ###利用jieba库将弹幕按词进行切分
words=[i for i in words if len(i)>1] ###挑出长度大于1的词语为去除诸如啊等字符
wc=wordcloud.WordCloud(height=1000,width=1000,font_path='simsun.ttc')#利用wordcloud库定义词云图片的信息
wc.generate(' '.join(words)) ##生成图片
print(wc)
plt.imshow(wc)
plt.show()
# 主函数
def main(kword,mubiao):
search_url= f'https://search.bilibili.com/video?keyword={kword}'
print(search_url)
for i in range(100):
search_url=page(search_url,i)
page_content = get_search_page(search_url)
video_links = extract_video_links(page_content)
bvs = extract__BV(video_links)
alltxt=[]
cids = []
cids = get_cid_from_bv(bvs)
for id in cids:
if(videosnumber>=mubiao): break
txt = get_danmu(id)
alltxt=alltxt + txt
if(videosnumber>=mubiao): break
return(alltxt)
# 示例搜索页 URL需要替换为实际的搜索页 URL
keword = "2024巴黎奥运会"
flag = 5 #你要爬的视频数量
alltxt=main(keword,flag)
chuli(alltxt)