You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import requests
import re
import time
def get_videourl(pages=1):
    """Collect the identifying URL segment of each video in a Bilibili search.

    Requests the search-result page(s) for the keyword "巴黎奥运会" (Paris
    Olympics), concatenates the returned HTML, and extracts every path
    segment that appears between ``video/`` and the next ``/``.

    Args:
        pages: Number of result pages to request. Defaults to 1, which is
            what the loop was hard-coded to before this parameter existed
            (the original author's note mentions crawling 10 pages).

    Returns:
        A list of the distinct extracted segments. Duplicates are removed
        through a ``set``, so the order of the list is not defined.
    """
    # URL obtained by searching the Bilibili site for the keyword "巴黎奥运会".
    url = 'https://search.bilibili.com/video?vt=97225548&keyword=巴黎奥运会&from_source=webtop_search&spm_id_from=333.1007&search_source=2'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    }
    # The two query parameters that select which results page is returned.
    # NOTE(review): the starting values (page=0, o=0) are kept from the
    # original; confirm whether the site numbers pages from 0 or 1.
    params = {
        'page': 0,
        'o': 0,
    }

    # Gather the HTML of every requested page; joined once after the loop
    # instead of growing a string with repeated concatenation.
    page_sources = []
    while params['page'] < pages:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.encoding = "utf-8"
        page_sources.append(response.text)

        # Advance to the next page: per the original author's note each page
        # holds 30 videos, so the offset moves in steps of 30.
        params['page'] += 1
        params['o'] += 30
        # Pause between requests to avoid hitting the site too quickly.
        time.sleep(1)
    html = "".join(page_sources)

    # Extract the segment after "video/" from every video link, then drop
    # duplicates.
    pattern = re.compile(r"video/(?P<surl>.*?)/", re.S)
    matches = pattern.findall(html)
    return list(set(matches))
# print(len(urlist))
# print(urlist)
# path = Path('gurl.json')
# contents = json.dumps(urlist)
# path.write_text((contents))