|
|
@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_videourl(max_pages: int = 1) -> list:
    """Collect the identifier segment of each video URL on a Bilibili search page.

    Requests the Bilibili video search results for the keyword "巴黎奥运会"
    (Paris Olympics), and extracts every substring that sits between
    ``video/`` and the next ``/`` in the returned page source.

    Args:
        max_pages: Number of result pages to request. Defaults to 1, which
            is what the function fetched before this parameter existed.
            Values <= 0 perform no request and yield an empty list.

    Returns:
        A list of unique extracted strings, in first-seen order.

    Raises:
        requests.RequestException: If a request fails or exceeds the
            10-second timeout. The HTTP status code itself is not checked;
            whatever body comes back is scanned.
    """
    # URL obtained by searching bilibili.com for the keyword "巴黎奥运会".
    url = 'https://search.bilibili.com/video?vt=97225548&keyword=巴黎奥运会&from_source=webtop_search&spm_id_from=333.1007&search_source=2'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    # Compiled once, outside the loop. Captures the text between "video/"
    # and the following "/" (re.S lets the capture cross newlines).
    pattern = re.compile(r"video/(?P<surl>.*?)/", re.S)

    matches = []
    for page in range(max_pages):
        # 'page' and 'o' are the two query parameters that select the results
        # page; 'o' advances by 30 per page (the original notes 30 videos per page).
        # NOTE(review): numbering starts at page=0 as in the original code;
        # confirm that page=0 and page=1 do not return the same results.
        params = {'page': page, 'o': page * 30}

        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.encoding = "utf-8"

        # Match each page on its own so that a match can never straddle the
        # end of one page's source and the start of the next.
        matches.extend(pattern.findall(response.text))

        # Pause between requests to avoid hammering the server.
        time.sleep(1)

    # Remove duplicates; dict.fromkeys keeps first-seen order, so the result
    # is deterministic (a plain set would not be).
    return list(dict.fromkeys(matches))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# print(len(urlist))
|
|
|
|
|
|
|
|
# print(urlist)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# path = Path('gurl.json')
|
|
|
|
|
|
|
|
# contents = json.dumps(urlist)
|
|
|
|
|
|
|
|
# path.write_text((contents))
|