"""Collect the video identifiers found on Bilibili search-result pages."""

import re
import time

# Captures the path segment that follows "video/" in a link, e.g. the
# "BV..." part of ".../video/BV.../".  The named group must carry a name:
# a bare "(?P" is rejected by the ``re`` module at compile time.
_VIDEO_ID_RE = re.compile(r"video/(?P<video_id>.*?)/", re.S)

# Search URL obtained by searching Bilibili for the keyword "巴黎奥运会".
_SEARCH_URL = 'https://search.bilibili.com/video?vt=97225548&keyword=巴黎奥运会&from_source=webtop_search&spm_id_from=333.1007&search_source=2'

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

# The ``o`` query parameter advances by this amount for every page.
_RESULTS_PER_PAGE = 30


def extract_video_ids(html):
    """Return the distinct video identifiers referenced in *html*.

    Args:
        html: Page source to scan for ``video/<id>/`` links.

    Returns:
        A list of identifier strings with duplicates removed, in order of
        first appearance (so the result is deterministic).
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, unlike
    # set(), whose iteration order is arbitrary.
    return list(dict.fromkeys(_VIDEO_ID_RE.findall(html)))


def get_videourl(max_pages=1, timeout=10):
    """Fetch search-result pages and return the video identifiers they link to.

    Args:
        max_pages: Number of result pages to request.  Defaults to 1, which
            is what the original loop fetched; a value <= 0 performs no
            request and returns an empty list.
        timeout: Seconds to wait for each HTTP response before giving up,
            so a stalled connection cannot hang the caller forever.

    Returns:
        A list of unique identifier strings, in order of first appearance.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    if max_pages <= 0:
        return []

    # Imported here so that the pure parsing helper above stays importable
    # even where the third-party HTTP client is not installed.
    import requests

    found = []
    for page in range(max_pages):
        # The two parameters that select a result page.
        # NOTE(review): numbering starts at page=0 / o=0 exactly as the
        # original code did -- confirm this is what the site expects.
        params = {'page': page, 'o': page * _RESULTS_PER_PAGE}
        response = requests.get(
            _SEARCH_URL, params=params, headers=_HEADERS, timeout=timeout
        )
        response.encoding = "utf-8"
        # Parse each page on its own so a lazy match can never run across
        # the boundary between two concatenated documents.
        found.extend(extract_video_ids(response.text))
        # Pause between requests; no need to wait after the last one.
        if page + 1 < max_pages:
            time.sleep(1)

    return list(dict.fromkeys(found))