import time

import requests
from bs4 import BeautifulSoup


def get_300videos_urls(keyword, target=300, max_pages=50, timeout=10):
    """Collect unique video URLs from Bilibili search result pages.

    Search pages for *keyword* are fetched one at a time, starting at page 1,
    and every anchor whose ``href`` contains ``/video/`` is collected in the
    order it is first seen.

    Args:
        keyword: Search term. It is passed as a query parameter, so special
            characters (``&``, ``#``, spaces, ...) are URL-encoded.
        target: Number of unique URLs to collect before stopping.
        max_pages: Upper bound on the number of result pages requested, so the
            function terminates even when fewer than *target* links exist or
            the server keeps answering with a non-200 status.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of at most *target* unique URL strings. The list may be shorter
        (or empty) when the search runs out of results; "get urls failed" is
        printed when nothing was collected.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    search_url = "https://search.bilibili.com/video"
    # Browser-like User-Agent; built once because it never changes per page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }

    urls = []     # collected links, in first-seen order
    seen = set()  # same links, for O(1) duplicate checks

    for page in range(1, max_pages + 1):
        if len(urls) >= target:
            break

        response = requests.get(
            search_url,
            params={"keyword": keyword, "page": page},
            headers=headers,
            timeout=timeout,
        )
        time.sleep(0.5)  # throttle between consecutive page requests

        if response.status_code != 200:
            # Skip this page; max_pages keeps repeated failures from looping
            # forever.
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        found_new = False

        # Look at every <a> tag that carries an href attribute.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]

            # Protocol-relative URLs ("//host/...") get an explicit scheme.
            if href.startswith("//"):
                href = "https:" + href

            # Keep only links that contain "/video/" and were not seen before.
            if "/video/" not in href or href in seen:
                continue

            seen.add(href)
            urls.append(href)
            found_new = True
            if len(urls) >= target:
                break

        if not found_new:
            # A successfully fetched page with nothing new means the results
            # are exhausted; requesting further pages would be wasted work.
            break

    if not urls:
        print("get urls failed")
    return urls