parent ad5d1ab257
commit 9e7d1e7360
@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup
import time


def get_300videos_urls(keyword):
    """Fetch up to 300 video URLs from Bilibili search results."""
    page = 1
    urls = []  # list that stores the collected video links

    while len(urls) < 300:
        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
        }

        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            time.sleep(0.5)  # throttle to avoid sending requests too frequently

            soup = BeautifulSoup(response.text, 'html.parser')

            new_links = 0  # links found on this page, used to detect exhausted results

            # Find every <a> tag that carries a video link
            for item in soup.find_all('a', href=True):
                href = item['href']

                # Normalize protocol-relative URLs (those starting with //)
                if href.startswith('//'):
                    href = 'https:' + href

                # Keep only links to /video/ pages, skipping duplicates
                if '/video/' in href and href not in urls:
                    urls.append(href)
                    new_links += 1

                # Stop scanning this page once 300 links have been collected
                if len(urls) >= 300:
                    break

            # Stop paging once 300 links have been collected
            if len(urls) >= 300:
                break

            # No new links on this page: results are exhausted, so stop
            # instead of paging forever
            if new_links == 0:
                break

            page += 1  # move on to the next results page

        except requests.RequestException as e:
            print(f"Error fetching URL from page {page}: {e}")
            break  # bail out instead of retrying the same page indefinitely

    if not urls:
        print("Failed to retrieve any URLs.")
    else:
        print(f"Retrieved {len(urls)} URLs.")

    return urls
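

# --- Usage sketch (illustrative addition, not part of the original commit) ---
# Assuming this file is run directly, the block below collects up to 300
# result URLs for a placeholder keyword and prints the first few.
if __name__ == "__main__":
    video_urls = get_300videos_urls("fireworks")  # placeholder keyword
    for video_url in video_urls[:5]:
        print(video_url)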