|
|
|
@ -0,0 +1,44 @@
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
from selenium.webdriver.chrome.service import Service
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
|
|
|
|
|
def get_urls(query, number, chrome_driver_path=r'D:\chromedriver-win64\chromedriver.exe'):
    """Collect video links from Bilibili search result pages.

    Opens the search result pages for *query* one after another in a
    headless Chrome session and gathers the ``href`` of every anchor that
    matches the result-card selector, until *number* distinct links have
    been collected or a page contributes no new link.

    Args:
        query: Search keyword; it is percent-encoded before being placed in
            the URL.
        number: Maximum number of distinct links to collect.
        chrome_driver_path: Location of the chromedriver executable.

    Returns:
        A set of URL strings. It may hold fewer than *number* items when the
        search results run out.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    collected = set()
    if number <= 0:
        # Nothing requested: do not start a browser at all.
        return collected

    service = Service(chrome_driver_path)
    options = Options()
    options.add_argument('--headless')  # run in the background, no browser window

    driver = webdriver.Chrome(service=service, options=options)  # start the WebDriver
    try:
        # Let element lookups wait for the result list instead of reading the
        # DOM right after navigation; an empty page then costs one timeout.
        driver.implicitly_wait(10)

        keyword = quote(str(query), safe='')
        page = 1
        while len(collected) < number:
            search_url = f'https://search.bilibili.com/video?keyword={keyword}&page={page}'
            driver.get(search_url)  # load the result page

            # Find every <a> tag matching the result-card selector.
            anchors = driver.find_elements(
                By.CSS_SELECTOR,
                ".video-list.row div.bili-video-card > div > a",
            )

            count_before = len(collected)
            for anchor in anchors:
                href = anchor.get_attribute('href')
                if href:  # skip anchors without an href (get_attribute gives None)
                    collected.add(href)
                if len(collected) >= number:
                    break

            if len(collected) == count_before:
                # This page added nothing new; requesting further pages would
                # loop forever once the results are exhausted.
                break
            page += 1
    finally:
        driver.quit()  # always close the browser, even when an error occurred

    return collected
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: collect links for a fixed keyword and append them,
    # one per line, to ./urls.txt.
    query = '2024巴黎奥运会'
    number = 300

    url_list = get_urls(query=query, number=number)

    # Open the output file once (append mode) rather than once per URL.
    with open('./urls.txt', mode='a', encoding='utf-8') as f:
        for cnt, url in enumerate(url_list, start=1):
            f.write(url + "\n")
            print(f"url : {cnt}/{number}")  # progress against the requested total
|
|
|
|
|
|
|
|
|
|
|