"""Collect Bilibili video-search result URLs with headless Chrome (get_urls.py)."""

from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Raw string so the backslashes of the Windows path are never parsed as
# escape sequences (the non-raw form relied on the invalid escape ``\c``).
DEFAULT_CHROME_DRIVER_PATH = r'D:\chromedriver-win64\chromedriver.exe'

# Selects the <a> tags of the video cards on the search result page.
_RESULT_LINK_SELECTOR = ".video-list.row div.bili-video-card > div > a"


def get_urls(query, number, *, chrome_driver_path=DEFAULT_CHROME_DRIVER_PATH,
             max_stale_pages=3):
    """Return up to ``number`` distinct result URLs for a Bilibili video search.

    Result pages are opened one after another (page 1, 2, ...) in a headless
    Chrome session and the ``href`` of every matching video-card link is
    collected until ``number`` distinct URLs have been gathered.

    Args:
        query: Search keyword; it is percent-encoded before being placed in
            the search URL.
        number: Maximum number of distinct URLs to collect.
        chrome_driver_path: Path to the chromedriver executable. Defaults to
            the path that was previously hard-coded.
        max_stale_pages: Give up after this many consecutive pages that add
            no new URL, so the loop cannot run forever when fewer than
            ``number`` results can be found.

    Returns:
        A ``set`` of URL strings containing at most ``number`` entries.
    """
    url_list = set()
    if number <= 0:
        # Nothing requested: do not start a browser at all.
        return url_list

    service = Service(chrome_driver_path)
    options = Options()
    options.add_argument('--headless')  # run in the background, no browser window

    driver = webdriver.Chrome(service=service, options=options)  # start WebDriver
    try:
        encoded_query = quote(query, safe='')
        page = 1
        stale_pages = 0
        while len(url_list) < number and stale_pages < max_stale_pages:
            search_url = (
                f'https://search.bilibili.com/video?keyword={encoded_query}&page={page}'
            )
            driver.get(search_url)  # open the result page

            count_before = len(url_list)
            # Add the href of every matching <a> tag to the result set.
            for element in driver.find_elements(By.CSS_SELECTOR, _RESULT_LINK_SELECTOR):
                href = element.get_attribute('href')
                if href:  # an <a> without href yields None; skip it
                    url_list.add(href)
                if len(url_list) >= number:
                    break

            # Count consecutive pages that contributed nothing new.
            if len(url_list) == count_before:
                stale_pages += 1
            else:
                stale_pages = 0
            page += 1
    finally:
        driver.quit()  # always close the browser, even if a page load fails

    return url_list


if __name__ == '__main__':
    query = '2024巴黎奥运会'
    number = 300
    url_list = get_urls(query=query, number=number)
    for url in url_list:
        print(url)