From cf12ce96b8638ba5093416a05a18f91fe238bdd7 Mon Sep 17 00:00:00 2001 From: pg3fbpv9r Date: Sat, 14 Sep 2024 21:19:02 +0800 Subject: [PATCH] Delete 'get_urls.py' --- get_urls.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 get_urls.py diff --git a/get_urls.py b/get_urls.py deleted file mode 100644 index d152f6c..0000000 --- a/get_urls.py +++ /dev/null @@ -1,39 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By - -def get_urls(query, number): - chrome_driver_path = 'D:\chromedriver-win64\chromedriver.exe' - service = Service(chrome_driver_path) - options = Options() - options.add_argument('--headless') #后台运行,不显示浏览器窗口 - - driver = webdriver.Chrome(service=service, options=options) # 初始化 WebDriver - url_list = set() - page = 1 - while (len(url_list) < number): - search_url = f'https://search.bilibili.com/video?keyword={query}&page={page}' - driver.get(search_url) # 打开网页 - - # 查找符合选择器的所有 标签 - elements = driver.find_elements(By.CSS_SELECTOR, ".video-list.row div.bili-video-card > div > a") - - # 将每个 标签的 href 属性(即网址)加入list - for element in elements: - url_list.add(element.get_attribute('href')) - if (len(url_list) >= number): break - # print(f"page = {page}, cnt = {len(url_list)}") - page = page + 1 - - driver.quit() # 关闭浏览器 - return url_list - -if __name__ == '__main__': - - query = '2024巴黎奥运会' - number = 300 - url_list = get_urls(query=query, number=number) - for url in url_list : - print(url) - \ No newline at end of file