"""Collect Bilibili video URLs for a search keyword using headless Chrome."""

from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Raw string so the backslashes of the Windows path are never read as escape
# sequences; the value is identical to the path the script used before.
DEFAULT_CHROME_DRIVER_PATH = r'D:\chromedriver-win64\chromedriver.exe'

# CSS selector matching the anchor of every video card on a search page.
RESULT_LINK_SELECTOR = ".video-list.row div.bili-video-card > div > a"

# Seconds the driver may keep polling for result links before giving up.
IMPLICIT_WAIT_SECONDS = 10

# Consecutive pages without a single new URL after which the search stops.
MAX_STALE_PAGES = 3


def _build_search_url(query, page):
    """Return the search URL for *query* at *page* with the keyword encoded."""
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    return (
        'https://search.bilibili.com/video'
        f'?keyword={quote(str(query), safe="")}&page={page}'
    )


def get_urls(query, number, chrome_driver_path=DEFAULT_CHROME_DRIVER_PATH):
    """Collect up to *number* distinct video URLs for a Bilibili search.

    Result pages are visited in order, starting at page 1, until enough
    URLs are collected or ``MAX_STALE_PAGES`` consecutive pages contribute
    nothing new (for example because the results are exhausted).

    Args:
        query: Search keyword.
        number: Maximum number of distinct URLs to collect.
        chrome_driver_path: Path to the chromedriver executable.

    Returns:
        A set with at most *number* URL strings; it may hold fewer when the
        search does not yield enough results.
    """
    url_list = set()
    if number <= 0:
        # Nothing requested: do not launch a browser at all.
        return url_list

    options = Options()
    options.add_argument('--headless')  # run without showing a browser window

    driver = webdriver.Chrome(
        service=Service(chrome_driver_path), options=options
    )
    try:
        # Let element lookups wait for the result cards to be rendered
        # instead of returning an empty list straight after navigation.
        driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

        page = 1
        stale_pages = 0
        while len(url_list) < number and stale_pages < MAX_STALE_PAGES:
            driver.get(_build_search_url(query, page))
            elements = driver.find_elements(
                By.CSS_SELECTOR, RESULT_LINK_SELECTOR
            )

            count_before = len(url_list)
            for element in elements:
                href = element.get_attribute('href')
                if href:  # anchors without an href yield None; skip them
                    url_list.add(href)
                if len(url_list) >= number:
                    break

            # Track pages that add nothing so an exhausted search cannot
            # keep the loop running forever.
            if len(url_list) == count_before:
                stale_pages += 1
            else:
                stale_pages = 0
            page += 1
    finally:
        # Always shut the browser down, even when navigation fails.
        driver.quit()

    return url_list


def main():
    """Collect URLs for the configured query and append them to urls.txt."""
    query = '2024巴黎奥运会'
    number = 300
    url_list = get_urls(query=query, number=number)

    # Open the output file once instead of once per URL; append mode keeps
    # the contents written by earlier runs.
    with open('./urls.txt', mode='a', encoding='utf-8') as f:
        for cnt, url in enumerate(url_list, start=1):
            f.write(url + "\n")
            print(f"url : {cnt}/{number}")


if __name__ == '__main__':
    main()