from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

from bs4 import BeautifulSoup

from Common import base_url


class ClassSearch:
    """Drive a Selenium session against a movie-listing site: run a search,
    page through the result list, and collect unique detail-page links.

    Attributes:
        visited_links: set of links already queued, used for de-duplication.
        data_queue: ordered list of unique detail-page URLs collected so far.
        baseurl: site root returned by ``Common.base_url()``, prepended to
            the relative hrefs scraped from the result table.
    """

    def __init__(self):
        self.visited_links = set()
        self.data_queue = []
        self.baseurl = base_url()

    def search(self, url, driver, WAIT):
        """Open ``url``, submit a search (year read from stdin), and return
        the total number of result pages.

        :param url: the URL to crawl (the site's search page)
        :param driver: Selenium WebDriver instance
        :param WAIT: a WebDriverWait used as the timeout policy
        :return: total page count parsed from the results page
        """
        try:
            driver.get(url)
            inputs = WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="searchform"]/div[1]/p[1]/input')))
            submit = WAIT.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="searchform"]/div[2]/input')))
            inputs.send_keys(input("想要搜索的电影年份:"))
            time.sleep(3)  # let the page settle before submitting
            submit.click()
            # The search opens the results in a new window/tab; switch to it.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[1])
            self.get_source(driver, WAIT)
            # Total page count lives in the second <b> of the first <td>.
            total = WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//td[1]/b[2]')))
            return int(total.text)
        except TimeoutException:
            # On timeout, reload, fall back to the original window and retry.
            driver.refresh()
            print("重新开始访问")
            all_h = driver.window_handles
            driver.switch_to.window(all_h[0])
            return self.search(driver.current_url, driver, WAIT)

    def next_page(self, page_num, driver, WAIT, _attempt=1):
        """Jump to result page ``page_num`` via the page-number box and
        scrape it.

        :param page_num: result page number to navigate to
        :param driver: Selenium WebDriver instance
        :param WAIT: a WebDriverWait used as the timeout policy
        :param _attempt: internal retry counter; callers should not pass it.
            (BUGFIX: the original used a local ``n = 1`` that was reset on
            every recursive call, so the 3-retry cap never triggered.)
        :return: None
        """
        try:
            box = WAIT.until(EC.presence_of_element_located((By.NAME, 'PageNo')))
            submit = WAIT.until(EC.presence_of_element_located((By.NAME, 'plistgo')))
            box.send_keys(Keys.CONTROL, 'a')  # select existing text so it is replaced
            box.send_keys(str(page_num))      # send_keys requires string input
            submit.click()
            # Navigation may open a new window; always scrape the newest one.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[-1])
            self.get_source(driver, WAIT)
        except TimeoutException:
            driver.refresh()
            if _attempt >= 3:  # give up after 3 failed attempts
                return None
            return self.next_page(page_num, driver, WAIT, _attempt + 1)

    def get_source(self, driver, WAIT):
        """Scrape the current results page and queue every unseen detail link.

        :param driver: Selenium WebDriver instance
        :param WAIT: a WebDriverWait used as the timeout policy
        :return: True if the page was scraped successfully
        """
        try:
            WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//div[4]/div[2]/ul')))
            print(driver.current_url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            # Last two <tbody> rows are pagination/footer rows, not results.
            lists = soup.find(class_='co_content8').find_all('tbody')
            for item in lists[:-2]:
                item_link = item.find('a').get('href')
                link = self.baseurl + item_link
                if link not in self.visited_links:
                    # Unseen link: remember it and enqueue it for crawling.
                    self.visited_links.add(link)
                    self.data_queue.append(link)
            return True
        except TimeoutException:
            driver.refresh()
            # BUGFIX: propagate the retry's result; the original dropped it
            # and returned None even when the retry succeeded.
            return self.get_source(driver, WAIT)