# Provenance: ClassSearch.py as added by patch
# a410891db4f7838a72c931e75a6e51b7a558b5f3 ("ADD file via upload",
# Sat, 27 Apr 2024 22:45:05 +0800).
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from Common import base_url
from bs4 import BeautifulSoup


class ClassSearch:
    """Drive a Selenium browser through a search form and collect result links.

    Links harvested from each result page are de-duplicated and appended to
    ``data_queue`` in the order they are first seen.
    """

    def __init__(self):
        # Every absolute link that has already been queued (for de-duplication).
        self.visited_links = set()
        # Ordered list of links still to be processed by the caller.
        self.data_queue = []
        # Prefix prepended to each scraped href (value comes from Common.base_url).
        self.baseurl = base_url()

    def search(self, url, driver, WAIT, max_attempts=3):
        """Submit the search form at *url* and harvest the first result page.

        The user is prompted once on stdin for the search term (a movie year);
        the same answer is reused if the attempt has to be repeated.

        :param url: URL of the page that contains the search form
        :param driver: Selenium WebDriver instance
        :param WAIT: WebDriverWait bound to *driver* (supplies the timeout)
        :param max_attempts: how many times to try before giving up
        :return: total number of result pages, parsed from the page
        :raises TimeoutException: if every attempt timed out
        """
        attempts = max(1, max_attempts)
        year = None
        for attempt in range(attempts):
            try:
                driver.get(url)

                inputs = WAIT.until(EC.presence_of_element_located((By.XPATH, '//*[@id="searchform"]/div[1]/p[1]/input')))
                submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[2]/input')))
                # Ask only once; a retry must not re-prompt the user.
                if year is None:
                    year = input("想要搜索的电影年份:")
                inputs.send_keys(year)
                time.sleep(3)
                submit.click()

                # The results are expected in the most recently opened window.
                # Using the last handle (as next_page does) also works when the
                # submit did not open a new window.
                driver.switch_to.window(driver.window_handles[-1])
                self.get_source(driver, WAIT)

                total = WAIT.until(EC.presence_of_element_located((By.XPATH, '//td[1]/b[2]')))
                return int(total.text)

            except TimeoutException:
                if attempt == attempts - 1:
                    # Out of attempts: surface the failure instead of looping
                    # forever (the original recursed without a bound).
                    raise
                driver.refresh()
                print("重新开始访问")
                # Go back to the first window and restart from its current URL.
                driver.switch_to.window(driver.window_handles[0])
                url = driver.current_url

    def next_page(self, page_num, driver, WAIT, max_attempts=3):
        """Jump to result page *page_num* and harvest its links.

        :param page_num: page number to type into the pager box
        :param driver: Selenium WebDriver instance
        :param WAIT: WebDriverWait bound to *driver* (supplies the timeout)
        :param max_attempts: how many times to try before giving up
        :return: None (also when every attempt timed out)
        """
        # An explicit loop replaces the original recursion, whose retry counter
        # was re-initialised on every call and therefore never reached its limit.
        for _ in range(max(1, max_attempts)):
            try:
                box = WAIT.until(EC.presence_of_element_located((By.NAME, 'PageNo')))
                submit = WAIT.until(EC.presence_of_element_located((By.NAME, 'plistgo')))
                # Select the existing content so the new page number replaces it.
                box.send_keys(Keys.CONTROL, 'a')
                box.send_keys(page_num)
                submit.click()

                driver.switch_to.window(driver.window_handles[-1])

                self.get_source(driver, WAIT)
                return None
            except TimeoutException:
                driver.refresh()
        return None

    def get_source(self, driver, WAIT, max_attempts=3):
        """Wait for the result list on the current page and queue its links.

        :param driver: Selenium WebDriver instance
        :param WAIT: WebDriverWait bound to *driver* (supplies the timeout)
        :param max_attempts: how many times to try before giving up
        :return: True if the result list was found and processed, else False
        """
        for _ in range(max(1, max_attempts)):
            try:
                WAIT.until(EC.presence_of_element_located((By.XPATH, '//div[4]/div[2]/ul')))
                print(driver.current_url)
                return self._collect_links(driver.page_source)
            except TimeoutException:
                # Reload and try again; the result of the retry is what gets
                # returned (the original dropped it and returned None).
                driver.refresh()
        return False

    def _collect_links(self, html):
        """Parse *html* and append every not-yet-seen result link to the queue.

        :param html: page source of a result page
        :return: True if the result container was present, else False
        """
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find(class_='co_content8')
        if container is None:
            return False

        # The last two <tbody> elements are skipped, as in the original;
        # presumably they are pager/footer tables -- confirm against the markup.
        for item in container.find_all('tbody')[:-2]:
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            link = self.baseurl + href
            # Only queue links that have not been seen before.
            if link not in self.visited_links:
                self.visited_links.add(link)
                self.data_queue.append(link)
        return True