import time

from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

from Common import base_url


class ClassSearch:
    def __init__(self):
        self.visited_links = set()  # links already collected, to avoid duplicates
        self.data_queue = []        # queue of detail-page links still to be crawled
        self.baseurl = base_url()

    def search(self, url, driver, WAIT):
        """
        :param url: the URL to crawl
        :param driver: the WebDriver instance
        :param WAIT: a WebDriverWait object (timeout handler)
        :return: the total number of result pages
        """
        try:
            driver.get(url)
            inputs = WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="searchform"]/div[1]/p[1]/input')))
            submit = WAIT.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="searchform"]/div[2]/input')))
            inputs.send_keys(input("Movie year to search for: "))
            time.sleep(3)
            submit.click()
            # The results open in a new tab; switch to it before scraping.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[1])
            self.get_source(driver, WAIT)
            total = WAIT.until(EC.presence_of_element_located((By.XPATH, '//td[1]/b[2]')))
            return int(total.text)
        except TimeoutException:
            driver.refresh()
            print("Restarting the request")
            all_h = driver.window_handles
            driver.switch_to.window(all_h[0])
            return self.search(driver.current_url, driver, WAIT)

    def next_page(self, page_num, driver, WAIT, retries=3):
        """
        :param page_num: the page number to crawl
        :param driver: the WebDriver instance
        :param WAIT: a WebDriverWait object (timeout handler)
        :param retries: remaining retries after a timeout
        :return: None
        """
        try:
            box = WAIT.until(EC.presence_of_element_located((By.NAME, 'PageNo')))
            submit = WAIT.until(EC.presence_of_element_located((By.NAME, 'plistgo')))
            # Select the current page number and type the new one over it.
            box.send_keys(Keys.CONTROL, 'a')
            box.send_keys(str(page_num))
            submit.click()
            all_h = driver.window_handles
            driver.switch_to.window(all_h[-1])
            self.get_source(driver, WAIT)
        except TimeoutException:
            # The original local counter was reset on every recursive call, so
            # the retry limit never triggered; track retries as a parameter.
            driver.refresh()
            if retries <= 0:
                return None
            return self.next_page(page_num, driver, WAIT, retries - 1)

    def get_source(self, driver, WAIT):
        """
        :param driver: the WebDriver instance
        :param WAIT: a WebDriverWait object (timeout handler)
        :return: True if the page was scraped successfully
        """
        try:
            WAIT.until(EC.presence_of_element_located((By.XPATH, '//div[4]/div[2]/ul')))
            print(driver.current_url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            # The last two <tbody> blocks are pagination/footer rows, not results.
            lists = soup.find(class_='co_content8').find_all('tbody')
            for item in lists[:-2]:
                item_link = item.find('a').get('href')
                link = self.baseurl + item_link
                if link not in self.visited_links:  # only queue links not seen before
                    self.visited_links.add(link)
                    self.data_queue.append(link)
            return True
        except TimeoutException:
            driver.refresh()
            # Propagate the result of the retry instead of dropping it.
            return self.get_source(driver, WAIT)
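

# A minimal usage sketch, not part of the original module: it assumes a local
# Chrome installation and that the search form is reachable at base_url();
# adjust both to the actual site before running.
if __name__ == '__main__':
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait

    driver = webdriver.Chrome()
    WAIT = WebDriverWait(driver, 10)
    crawler = ClassSearch()
    try:
        # search() prompts for a year on stdin and returns the page count.
        pages = crawler.search(crawler.baseurl, driver, WAIT)
        for page in range(2, pages + 1):
            crawler.next_page(page, driver, WAIT)
        print(f"Collected {len(crawler.data_queue)} links")
    finally:
        driver.quit()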