You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
3.3 KiB
93 lines
3.3 KiB
from selenium.common.exceptions import TimeoutException
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.common.keys import Keys
|
|
import time
|
|
from Common import base_url
|
|
from bs4 import BeautifulSoup
|
|
|
|
class ClassSearch:
    """Crawl a movie-listing site with Selenium: run a search, page through
    the results, and collect unique detail-page links into a queue.

    Attributes:
        visited_links: set of detail-page URLs already queued (dedup guard).
        data_queue: FIFO list of detail-page URLs awaiting scraping.
        baseurl: site root from ``base_url()``, prepended to relative hrefs.
    """

    # Maximum times a page is retried after a TimeoutException before giving up.
    # NOTE(review): the original code retried without any effective bound and
    # could recurse forever on a persistently slow page.
    MAX_RETRIES = 3

    def __init__(self):
        self.visited_links = set()
        self.data_queue = []
        self.baseurl = base_url()

    def search(self, url, driver, WAIT, _retries=0):
        """Open ``url``, submit a search query typed by the user, and return
        the total number of result pages.

        :param url: the URL to crawl
        :param driver: the Selenium WebDriver instance
        :param WAIT: a WebDriverWait object (controls the timeout)
        :param _retries: internal retry counter — do not pass explicitly
        :return: total page count as int, or None after MAX_RETRIES timeouts
        """
        try:
            driver.get(url)

            inputs = WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="searchform"]/div[1]/p[1]/input')))
            submit = WAIT.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="searchform"]/div[2]/input')))
            # Interactive prompt: the user types the movie year to search for.
            inputs.send_keys(input("想要搜索的电影年份:"))
            time.sleep(3)
            submit.click()

            # The site opens the result list in a new window/tab; switch to it.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[1])
            self.get_source(driver, WAIT)

            # Element holding the total page count on the result page.
            total = WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//td[1]/b[2]')))
            return int(total.text)

        except TimeoutException:
            # BUG FIX: the original recursed unconditionally here, so a page
            # that kept timing out caused unbounded recursion.  Bound it.
            if _retries >= self.MAX_RETRIES:
                return None
            driver.refresh()
            print("重新开始访问")
            all_h = driver.window_handles
            driver.switch_to.window(all_h[0])
            return self.search(driver.current_url, driver, WAIT, _retries + 1)

    def next_page(self, page_num, driver, WAIT, _retries=0):
        """Jump to result page ``page_num`` via the page-number input box and
        harvest its links.

        :param page_num: the result page number to load
        :param driver: the Selenium WebDriver instance
        :param WAIT: a WebDriverWait object (controls the timeout)
        :param _retries: internal retry counter — do not pass explicitly
        :return: None
        """
        try:
            box = WAIT.until(EC.presence_of_element_located((By.NAME, 'PageNo')))
            submit = WAIT.until(EC.presence_of_element_located((By.NAME, 'plistgo')))
            # Select-all first so the typed page number replaces the old value.
            box.send_keys(Keys.CONTROL, 'a')
            box.send_keys(page_num)
            submit.click()

            # Work in the most recently opened window.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[-1])

            self.get_source(driver, WAIT)
        except TimeoutException:
            # BUG FIX: the original reset ``n = 1`` at the top of every call
            # and then recursed, so its ``n > 3`` guard was unreachable and a
            # persistent timeout recursed forever.  Carry the counter through
            # the recursion instead.
            if _retries >= self.MAX_RETRIES:
                return None
            driver.refresh()
            return self.next_page(page_num, driver, WAIT, _retries + 1)

    def get_source(self, driver, WAIT, _retries=0):
        """Parse the current result page and queue every unseen detail link.

        :param driver: the Selenium WebDriver instance
        :param WAIT: a WebDriverWait object (controls the timeout)
        :param _retries: internal retry counter — do not pass explicitly
        :return: True if the page was harvested, False after repeated timeouts
        """
        try:
            # Wait until the result list container is present.
            WAIT.until(EC.presence_of_element_located(
                (By.XPATH, '//div[4]/div[2]/ul')))
            print(driver.current_url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            lists = soup.find(class_='co_content8').find_all('tbody')
            # Skip the last two <tbody> entries — presumably non-result rows
            # (pagination/footer); confirm against the page layout.
            for item in lists[:-2]:
                item_link = item.find('a').get('href')
                link = self.baseurl + item_link
                if link not in self.visited_links:  # queue each link only once
                    self.visited_links.add(link)
                    self.data_queue.append(link)
            return True
        except TimeoutException:
            # BUG FIX: bound the retries, and propagate the recursive result
            # (the original dropped it, yielding None on the retry path).
            if _retries >= self.MAX_RETRIES:
                return False
            driver.refresh()
            return self.get_source(driver, WAIT, _retries + 1)