parent bf8a5ce931
commit a410891db4
@@ -0,0 +1,93 @@
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from Common import base_url
from bs4 import BeautifulSoup


class ClassSearch:

    def __init__(self):
        self.visited_links = set()  # detail-page links that have already been collected
        self.data_queue = []        # links queued for later scraping
        self.baseurl = base_url()

    def search(self, url, driver, WAIT):
        """
        :param url: the URL to crawl
        :param driver: the webdriver instance
        :param WAIT: the WebDriverWait (timeout) helper
        :return: the total number of result pages
        """
        try:
            driver.get(url)

            inputs = WAIT.until(EC.presence_of_element_located((By.XPATH, '//*[@id="searchform"]/div[1]/p[1]/input')))
            submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[2]/input')))
            inputs.send_keys(input("Movie year to search for: "))
            time.sleep(3)
            submit.click()

            # The search results open in a new window; switch to it before scraping.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[1])
            self.get_source(driver, WAIT)

            total = WAIT.until(EC.presence_of_element_located((By.XPATH, '//td[1]/b[2]')))
            return int(total.text)

        except TimeoutException:
            driver.refresh()
            print("Retrying the request")
            # Fall back to the first window and retry the search from the current URL.
            all_h = driver.window_handles
            driver.switch_to.window(all_h[0])
            return self.search(driver.current_url, driver, WAIT)

    def next_page(self, page_num, driver, WAIT, n=1):
        """
        :param page_num: the page number to crawl
        :param driver: the webdriver instance
        :param WAIT: the WebDriverWait (timeout) helper
        :param n: current retry attempt; gives up after three timed-out attempts
        :return: None
        """
        try:
            box = WAIT.until(EC.presence_of_element_located((By.NAME, 'PageNo')))
            submit = WAIT.until(EC.presence_of_element_located((By.NAME, 'plistgo')))
            box.send_keys(Keys.CONTROL, 'a')  # select the existing page number before overwriting it
            box.send_keys(page_num)
            submit.click()

            all_h = driver.window_handles
            driver.switch_to.window(all_h[-1])

            self.get_source(driver, WAIT)
        except TimeoutException:
            driver.refresh()
            # Pass the retry count through the recursive call so the limit of three is enforced.
            if n + 1 > 3:
                return None
            return self.next_page(page_num, driver, WAIT, n + 1)

    def get_source(self, driver, WAIT):
        """
        :param driver: the webdriver instance
        :param WAIT: the WebDriverWait (timeout) helper
        :return: whether the page was scraped successfully
        """
        try:
            WAIT.until(EC.presence_of_element_located((By.XPATH, '//div[4]/div[2]/ul')))
            print(driver.current_url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')

            lists = soup.find(class_='co_content8').find_all('tbody')
            for item in lists[:-2]:
                item_link = item.find('a').get('href')
                link = self.baseurl + item_link
                if link not in self.visited_links:  # only queue links that have not been visited yet
                    self.visited_links.add(link)
                    self.data_queue.append(link)
            return True
        except TimeoutException:
            driver.refresh()
            return self.get_source(driver, WAIT)