"""Scrape the Qidian VIP-collection ranking with headless Chrome.

Each ranking page is parsed into one dict per book (rank, name, category,
author, detail URL, collection count); the dicts are printed and appended to
a text file, one ``str(dict)`` per line.
"""
import time
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Options object used to start Chrome without a visible window.
chrome_options = Options()
# These two arguments are the standard pair for headless operation.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')


class VipCollection:
    """Crawler for the VIP-collection ranking list on qidian.com."""

    # Number of books per ranking page; ``data-rid`` restarts at 1 on every
    # page, so the absolute rank has to be offset by this amount per page.
    PAGE_SIZE = 20
    # Last ranking page to scrape before the browser is shut down.
    LAST_PAGE = 5
    # Output file, opened in append mode (name kept exactly as before).
    OUTPUT_PATH = "qidain.txt"

    def __init__(self):
        self.driver = webdriver.Chrome(options=chrome_options)
        self.start_url = 'https://www.qidian.com/rank/vipcollect/'

    def start(self):
        """Open the VIP-collection ranking page."""
        self.driver.get(self.start_url)
        self.driver.maximize_window()

    def page_request(self, j: int, last_page: int = LAST_PAGE) -> None:
        """Scrape the loaded page as page ``j`` and continue to ``last_page``.

        For every page the parsed books are printed and appended to
        ``OUTPUT_PATH``; then the next page number is typed into the
        pagination box and the pagination button is clicked.  After
        ``last_page`` has been stored the browser is closed via ``close()``.

        Errors do not propagate: a missing pagination element prints
        '翻页错误', any other failure prints '退出爬虫' followed by the cause.

        Args:
            j: 1-based number of the ranking page currently loaded.
            last_page: Final page to scrape (inclusive).
        """
        current = j
        try:
            while True:
                books = self._scrape_current_page(current)
                self._append_to_file(books)

                if current >= last_page:
                    # Finished normally: shut the browser down explicitly
                    # instead of provoking an error on a dead session.
                    self.close()
                    print('退出爬虫')
                    return

                next_page = current + 1
                try:
                    page_input = self.driver.find_element(By.ID, 'PAGINATION-INPUT')
                    page_input.clear()
                    page_input.send_keys(str(next_page))
                    jump_button = self.driver.find_element(
                        By.XPATH, '//*[@id="PAGINATION-BUTTON"]')
                    jump_button.click()
                except NoSuchElementException:
                    # Pagination controls are absent: stop crawling.
                    print('翻页错误')
                    return
                # NOTE(review): nothing here waits for the old list to be
                # replaced before the next scrape; confirm whether the site
                # needs an explicit wait after the click.
                current = next_page
        except Exception as exc:
            # Best-effort crawler: report the cause rather than crash.
            print('退出爬虫', repr(exc))

    def _scrape_current_page(self, page_number: int) -> list:
        """Wait for the ranking list, then parse and print every book row."""
        wait = WebDriverWait(self.driver, 10)  # give the page up to 10 s
        # Block until the ranking container is present in the DOM.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rank-body')))
        rows = self.driver.find_elements(
            By.XPATH, '''//*[@class='book-img-text']/ul/li''')

        books = []
        for row in rows:
            book = self._parse_row(row, page_number)
            print(book)
            books.append(book)
        return books

    def _parse_row(self, li, page_number: int) -> dict:
        """Extract rank, name, category, author, URL and collection count."""
        # ``data-rid`` is the position within the page (1, 2, 3, ...).
        data_rid = li.get_attribute('data-rid')
        rank = (page_number - 1) * self.PAGE_SIZE + int(data_rid)

        # The title link supplies both the book name and its detail URL.
        title_link = li.find_element(By.XPATH, "./div[@class='book-mid-info']/h2/a")
        name = title_link.text
        type_1 = li.find_element(
            By.XPATH, "./div[@class='book-mid-info']/p[@class='author']/a[2]").text
        author = li.find_element(
            By.XPATH,
            "./div[@class='book-mid-info']/p[@class='author']/a[@class='name']").text
        # Resolve the link against the start URL to get a complete address.
        detail_url = urljoin(self.start_url, title_link.get_attribute('href'))
        collection = li.find_element(By.XPATH, './div[3]/div/p/span/span').text

        return {'rank': rank, 'name': name, 'type_1': type_1, "author": author,
                'detail_url': detail_url, 'collection': collection}

    def _append_to_file(self, books: list) -> None:
        """Append the books to the output file, last row first."""
        # The context manager guarantees the handle is flushed and closed.
        with open(self.OUTPUT_PATH, "a", encoding="utf-8") as f:
            f.writelines(f"{book}\n" for book in reversed(books))

    def close(self):
        """Wait five seconds, then end the browser session."""
        time.sleep(5)
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        self.driver.quit()


if __name__ == '__main__':
    top = VipCollection()
    top.start()
    top.page_request(j=1)