
import time
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
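
# Scrape Qidian's VIP collection ranking with Selenium: for each listed novel,
# record its overall rank, title, category, author, detail-page URL and
# collection count, appending one record per line to qidain.txt.
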
class VipCollection:
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.start_url = 'https://www.qidian.com/rank/vipcollect/'

    # Open the VIP collection ranking page
    def start(self):
        self.driver.get(self.start_url)
        self.driver.maximize_window()
    def page_request(self, j):
        page = 1 + j
        f = open("qidain.txt", "a", encoding="utf-8")
        try:
            booklist = []
            wait = WebDriverWait(self.driver, 10)  # wait up to 10 s for the page to load
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rank-body')))  # ranking section is present
            lis = self.driver.find_elements(By.XPATH, "//*[@class='book-img-text']/ul/li")  # one <li> per ranked novel
            # Parse each item for rank, title, category, author, detail URL and collection count
            for li in lis:
                data_rid = li.get_attribute('data-rid')  # the li's data-rid attribute is the within-page rank 1..20
                rank = (page - 2) * 20 + int(data_rid)  # convert the within-page rank to the overall rank
                name = li.find_element(By.XPATH, "./div[@class='book-mid-info']/h2/a").text  # novel title
                book_type = li.find_element(By.XPATH, "./div[@class='book-mid-info']/p[@class='author']/a[2]").text  # category
                author = li.find_element(By.XPATH, "./div[@class='book-mid-info']/p[@class='author']/a[@class='name']").text  # author
                href = li.find_element(By.XPATH, "./div[@class='book-mid-info']/h2/a").get_attribute('href')
                detail_url = urljoin(self.start_url, href)  # build the full detail-page URL
                collection = li.find_element(By.XPATH, './div[3]/div/p/span/span').text  # collection count
                dis = {'rank': rank, 'name': name, 'type': book_type, 'author': author, 'detail_url': detail_url, 'collection': collection}
                print(dis)
                booklist.append(dis)
            for record in booklist[::-1]:
                f.write(str(record) + '\n')
            # Locate the pagination controls and keep paging until the last page
            try:
                page_input = self.driver.find_element(By.ID, 'PAGINATION-INPUT')
                page_input.clear()
                page_input.send_keys(str(page))
                jump_button = self.driver.find_element(By.XPATH, '//*[@id="PAGINATION-BUTTON"]')
                if page == 6:
                    return  # stop after five pages; the caller closes the browser
                jump_button.click()
                self.page_request(page)  # recurse to parse the next page
            # If the pagination element is missing, we are on the last page; stop
            except NoSuchElementException:
                print('Last page reached; closing in 5 seconds...')
        except Exception as e:
            print(e)
        finally:
            f.close()

    # Wait 5 seconds, then close the browser
    def close(self):
        time.sleep(5)
        self.driver.close()


if __name__ == '__main__':
    top = VipCollection()
    top.start()
    top.page_request(j=1)
    top.close()