import time
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
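
# Scrape Qidian's VIP collection ranking (VIP收藏榜): open the ranking page,
# parse the 20 entries on each page (rank, title, genre, author, detail URL,
# collection count), append the records to qidian.txt, and jump from page to
# page via the page-number input until page 5 has been parsed.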


class VipCollection:

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.start_url = 'https://www.qidian.com/rank/vipcollect/'

    # Open the VIP collection ranking page.
    def start(self):
        self.driver.get(self.start_url)
        self.driver.maximize_window()

    def page_request(self, j):
        # j is the number of the page that was just requested, so the page
        # currently shown is page - 1 and `page` is the next page to open.
        page = 1 + j
        f = open("qidian.txt", "a", encoding="utf-8")
        try:
            booklist = []
            wait = WebDriverWait(self.driver, 10)  # wait up to 10 s for the page to load
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rank-body')))  # ranking data has been loaded
            lis = self.driver.find_elements(By.XPATH, "//*[@class='book-img-text']/ul/li")  # the <li> items of the ranking list
            # Parse each list item: rank, title, genre, author, detail URL and collection count.
            for li in lis:
                data_rid = li.get_attribute('data-rid')  # per-page rank stored in the li's data-rid attribute (1..20)
                # The ranking restarts at 1 on every page; the page currently shown is
                # page - 1, so the overall rank is (page - 2) * 20 + data_rid.
                rank = (page - 2) * 20 + int(data_rid)
                name = li.find_element(By.XPATH, "./div[@class='book-mid-info']/h2/a").text  # novel title
                book_type = li.find_element(By.XPATH, "./div[@class='book-mid-info']/p[@class='author']/a[2]").text  # genre
                author = li.find_element(By.XPATH, "./div[@class='book-mid-info']/p[@class='author']/a[@class='name']").text  # author
                href = li.find_element(By.XPATH, "./div[@class='book-mid-info']/h2/a").get_attribute('href')
                detail_url = urljoin(self.start_url, href)  # join against the start URL to get the full detail-page URL
                collection = li.find_element(By.XPATH, './div[3]/div/p/span/span').text  # collection count
                book = {'rank': rank, 'name': name, 'type': book_type, 'author': author,
                        'detail_url': detail_url, 'collection': collection}
                print(book)
                booklist.append(book)
            # Append this page's records to the file, one dict per line (reversed within the page).
            for k in booklist[::-1]:
                f.write(str(k) + '\n')
            # Stop after page 5: `page` is the next page number, so 6 means pages 1-5
            # have already been parsed. The browser is closed by the caller.
            if page == 6:
                return
            # Locate the pagination controls: type the next page number into the page
            # input and click the jump button, then parse the new page.
            try:
                page_input = self.driver.find_element(By.ID, 'PAGINATION-INPUT')
                page_input.clear()
                page_input.send_keys(str(page))
                jump_button = self.driver.find_element(By.ID, 'PAGINATION-BUTTON')
                jump_button.click()
                self.page_request(page)  # recurse to parse the newly loaded page
            # If the pagination input cannot be found, this is the last page: stop.
            except NoSuchElementException:
                print('Already on the last page, finishing......')
        except Exception as e:
            print(e)
        finally:
            f.close()
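
    # A hedged sketch of an optional helper (not part of the original script): after
    # clicking the jump button, page_request recurses immediately, so the presence
    # check on 'rank-body' can succeed while the previous page is still in the DOM.
    # Waiting for an element of the old page to go stale before re-parsing avoids
    # that; `old_element` is assumed to be, e.g., the first <li> of the previous list.
    def wait_for_reload(self, old_element, timeout=10):
        WebDriverWait(self.driver, timeout).until(EC.staleness_of(old_element))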

    # Wait 5 seconds, then shut the browser down.
    def close(self):
        time.sleep(5)
        self.driver.quit()  # quit() ends the whole WebDriver session, not just the window
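

# A minimal sketch (not used by the scraper above) for reading the records back,
# assuming qidian.txt contains one str(dict) per line as written by page_request;
# `load_results` is a hypothetical helper name.
import ast

def load_results(path='qidian.txt'):
    books = []
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if line:
                books.append(ast.literal_eval(line))  # each line is the repr of one dict
    return books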


if __name__ == '__main__':
    top = VipCollection()
    top.start()
    top.page_request(j=1)  # parse pages 1-5, following the pagination recursively
    top.close()