You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

58 lines
1.7 KiB

import time
from selenium import webdriver
from lxml import etree
class Spider:
    """Selenium-driven scraper that extracts link lists from rendered pages.

    Every public scraping method loads ``url`` in the shared Chrome session,
    parses the rendered HTML with lxml and returns the raw XPath matches.
    The XPath expressions are absolute paths, so they depend on the exact
    layout of the pages being scraped.

    The instance owns a browser session: call :meth:`close` when finished, or
    use the instance as a context manager (``with Spider() as spider: ...``),
    so the browser is not left running.
    """

    # Seconds small_sort_spider waits after navigation before reading the DOM.
    SETTLE_SECONDS = 1

    _SMALL_SORT_MORE_XPATH = "/html/body/div[5]/div[2]/div[3]/div/dl/dd/ul/li[@class='more']/a/@href"
    _BIG_SORT_XPATH = '/html/body/div[5]/div/div[1]/div/a/@href'
    _DETAIL_XPATH = '/html/body/div[5]/div[2]/div[4]/div/dl/dd/ul/li/a/@href'
    _SORT_TITLE_XPATH = '/html/body/div[5]/div[2]/div[1]/div[1]/b/h1/text()'

    def __init__(self):
        """Start the Chrome session shared by all scraping methods."""
        self.bro = webdriver.Chrome()

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Quit the browser session held in ``self.bro``."""
        self.bro.quit()

    def _load(self, url, settle_seconds=0):
        """Navigate to ``url`` and return the page source.

        When ``settle_seconds`` is non-zero the method sleeps that long
        *before* reading ``page_source``, so the pause can influence what is
        captured.
        """
        self.bro.get(url)
        if settle_seconds:
            time.sleep(settle_seconds)
        return self.bro.page_source

    @staticmethod
    def _select(page_source, xpath):
        """Parse ``page_source`` as HTML and return the matches for ``xpath``."""
        tree = etree.HTML(page_source)
        if tree is None:
            # lxml can hand back None when the input holds no parseable
            # markup; report "no matches" rather than failing on None.xpath.
            return []
        return tree.xpath(xpath)

    def _detail_links_and_sort(self, url):
        """Return ``(detail_hrefs, sort_title_texts)`` scraped from ``url``."""
        page_source = self._load(url)
        detail = self._select(page_source, self._DETAIL_XPATH)
        sort = self._select(page_source, self._SORT_TITLE_XPATH)
        return detail, sort

    def small_sort_spider(self, url):
        """Return the href values of the ``li.more`` links found on ``url``.

        The original code slept for one second *after* it had already copied
        ``page_source``, so the pause could not change the parsed HTML.  The
        pause is kept but now happens before the snapshot (presumably it was
        meant to let the page finish rendering).
        """
        page_source = self._load(url, settle_seconds=self.SETTLE_SECONDS)
        return self._select(page_source, self._SMALL_SORT_MORE_XPATH)

    def big_sort_spider(self, url):
        """Return the href values matched by the big-sort XPath on ``url``."""
        return self._select(self._load(url), self._BIG_SORT_XPATH)

    def more_medicinal_detail_href(self, url):
        """Return ``(detail, sort)`` for ``url``.

        ``detail`` is the list of hrefs matched by the detail-list XPath and
        ``sort`` is the list of text nodes matched by the ``h1`` title XPath.
        """
        return self._detail_links_and_sort(url)

    def medicinal_detail_href(self, url):
        """Return ``(detail, sort)`` for ``url``.

        Uses the same XPath expressions as :meth:`more_medicinal_detail_href`;
        both names are kept so existing callers continue to work.
        """
        return self._detail_links_and_sort(url)