You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.7 KiB
58 lines
1.7 KiB
1 year ago
|
import time
|
||
|
|
||
|
from selenium import webdriver
|
||
|
from lxml import etree
|
||
|
|
||
|
|
||
|
|
||
|
class Spider:
    """Selenium-backed scraper that extracts links and headings via XPath.

    Every public scraping method loads a URL in the shared Chrome session
    (``self.bro``), parses the rendered page source with ``lxml`` and returns
    the raw result list(s) of fixed, absolute XPath queries.

    The instance owns a browser process. Call :meth:`close` when finished, or
    use the instance as a context manager (``with Spider() as spider: ...``).
    """

    # Absolute XPaths are tied to the exact layout of the target pages; they
    # are kept together here so a layout change is fixed in one place.
    _BIG_SORT_XPATH = '/html/body/div[5]/div/div[1]/div/a/@href'
    _SMALL_SORT_MORE_XPATH = (
        "/html/body/div[5]/div[2]/div[3]/div/dl/dd/ul/li[@class='more']/a/@href"
    )
    _DETAIL_HREF_XPATH = '/html/body/div[5]/div[2]/div[4]/div/dl/dd/ul/li/a/@href'
    _SORT_TITLE_XPATH = '/html/body/div[5]/div[2]/div[1]/div[1]/b/h1/text()'

    def __init__(self):
        """Start the Chrome WebDriver session shared by all scraping methods."""
        self.bro = webdriver.Chrome()

    def close(self):
        """Quit the browser session started in ``__init__``."""
        self.bro.quit()

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the browser on leaving the ``with`` block.

        Exceptions raised inside the block are not suppressed.
        """
        self.close()

    def _load_tree(self, url, settle_seconds=0):
        """Navigate to *url* and return the parsed HTML tree of the page.

        Args:
            url: Address to open in the browser.
            settle_seconds: Optional pause, in seconds, applied after the
                navigation and *before* the page source is read, so that the
                snapshot reflects the page after the wait.
        """
        self.bro.get(url)
        if settle_seconds:
            time.sleep(settle_seconds)
        return etree.HTML(self.bro.page_source)

    @staticmethod
    def _select(tree, xpath):
        """Evaluate *xpath* against *tree* and return the list of matches.

        A missing tree is treated defensively as "no matches" so callers
        always receive a list.
        """
        if tree is None:
            return []
        return tree.xpath(xpath)

    def _detail_and_sort(self, url):
        """Return ``(detail_hrefs, sort_titles)`` scraped from *url*.

        Shared by the two ``*medicinal_detail_href`` methods, which run the
        same pair of queries. The page is parsed once and both XPaths are
        evaluated against that single tree.
        """
        tree = self._load_tree(url)
        detail = self._select(tree, self._DETAIL_HREF_XPATH)
        sort = self._select(tree, self._SORT_TITLE_XPATH)
        return detail, sort

    def small_sort_spider(self, url):
        """Return the ``href`` values of the ``li.more`` links found at *url*.

        The one-second pause is taken before the page source is captured.
        Reading the source first would make the delay useless for pages that
        finish rendering shortly after navigation.

        Returns:
            list: Matched ``href`` attribute values (empty if none match).
        """
        tree = self._load_tree(url, settle_seconds=1)
        return self._select(tree, self._SMALL_SORT_MORE_XPATH)

    def big_sort_spider(self, url):
        """Return the ``href`` values matched by the top-level XPath at *url*.

        Returns:
            list: Matched ``href`` attribute values (empty if none match).
        """
        tree = self._load_tree(url)
        return self._select(tree, self._BIG_SORT_XPATH)

    def more_medicinal_detail_href(self, url):
        """Return ``(detail, sort)`` for the page at *url*.

        Returns:
            tuple: ``detail`` is the list of item-link ``href`` values and
            ``sort`` is the list of text nodes of the page's ``h1`` heading.
        """
        return self._detail_and_sort(url)

    def medicinal_detail_href(self, url):
        """Return ``(detail, sort)`` for the page at *url*.

        Runs the same queries as :meth:`more_medicinal_detail_href`; both
        names are kept so existing callers keep working.

        Returns:
            tuple: ``detail`` is the list of item-link ``href`` values and
            ``sort`` is the list of text nodes of the page's ``h1`` heading.
        """
        return self._detail_and_sort(url)
|