diff --git a/chromedriver.exe b/chromedriver.exe
new file mode 100644
index 0000000..6fabd60
Binary files /dev/null and b/chromedriver.exe differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a05114c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,39 @@
+"""Crawl http://www.zhongyoo.com/name/ and print the links found on its pages."""
+# NOTE(review): not referenced below; kept in case the import has side effects.
+import bigSortSpider
+import spider
+
+
+class Main:
+    """Walk the site with a ``spider.Spider`` and print what it extracts."""
+
+    def __init__(self):
+        self.spider = spider.Spider()
+
+    def main(self):
+        """Print the heading text and detail links of every page visited.
+
+        Each href returned by ``big_sort_spider`` is followed through its
+        "more" links when it has any; otherwise the href itself is scraped.
+        """
+        url = 'http://www.zhongyoo.com/name/'
+        for href in self.spider.big_sort_spider(url):
+            more_hrefs = self.spider.small_sort_spider(href)
+            if more_hrefs:
+                for more_href in more_hrefs:
+                    detail, sort = self.spider.more_medicinal_detail_href(more_href)
+                    print(sort)
+                    print(detail)
+            else:
+                detail, sort = self.spider.medicinal_detail_href(href)
+                print(sort)
+                print(detail)
+
+
+if __name__ == '__main__':
+    app = Main()
+    try:
+        app.main()
+    finally:
+        # Quit Chrome even when a page load or XPath lookup raises.
+        app.spider.close()
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..c171a5d
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,63 @@
+"""Selenium/lxml helpers that extract links from pages via fixed XPaths."""
+import time
+
+from selenium import webdriver
+from lxml import etree
+
+
+class Spider:
+    """Load pages in one Chrome session and query their source with XPath."""
+
+    # Absolute XPaths; they depend on the exact layout of the pages visited.
+    BIG_SORT_XPATH = '/html/body/div[5]/div/div[1]/div/a/@href'
+    SMALL_SORT_XPATH = "/html/body/div[5]/div[2]/div[3]/div/dl/dd/ul/li[@class='more']/a/@href"
+    DETAIL_XPATH = '/html/body/div[5]/div[2]/div[4]/div/dl/dd/ul/li/a/@href'
+    SORT_XPATH = '/html/body/div[5]/div[2]/div[1]/div[1]/b/h1/text()'
+
+    def __init__(self):
+        self.bro = webdriver.Chrome()
+
+    def close(self):
+        """Quit the Chrome session opened by ``__init__``."""
+        self.bro.quit()
+
+    def _query(self, url, *xpaths, delay=0):
+        """Load *url* and return one result list per XPath, in order.
+
+        ``delay`` seconds are slept after navigation and before the page source
+        is read, so the pause can actually affect what gets parsed.
+        """
+        self.bro.get(url)
+        if delay:
+            time.sleep(delay)
+        html = etree.HTML(self.bro.page_source)
+        if html is None:
+            # Nothing parseable came back; report "no matches" instead of
+            # failing with AttributeError on ``None.xpath``.
+            return [[] for _ in xpaths]
+        return [html.xpath(xpath) for xpath in xpaths]
+
+    def small_sort_spider(self, url):
+        """Return the hrefs of the ``li[@class='more']`` links on *url*."""
+        (more_href,) = self._query(url, self.SMALL_SORT_XPATH, delay=1)
+        return more_href
+
+    def big_sort_spider(self, url):
+        """Return the hrefs matched by ``BIG_SORT_XPATH`` on *url*."""
+        (href,) = self._query(url, self.BIG_SORT_XPATH)
+        return href
+
+    def medicinal_detail_href(self, url):
+        """Return ``(detail, sort)`` for *url*.
+
+        ``detail`` is the list of hrefs matched by ``DETAIL_XPATH`` and ``sort``
+        the list of ``<h1>`` text nodes matched by ``SORT_XPATH``.
+        """
+        detail, sort = self._query(url, self.DETAIL_XPATH, self.SORT_XPATH)
+        return detail, sort
+
+    def more_medicinal_detail_href(self, url):
+        """Return ``(detail, sort)`` for *url*; same lookup as
+        ``medicinal_detail_href``, kept as a separate name for callers.
+        """
+        return self.medicinal_detail_href(url)