diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java index 8803011..fb6a2c5 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java @@ -27,6 +27,8 @@ public class BookEntity extends SuperEntity implements Serializable private Integer orders; + private String reserveField1; + public BookEntity() { } @@ -89,4 +91,12 @@ public class BookEntity extends SuperEntity implements Serializable public void setOrders(Integer orders) { this.orders = orders; } + + public String getReserveField1() { + return reserveField1; + } + + public void setReserveField1(String reserveField1) { + this.reserveField1 = reserveField1; + } } \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 8d337c0..c37ca2b 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -4,19 +4,29 @@ import com.xuxueli.crawler.annotation.PageFieldSelect; import com.xuxueli.crawler.annotation.PageSelect; import com.xuxueli.crawler.conf.XxlCrawlerConf; -import java.util.List; - -@PageSelect(cssQuery = "body") +@PageSelect(cssQuery = ".pic_right li") public class BookVo { - @PageFieldSelect(cssQuery = ".pic_right .text") - private List name; + @PageFieldSelect(cssQuery = ".text") + private String name; + + @PageFieldSelect(cssQuery = "a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private String bookUrl; + + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } - public List getName() { - return name; - } + public String getBookUrl() { + return bookUrl; + } - public void setName(List name) { - this.name = name; - } + public void setBookUrl(String bookUrl) { + this.bookUrl = bookUrl; + } } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index 4c467a1..2b27c9c 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -13,8 +13,6 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.List; - @Service public class BookService implements IBookService { @@ -39,14 +37,15 @@ public class BookService implements IBookService { String pageUrl = html.baseUri(); if (pageUrl.equals("http://www.ruiwen.com/jiaocai/")) { logger.info("开始解析书本信息:{}", pageUrl); - List books = bookVo.getName(); - books.forEach(item -> { - BookEntity bookEntity = new BookEntity(); - bookEntity.setName(item); - bookEntity.setQuestionNum(0); - bookEntity.setPointNum(0); - bookMapper.insert(bookEntity); - }); + String name = bookVo.getName(); + String url = bookVo.getBookUrl(); + + BookEntity bookEntity = new BookEntity(); + bookEntity.setName(name); + bookEntity.setReserveField1(url); + bookEntity.setQuestionNum(0); + bookEntity.setPointNum(0); + bookMapper.insert(bookEntity); } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java index 4d54fe7..2558157 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java @@ -1,7 +1,9 @@ package com.tamguo.service.impl; -import com.tamguo.config.redis.CacheService; +import com.baomidou.mybatisplus.mapper.Condition; +import com.tamguo.dao.BookMapper; import com.tamguo.dao.CrawlerBookMapper; +import com.tamguo.model.BookEntity; import com.tamguo.model.CrawlerBookEntity; import com.tamguo.model.vo.CrawlerBookVo; import com.tamguo.service.ICrawlerBookService; @@ -15,52 +17,61 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import java.util.List; + @Service public class CrawlerBookService implements ICrawlerBookService { @Autowired CrawlerBookMapper crawlerBookMapper; + @Autowired + BookMapper bookMapper; private Logger logger = LoggerFactory.getLogger(getClass()); - private static final String FILES_NO_FORMAT = "000000"; - private static final String FILES_PREFIX = "FPIMAGE"; - private static final String DOMAIN = "http://www.tamguo.com"; - @Autowired - CacheService cacheService; - - //一年级语文上册 @Override public void crawlerBook() { - XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("http://www.ruiwen.com/jiaocai/") - .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjishangce/shangce\\d+\\.html")//一年级语文上册 -// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级语文下册 -// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yingyu/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级英语上册 - .setAllowSpread(true) - .setFailRetryCount(5) - .setThreadCount(20) - .setPageParser(new PageParser() { - @Override - public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) { - // 解析封装 PageVo 对象 - String img = crawlerBookVo.getBookImage(); - if (StringUtils.isNoneBlank(img)) { - CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); - crawlerBookEntity.setBookUid("1019244094196551682");//一年级语文上册 -// crawlerBookEntity.setBookUid("1019244094704062466");//一年级语文下册 -// crawlerBookEntity.setBookUid("1019244096797020162");//一年级英语上册 - crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); - crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); - crawlerBookMapper.insert(crawlerBookEntity); + List bookEntities = bookMapper.selectList(Condition.EMPTY); + for (BookEntity bookEntity : bookEntities) { + String url = bookEntity.getReserveField1(); + String bookId = bookEntity.getUid(); + String regexs = url.replaceAll("\\d+", "\\\\d+").replaceAll("\\.","\\\\."); + + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls("http://www.ruiwen.com/jiaocai/") + .setWhiteUrlRegexs(regexs)// + .setAllowSpread(true) + .setFailRetryCount(5) + .setThreadCount(20) + .setPageParser(new PageParser() { + @Override + public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) { + String pageUrl = html.baseUri(); + // 解析封装 PageVo 对象 + String img = crawlerBookVo.getBookImage(); + if (StringUtils.isNoneBlank(img)) { + CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); + crawlerBookEntity.setBookUid(bookId); + crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); + crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); + crawlerBookMapper.insert(crawlerBookEntity); + } + + } - } - }).build(); + }).build(); + + // 获取科目 + crawler.start(true); + } - // 获取科目 - crawler.start(true); } +// public static void main(String[] args) { +// String url = "http://www.ruiwen.com/jiaocai/yuwen/renjiaoban/yinianjishangce/shangce1.html"; +// System.out.println(url.replaceAll("\\d+", "\\\\d+").replaceAll("\\.","\\\\.")); +// } + }