diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java
new file mode 100644
index 0000000..2fcf33d
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java
@@ -0,0 +1,11 @@
+package com.tamguo.dao;
+
+import com.tamguo.config.dao.SuperMapper;
+import com.tamguo.model.BookEntity;
+
+/**
+ * Data-access mapper for {@link BookEntity} rows.
+ */
+public interface BookMapper extends SuperMapper<BookEntity> {
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java
new file mode 100644
index 0000000..bf312b8
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java
@@ -0,0 +1,11 @@
+package com.tamguo.dao;
+
+import com.tamguo.config.dao.SuperMapper;
+import com.tamguo.model.CrawlerBookEntity;
+
+/**
+ * Data-access mapper for {@link CrawlerBookEntity} rows.
+ */
+public interface CrawlerBookMapper extends SuperMapper<CrawlerBookEntity> {
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java
new file mode 100644
index 0000000..fb6a2c5
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java
@@ -0,0 +1,106 @@
+package com.tamguo.model;
+
+import com.baomidou.mybatisplus.annotations.TableName;
+import com.tamguo.config.dao.SuperEntity;
+
+import java.io.Serializable;
+
+/**
+ * The persistent class for the tiku_book database table.
+ *
+ * <p>Plain bean: every column is exposed through a getter/setter pair and no
+ * validation is performed.
+ */
+@TableName(value = "tiku_book")
+public class BookEntity extends SuperEntity implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    private String subjectId;
+
+    private String courseId;
+
+    private String name;
+
+    private String publishingHouse;
+
+    private Integer questionNum;
+
+    private Integer pointNum;
+
+    private Integer orders;
+
+    /** Spare column; CrawlerBookService reads it as the book's page URL. */
+    private String reserveField1;
+
+    public BookEntity() {
+    }
+
+    /** Returns the serialization version constant of this class. */
+    public static long getSerialVersionUID() {
+        return serialVersionUID;
+    }
+
+    public String getSubjectId() {
+        return subjectId;
+    }
+
+    public void setSubjectId(String subjectId) {
+        this.subjectId = subjectId;
+    }
+
+    public String getCourseId() {
+        return courseId;
+    }
+
+    public void setCourseId(String courseId) {
+        this.courseId = courseId;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getPublishingHouse() {
+        return publishingHouse;
+    }
+
+    public void setPublishingHouse(String publishingHouse) {
+        this.publishingHouse = publishingHouse;
+    }
+
+    public Integer getQuestionNum() {
+        return questionNum;
+    }
+
+    public void setQuestionNum(Integer questionNum) {
+        this.questionNum = questionNum;
+    }
+
+    public Integer getPointNum() {
+        return pointNum;
+    }
+
+    public void setPointNum(Integer pointNum) {
+        this.pointNum = pointNum;
+    }
+
+    public Integer getOrders() {
+        return orders;
+    }
+
+    public void setOrders(Integer orders) {
+        this.orders = orders;
+    }
+
+    public String getReserveField1() {
+        return reserveField1;
+    }
+
+    public void setReserveField1(String reserveField1) {
+        this.reserveField1 = reserveField1;
+    }
+}
\ No newline at end of file
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java
new file mode 100644
index 0000000..79c5041
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java
@@ -0,0 +1,55 @@
+package com.tamguo.model;
+
+import com.baomidou.mybatisplus.annotations.TableName;
+import com.tamguo.config.dao.SuperEntity;
+
+import java.io.Serializable;
+
+/**
+ * The persistent class for the crawler_book database table.
+ */
+@TableName(value = "crawler_book")
+public class CrawlerBookEntity extends SuperEntity implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    /** URL stored for the book; CrawlerBookService fills it with the crawled image URL. */
+    private String bookUrl;
+
+    /** Uid of the book this row belongs to. */
+    private String bookUid;
+
+    /** Ordering number; CrawlerBookService derives it from the image file name. */
+    private Integer orders;
+
+    public CrawlerBookEntity() {
+    }
+
+    /** Returns the serialization version constant of this class. */
+    public static long getSerialVersionUID() {
+        return serialVersionUID;
+    }
+
+    public String getBookUrl() {
+        return bookUrl;
+    }
+
+    public void setBookUrl(String bookUrl) {
+        this.bookUrl = bookUrl;
+    }
+
+    public String getBookUid() {
+        return bookUid;
+    }
+
+    public void setBookUid(String bookUid) {
+        this.bookUid = bookUid;
+    }
+
+    public Integer getOrders() {
+        return orders;
+    }
+
+    public void setOrders(Integer orders) {
+        this.orders = orders;
+    }
+}
\ No newline at end of file
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java
new file mode 100644
index 0000000..c37ca2b
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java
@@ -0,0 +1,36 @@
+package com.tamguo.model.vo;
+
+import com.xuxueli.crawler.annotation.PageFieldSelect;
+import com.xuxueli.crawler.annotation.PageSelect;
+import com.xuxueli.crawler.conf.XxlCrawlerConf;
+
+/**
+ * Crawler page object selected with the CSS query {@code .pic_right li}.
+ */
+@PageSelect(cssQuery = ".pic_right li")
+public class BookVo {
+
+    /** Value selected by {@code .text} inside the matched element. */
+    @PageFieldSelect(cssQuery = ".text")
+    private String name;
+
+    /** The {@code abs:href} attribute of the {@code a} element inside the matched element. */
+    @PageFieldSelect(cssQuery = "a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private String bookUrl;
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getBookUrl() {
+        return bookUrl;
+    }
+
+    public void setBookUrl(String bookUrl) {
+        this.bookUrl = bookUrl;
+    }
+}
diff 
--git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java
new file mode 100644
index 0000000..70378fc
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java
@@ -0,0 +1,37 @@
+package com.tamguo.model.vo;
+
+import com.xuxueli.crawler.annotation.PageFieldSelect;
+import com.xuxueli.crawler.annotation.PageSelect;
+
+import java.util.List;
+
+/**
+ * Crawler page object selected with the CSS query {@code .out-chapter}.
+ */
+@PageSelect(cssQuery = ".out-chapter")
+public class ChapterVo {
+
+    /** Value selected by {@code h3} inside the matched element. */
+    @PageFieldSelect(cssQuery = "h3")
+    private String name;
+
+    /** Values selected by {@code .out-list li}; BookService stores each as a sub-chapter name. */
+    @PageFieldSelect(cssQuery = ".out-list li")
+    private List<String> sonChapters;
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public List<String> getSonChapters() {
+        return sonChapters;
+    }
+
+    public void setSonChapters(List<String> sonChapters) {
+        this.sonChapters = sonChapters;
+    }
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java
new file mode 100644
index 0000000..0c348a1
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java
@@ -0,0 +1,24 @@
+package com.tamguo.model.vo;
+
+import com.xuxueli.crawler.annotation.PageFieldSelect;
+import com.xuxueli.crawler.annotation.PageSelect;
+import com.xuxueli.crawler.conf.XxlCrawlerConf;
+
+/**
+ * Crawler page object selected with the CSS query {@code body}.
+ */
+@PageSelect(cssQuery = "body")
+public class CrawlerBookVo {
+
+    /** The {@code abs:src} attribute of the element matching {@code .con .pic img}. */
+    @PageFieldSelect(cssQuery = ".con .pic img", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:src")
+    private String bookImage;
+
+    public String getBookImage() {
+        return bookImage;
+    }
+
+    public void setBookImage(String bookImage) {
+        this.bookImage = bookImage;
+    }
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java
new file mode 100644
index 0000000..3aeae12
--- /dev/null
+++ 
b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java
@@ -0,0 +1,13 @@
+package com.tamguo.service;
+
+/**
+ * Service that crawls book data.
+ */
+public interface IBookService {
+
+    /**
+     * Crawls book data.
+     */
+    void crawlerBook();
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java
new file mode 100644
index 0000000..f2c223f
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java
@@ -0,0 +1,13 @@
+package com.tamguo.service;
+
+/**
+ * Service that crawls book data.
+ */
+public interface ICrawlerBookService {
+
+    /**
+     * Crawls book data.
+     */
+    void crawlerBook();
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java
new file mode 100644
index 0000000..4d37f18
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java
@@ -0,0 +1,101 @@
+package com.tamguo.service.impl;
+
+import com.tamguo.dao.ChapterMapper;
+import com.tamguo.model.ChapterEntity;
+import com.tamguo.model.vo.ChapterVo;
+import com.tamguo.service.IBookService;
+import com.xuxueli.crawler.XxlCrawler;
+import com.xuxueli.crawler.parser.PageParser;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+import java.util.UUID;
+
+/**
+ * Crawls one chapter-list page and stores every chapter block found on it as a
+ * parent chapter row plus one row per sub-chapter.
+ */
+@Service
+public class BookService implements IBookService {
+
+    /** The page that is crawled; link spreading is switched off below. */
+    private static final String CHAPTER_LIST_URL =
+            "https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-27-jiaocai-11";
+
+    /** Course id written to every stored chapter. */
+    private static final String DEFAULT_COURSE_ID = "0";
+
+    /** Parent id written to chapters that have no parent. */
+    private static final String ROOT_PARENT_ID = "-1";
+
+    @Autowired
+    ChapterMapper chapterMapper;
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /**
+     * Crawls {@link #CHAPTER_LIST_URL} and persists each parsed {@link ChapterVo}.
+     */
+    @Override
+    public void crawlerBook() {
+        XxlCrawler crawler = new XxlCrawler.Builder()
+                .setUrls(CHAPTER_LIST_URL)
+                .setAllowSpread(false)
+                .setFailRetryCount(5)
+                .setThreadCount(20)
+                .setPageParser(new PageParser<ChapterVo>() {
+                    @Override
+                    public void parse(Document html, Element pageVoElement, ChapterVo chapterVo) {
+                        saveChapterTree(chapterVo);
+                    }
+                }).build();
+
+        // NOTE(review): "true" presumably requests a synchronous run - confirm
+        // against the XXL-CRAWLER version in use.
+        crawler.start(true);
+    }
+
+    /**
+     * Inserts the parent chapter of {@code chapterVo} followed by its sub-chapters.
+     *
+     * @param chapterVo one parsed chapter block; a {@code null} sub-chapter list is tolerated
+     */
+    private void saveChapterTree(ChapterVo chapterVo) {
+        // Generated up front because the sub-chapters store it as their parent id.
+        String parentUid = UUID.randomUUID().toString().replace("-", "");
+        ChapterEntity parent = newChapter(chapterVo.getName(), ROOT_PARENT_ID);
+        parent.setUid(parentUid);
+        chapterMapper.insert(parent);
+
+        List<String> sonChapters = chapterVo.getSonChapters();
+        if (sonChapters == null) {
+            logger.warn("Chapter '{}' has no sub-chapter list", chapterVo.getName());
+            return;
+        }
+        for (String sonName : sonChapters) {
+            chapterMapper.insert(newChapter(sonName, parentUid));
+        }
+    }
+
+    /**
+     * Builds an unsaved chapter with zeroed question and point counters.
+     *
+     * <p>NOTE(review): only the course id is populated; confirm whether further
+     * foreign keys (for example a book id) should be set as well.
+     */
+    private static ChapterEntity newChapter(String name, String parentId) {
+        ChapterEntity chapter = new ChapterEntity();
+        chapter.setName(name);
+        chapter.setCourseId(DEFAULT_COURSE_ID);
+        chapter.setParentId(parentId);
+        chapter.setQuestionNum(0);
+        chapter.setPointNum(0);
+        return chapter;
+    }
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java
new file mode 100644
index 0000000..2558157
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java
@@ -0,0 +1,146 @@
+package com.tamguo.service.impl;
+
+import com.baomidou.mybatisplus.mapper.Condition;
+import com.tamguo.dao.BookMapper;
+import com.tamguo.dao.CrawlerBookMapper;
+import com.tamguo.model.BookEntity;
+import com.tamguo.model.CrawlerBookEntity;
+import com.tamguo.model.vo.CrawlerBookVo;
+import com.tamguo.service.ICrawlerBookService;
+import com.xuxueli.crawler.XxlCrawler;
+import com.xuxueli.crawler.parser.PageParser;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+import java.util.OptionalInt;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * For every stored book, crawls the pages whose URL has the same shape as the
+ * book's own page URL and records the image found on each of them.
+ */
+@Service
+public class CrawlerBookService implements ICrawlerBookService {
+
+    /** Seed URL from which every crawl starts. */
+    private static final String SEED_URL = "http://www.ruiwen.com/jiaocai/";
+
+    /** One run of decimal digits inside a URL. */
+    private static final Pattern DIGITS = Pattern.compile("\\d+");
+
+    @Autowired
+    CrawlerBookMapper crawlerBookMapper;
+    @Autowired
+    BookMapper bookMapper;
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /**
+     * Runs one crawl per stored book, one after another. Books without a page
+     * URL (read from {@code reserveField1}) are skipped.
+     */
+    @Override
+    public void crawlerBook() {
+        List<BookEntity> bookEntities = bookMapper.selectList(Condition.EMPTY);
+        for (BookEntity bookEntity : bookEntities) {
+            String url = bookEntity.getReserveField1();
+            String bookId = bookEntity.getUid();
+            if (StringUtils.isBlank(url)) {
+                // No page URL means no whitelist can be built for this book.
+                logger.warn("Book {} has no page URL, skipping", bookId);
+                continue;
+            }
+
+            XxlCrawler crawler = new XxlCrawler.Builder()
+                    .setUrls(SEED_URL)
+                    .setWhiteUrlRegexs(toWhiteUrlRegex(url))
+                    .setAllowSpread(true)
+                    .setFailRetryCount(5)
+                    .setThreadCount(20)
+                    .setPageParser(new PageParser<CrawlerBookVo>() {
+                        @Override
+                        public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) {
+                            saveBookImage(bookId, crawlerBookVo.getBookImage());
+                        }
+                    }).build();
+
+            crawler.start(true);
+        }
+    }
+
+    /**
+     * Stores one crawled image for a book. Blank image URLs and image URLs whose
+     * file name is not an integer are skipped instead of failing the page.
+     *
+     * @param bookId uid of the book being crawled
+     * @param img    image URL taken from the page, may be blank
+     */
+    private void saveBookImage(String bookId, String img) {
+        if (StringUtils.isBlank(img)) {
+            return;
+        }
+        OptionalInt order = parseOrder(img);
+        if (!order.isPresent()) {
+            logger.warn("Cannot derive an order number from image URL {}", img);
+            return;
+        }
+        CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity();
+        crawlerBookEntity.setBookUid(bookId);
+        crawlerBookEntity.setBookUrl(img);
+        crawlerBookEntity.setOrders(order.getAsInt());
+        crawlerBookMapper.insert(crawlerBookEntity);
+    }
+
+    /**
+     * Turns a concrete page URL into a whitelist regex: every run of digits
+     * becomes {@code \d+} and all other text is quoted so it matches literally.
+     *
+     * @param url page URL of one book, not {@code null}
+     * @return regex matching URLs that differ from {@code url} only in their numbers
+     */
+    static String toWhiteUrlRegex(String url) {
+        StringBuilder regex = new StringBuilder();
+        Matcher digits = DIGITS.matcher(url);
+        int literalStart = 0;
+        while (digits.find()) {
+            if (digits.start() > literalStart) {
+                regex.append(Pattern.quote(url.substring(literalStart, digits.start())));
+            }
+            regex.append("\\d+");
+            literalStart = digits.end();
+        }
+        if (literalStart < url.length()) {
+            regex.append(Pattern.quote(url.substring(literalStart)));
+        }
+        return regex.toString();
+    }
+
+    /**
+     * Reads the integer between the last {@code '/'} and the last {@code '.'} of
+     * an image URL.
+     *
+     * @param imageUrl image URL, not {@code null}
+     * @return the parsed number, or empty when that part is missing or not an integer
+     */
+    static OptionalInt parseOrder(String imageUrl) {
+        int nameStart = imageUrl.lastIndexOf('/') + 1;
+        int nameEnd = imageUrl.lastIndexOf('.');
+        if (nameEnd <= nameStart) {
+            return OptionalInt.empty();
+        }
+        try {
+            return OptionalInt.of(Integer.parseInt(imageUrl.substring(nameStart, nameEnd)));
+        } catch (NumberFormatException ignored) {
+            // Not a numeric file name; the caller logs and skips the image.
+            return OptionalInt.empty();
+        }
+    }
+
+}
diff --git a/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java
new file mode 100644
index 0000000..7b496bf
--- /dev/null
+++ 
b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java
@@ -0,0 +1,28 @@
+package com.tamguo;
+
+import com.tamguo.service.IBookService;
+import com.tamguo.service.ISubjectService;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit4.SpringRunner;
+
+/**
+ * Manual integration run: boots the Spring context and executes the real crawl
+ * through {@link IBookService}, so it is not an isolated unit test.
+ */
+@RunWith(SpringRunner.class)
+@SpringBootTest
+public class BookCrawler {
+
+    @Autowired
+    private IBookService bookService;
+
+    /** Runs the crawl once; the test passes when no exception escapes. */
+    @Test
+    public void crawlerBook() {
+        bookService.crawlerBook();
+    }
+
+}
diff --git a/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java
new file mode 100644
index 0000000..0a0c2da
--- /dev/null
+++ b/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java
@@ -0,0 +1,28 @@
+package com.tamguo;
+
+import com.tamguo.service.IBookService;
+import com.tamguo.service.ICrawlerBookService;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit4.SpringRunner;
+
+/**
+ * Manual integration run: boots the Spring context and executes the real crawl
+ * through {@link ICrawlerBookService}, so it is not an isolated unit test.
+ */
+@RunWith(SpringRunner.class)
+@SpringBootTest
+public class CrawlerBookCrawler {
+
+    @Autowired
+    private ICrawlerBookService crawlerBookService;
+
+    /** Runs the crawl once; the test passes when no exception escapes. */
+    @Test
+    public void crawlerBook() {
+        crawlerBookService.crawlerBook();
+    }
+
+}