From d328257550a313cad484585e8d55cd224ac7ee1d Mon Sep 17 00:00:00 2001 From: sh00859 <302959274@qq.com> Date: Tue, 17 Jul 2018 19:17:23 +0800 Subject: [PATCH 1/7] 123 --- .../main/java/com/tamguo/model/vo/BookVo.java | 186 ++++++++++++++++++ .../java/com/tamguo/service/IBookService.java | 11 ++ .../com/tamguo/service/impl/BookService.java | 95 +++++++++ .../src/test/java/com/tamguo/BookCrawler.java | 23 +++ 4 files changed, 315 insertions(+) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java create mode 100644 tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java new file mode 100644 index 0000000..2dc8257 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -0,0 +1,186 @@ +package com.tamguo.model.vo; + +import com.xuxueli.crawler.annotation.PageFieldSelect; +import com.xuxueli.crawler.annotation.PageSelect; +import com.xuxueli.crawler.conf.XxlCrawlerConf; + +import java.util.List; + +@PageSelect(cssQuery = "body") +public class BookVo { + + @PageFieldSelect(cssQuery = ".all-list-li") + private List name; + + // 类型名称 + @PageFieldSelect(cssQuery=".submenu-contain .contain-title") + private String subjectName; + + // 科目信息 + @PageFieldSelect(cssQuery=".course-list-container .course-list .course-item") + private List courseName; + + // 带采集的科目URLs + @PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List courseUrls; + + + @PageFieldSelect(cssQuery=".screening .selected a") + private String chapterPageCourseName; + + @PageFieldSelect(cssQuery=".screening .selected a") + private String chapterCurrName; + + // 带采集的章节URLs缓存 + @PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List chapterUrlsTemp; + + // 待采集的章节URLs + @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List chapterUrls; + + // 待采集的问题URLs + @PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List questionUrlsTemp; + + // 待采集问题URLs + @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List questionUrls; + + // 单个题目数据 + @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) + private String content; + + @PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML) + private List answer; + + @PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML) + private String analysis; + + @PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT) + private String questionType; + + @PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT) + private String score; + + public List getName() { + return name; + } + + public void setName(List name) { + this.name = name; + } + + public List getCourseName() { + return courseName; + } + + public void setCourseName(List courseName) { + this.courseName = courseName; + } + + public String getSubjectName() { + return subjectName; + } + + public void setSubjectName(String subjectName) { + this.subjectName = subjectName; + } + + public List getCourseUrls() { + return courseUrls; + } + + public void setCourseUrls(List courseUrls) { + this.courseUrls = courseUrls; + } + + public List getChapterUrls() { + return chapterUrls; + } + + public void setChapterUrls(List chapterUrls) { + this.chapterUrls = chapterUrls; + } + + public List getChapterUrlsTemp() { + return chapterUrlsTemp; + } + + public void setChapterUrlsTemp(List chapterUrlsTemp) { + this.chapterUrlsTemp = chapterUrlsTemp; + } + + public String getChapterPageCourseName() { + return chapterPageCourseName; + } + + public void setChapterPageCourseName(String chapterPageCourseName) { + this.chapterPageCourseName = chapterPageCourseName; + } + + public String getChapterCurrName() { + return chapterCurrName; + } + + public void setChapterCurrName(String chapterCurrName) { + this.chapterCurrName = chapterCurrName; + } + + public List getQuestionUrlsTemp() { + return questionUrlsTemp; + } + + public void setQuestionUrlsTemp(List questionUrlsTemp) { + this.questionUrlsTemp = questionUrlsTemp; + } + + public List getQuestionUrls() { + return questionUrls; + } + + public void setQuestionUrls(List questionUrls) { + this.questionUrls = questionUrls; + } + + public String getAnalysis() { + return analysis; + } + + public void setAnalysis(String analysis) { + this.analysis = analysis; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getQuestionType() { + return questionType; + } + + public void setQuestionType(String questionType) { + this.questionType = questionType; + } + + public String getScore() { + return score; + } + + public void setScore(String score) { + this.score = score; + } + + public List getAnswer() { + return answer; + } + + public void setAnswer(List answer) { + this.answer = answer; + } + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java new file mode 100644 index 0000000..3aeae12 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java @@ -0,0 +1,11 @@ +package com.tamguo.service; + +public interface IBookService { + + /** + * 爬取书本数据 + */ + void crawlerBook(); + + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java new file mode 100644 index 0000000..c5b554f --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -0,0 +1,95 @@ +package com.tamguo.service.impl; + +import com.tamguo.dao.ChapterMapper; +import com.tamguo.dao.CourseMapper; +import com.tamguo.dao.CrawlerQuestionMapper; +import com.tamguo.dao.SubjectMapper; +import com.tamguo.model.vo.BookVo; +import com.tamguo.service.IBookService; +import com.xuxueli.crawler.XxlCrawler; +import com.xuxueli.crawler.parser.PageParser; +import com.xuxueli.crawler.rundata.RunData; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +@Service +public class BookService implements IBookService { + + + @Autowired + SubjectMapper subjectMapper; + @Autowired + CourseMapper courseMapper; + @Autowired + ChapterMapper chapterMapper; + @Autowired + CrawlerQuestionMapper crawlerQuestionMapper; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private Set urls = new HashSet<>(); + + private Set questionUrls = new HashSet(); + + private Map chapterQuestionListMap = new HashMap<>(); + + private RunData runData; + + @Override + public void crawlerBook() { + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls("http://www.dzkbw.com") + .setAllowSpread(false) + .setFailRetryCount(5) + .setThreadCount(20) + .setPageParser(new PageParser() { + @Override + public void parse(Document html, Element pageVoElement, BookVo bookVo) { + // 解析封装 PageVo 对象 + String pageUrl = html.baseUri(); + if (pageUrl.equals("https://tiku.baidu.com/")) { + logger.info("开始解析考试分类:{}", pageUrl); +// for (int i = 0; i < subjectVo.getName().size(); i++) { +// String name = subjectVo.getName().get(i); +// +// SubjectEntity subject = subjectMapper.findByName(name); +// if (subject != null) { +// continue; +// } +// SubjectEntity entity = new SubjectEntity(); +// if (name.equals("高考")) { +// name = "高考"; +// entity.setName(name); +// subjectMapper.insert(entity); +// // 加入科目爬取数据 +// for (String url : subjectVo.getCourseUrls()) { +// runData.addUrl(url); +// } +// +// } +// +// +// } + + + } + + + } + }).build(); + + runData = crawler.getRunData(); + // 获取科目 + crawler.start(true); + } + +} diff --git a/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java new file mode 100644 index 0000000..7b496bf --- /dev/null +++ b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java @@ -0,0 +1,23 @@ +package com.tamguo; + +import com.tamguo.service.IBookService; +import com.tamguo.service.ISubjectService; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.junit4.SpringRunner; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class BookCrawler { + + @Autowired + IBookService bookService; + + @Test + public void crawlerBook() throws Exception { + bookService.crawlerBook(); + } + +} From b1a9cdfc89f3d8217b1c5e144cb7c89bdd8a1ba1 Mon Sep 17 00:00:00 2001 From: cff <302959274@qq.com> Date: Tue, 17 Jul 2018 22:05:01 +0800 Subject: [PATCH 2/7] =?UTF-8?q?tiku=5Fbook=E7=AE=A1=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/com/tamguo/dao/BookMapper.java | 8 + .../java/com/tamguo/model/BookEntity.java | 92 ++++++++++ .../main/java/com/tamguo/model/vo/BookVo.java | 166 +----------------- .../com/tamguo/service/impl/BookService.java | 64 ++----- 4 files changed, 116 insertions(+), 214 deletions(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java new file mode 100644 index 0000000..2fcf33d --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.BookEntity; + +public interface BookMapper extends SuperMapper { + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java new file mode 100644 index 0000000..0d3967b --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java @@ -0,0 +1,92 @@ +package com.tamguo.model; + +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + +import java.io.Serializable; + +/** + * The persistent class for the tiku_course database table. + */ +@TableName(value = "tiku_book") +public class BookEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + + private String subjectId; + + private String courseId; + + private String name; + + private String publishingHouse; + + private Integer questionNum; + + private Integer pointNum; + + private Integer orders; + + public BookEntity() { + } + + public static long getSerialVersionUID() { + return serialVersionUID; + } + + public String getSubjectId() { + return subjectId; + } + + public void setSubjectId(String subjectId) { + this.subjectId = subjectId; + } + + public String getCourseId() { + return courseId; + } + + public void setCourseId(String courseId) { + this.courseId = courseId; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getPublishingHouse() { + return publishingHouse; + } + + public void setPublishingHouse(String publishingHouse) { + this.publishingHouse = publishingHouse; + } + + public Integer getQuestionNum() { + return questionNum; + } + + public void setQuestionNum(Integer questionNum) { + this.questionNum = questionNum; + } + + public Integer getPointNum() { + return pointNum; + } + + public void setPointNum(Integer pointNum) { + this.pointNum = pointNum; + } + + public Integer getOrders() { + return orders; + } + + public void setOrders(Integer orders) { + this.orders = orders; + } +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 2dc8257..9a80c74 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -9,59 +9,8 @@ import java.util.List; @PageSelect(cssQuery = "body") public class BookVo { - @PageFieldSelect(cssQuery = ".all-list-li") + @PageFieldSelect(cssQuery = ".ih3") private List name; - - // 类型名称 - @PageFieldSelect(cssQuery=".submenu-contain .contain-title") - private String subjectName; - - // 科目信息 - @PageFieldSelect(cssQuery=".course-list-container .course-list .course-item") - private List courseName; - - // 带采集的科目URLs - @PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List courseUrls; - - - @PageFieldSelect(cssQuery=".screening .selected a") - private String chapterPageCourseName; - - @PageFieldSelect(cssQuery=".screening .selected a") - private String chapterCurrName; - - // 带采集的章节URLs缓存 - @PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List chapterUrlsTemp; - - // 待采集的章节URLs - @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List chapterUrls; - - // 待采集的问题URLs - @PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List questionUrlsTemp; - - // 待采集问题URLs - @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List questionUrls; - - // 单个题目数据 - @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) - private String content; - - @PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML) - private List answer; - - @PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML) - private String analysis; - - @PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT) - private String questionType; - - @PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT) - private String score; public List getName() { return name; @@ -70,117 +19,4 @@ public class BookVo { public void setName(List name) { this.name = name; } - - public List getCourseName() { - return courseName; - } - - public void setCourseName(List courseName) { - this.courseName = courseName; - } - - public String getSubjectName() { - return subjectName; - } - - public void setSubjectName(String subjectName) { - this.subjectName = subjectName; - } - - public List getCourseUrls() { - return courseUrls; - } - - public void setCourseUrls(List courseUrls) { - this.courseUrls = courseUrls; - } - - public List getChapterUrls() { - return chapterUrls; - } - - public void setChapterUrls(List chapterUrls) { - this.chapterUrls = chapterUrls; - } - - public List getChapterUrlsTemp() { - return chapterUrlsTemp; - } - - public void setChapterUrlsTemp(List chapterUrlsTemp) { - this.chapterUrlsTemp = chapterUrlsTemp; - } - - public String getChapterPageCourseName() { - return chapterPageCourseName; - } - - public void setChapterPageCourseName(String chapterPageCourseName) { - this.chapterPageCourseName = chapterPageCourseName; - } - - public String getChapterCurrName() { - return chapterCurrName; - } - - public void setChapterCurrName(String chapterCurrName) { - this.chapterCurrName = chapterCurrName; - } - - public List getQuestionUrlsTemp() { - return questionUrlsTemp; - } - - public void setQuestionUrlsTemp(List questionUrlsTemp) { - this.questionUrlsTemp = questionUrlsTemp; - } - - public List getQuestionUrls() { - return questionUrls; - } - - public void setQuestionUrls(List questionUrls) { - this.questionUrls = questionUrls; - } - - public String getAnalysis() { - return analysis; - } - - public void setAnalysis(String analysis) { - this.analysis = analysis; - } - - public String getContent() { - return content; - } - - public void setContent(String content) { - this.content = content; - } - - public String getQuestionType() { - return questionType; - } - - public void setQuestionType(String questionType) { - this.questionType = questionType; - } - - public String getScore() { - return score; - } - - public void setScore(String score) { - this.score = score; - } - - public List getAnswer() { - return answer; - } - - public void setAnswer(List answer) { - this.answer = answer; - } - } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index c5b554f..5459b7f 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -1,14 +1,11 @@ package com.tamguo.service.impl; -import com.tamguo.dao.ChapterMapper; -import com.tamguo.dao.CourseMapper; -import com.tamguo.dao.CrawlerQuestionMapper; -import com.tamguo.dao.SubjectMapper; +import com.tamguo.dao.BookMapper; +import com.tamguo.model.BookEntity; import com.tamguo.model.vo.BookVo; import com.tamguo.service.IBookService; import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.parser.PageParser; -import com.xuxueli.crawler.rundata.RunData; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; @@ -16,33 +13,17 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.List; @Service public class BookService implements IBookService { @Autowired - SubjectMapper subjectMapper; - @Autowired - CourseMapper courseMapper; - @Autowired - ChapterMapper chapterMapper; - @Autowired - CrawlerQuestionMapper crawlerQuestionMapper; + BookMapper bookMapper; private Logger logger = LoggerFactory.getLogger(getClass()); - private Set urls = new HashSet<>(); - - private Set questionUrls = new HashSet(); - - private Map chapterQuestionListMap = new HashMap<>(); - - private RunData runData; @Override public void crawlerBook() { @@ -56,38 +37,23 @@ public class BookService implements IBookService { public void parse(Document html, Element pageVoElement, BookVo bookVo) { // 解析封装 PageVo 对象 String pageUrl = html.baseUri(); - if (pageUrl.equals("https://tiku.baidu.com/")) { - logger.info("开始解析考试分类:{}", pageUrl); -// for (int i = 0; i < subjectVo.getName().size(); i++) { -// String name = subjectVo.getName().get(i); -// -// SubjectEntity subject = subjectMapper.findByName(name); -// if (subject != null) { -// continue; -// } -// SubjectEntity entity = new SubjectEntity(); -// if (name.equals("高考")) { -// name = "高考"; -// entity.setName(name); -// subjectMapper.insert(entity); -// // 加入科目爬取数据 -// for (String url : subjectVo.getCourseUrls()) { -// runData.addUrl(url); -// } -// -// } -// -// -// } - - + if (pageUrl.equals("http://www.dzkbw.com")) { + logger.info("开始解析书本信息:{}", pageUrl); + List books = bookVo.getName(); + books.forEach(item -> { + BookEntity bookEntity = new BookEntity(); + bookEntity.setName(item); + bookEntity.setQuestionNum(0); + bookEntity.setPointNum(0); + bookMapper.insert(bookEntity); + }); } } }).build(); - runData = crawler.getRunData(); +// runData = crawler.getRunData(); // 获取科目 crawler.start(true); } From 7c6f116beb7b62e8defaf292cfacd9d003231e0d Mon Sep 17 00:00:00 2001 From: cff <302959274@qq.com> Date: Tue, 17 Jul 2018 22:21:31 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E6=8D=A2=E7=BD=91=E7=AB=99http://www.ruiwe?= =?UTF-8?q?n.com/jiaocai/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java | 2 +- .../src/main/java/com/tamguo/service/impl/BookService.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 9a80c74..19f9dcd 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -9,7 +9,7 @@ import java.util.List; @PageSelect(cssQuery = "body") public class BookVo { - @PageFieldSelect(cssQuery = ".ih3") + @PageFieldSelect(cssQuery = ".text") private List name; public List getName() { diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index 5459b7f..4c467a1 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -28,7 +28,7 @@ public class BookService implements IBookService { @Override public void crawlerBook() { XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("http://www.dzkbw.com") + .setUrls("http://www.ruiwen.com/jiaocai/") .setAllowSpread(false) .setFailRetryCount(5) .setThreadCount(20) @@ -37,7 +37,7 @@ public class BookService implements IBookService { public void parse(Document html, Element pageVoElement, BookVo bookVo) { // 解析封装 PageVo 对象 String pageUrl = html.baseUri(); - if (pageUrl.equals("http://www.dzkbw.com")) { + if (pageUrl.equals("http://www.ruiwen.com/jiaocai/")) { logger.info("开始解析书本信息:{}", pageUrl); List books = bookVo.getName(); books.forEach(item -> { From 55e1a53ace3ecd022a484b2b4643e27b33d97894 Mon Sep 17 00:00:00 2001 From: cff <302959274@qq.com> Date: Tue, 17 Jul 2018 23:16:04 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E4=B8=80=E5=B9=B4=E7=BA=A7=E8=AF=AD?= =?UTF-8?q?=E6=96=87=E4=B8=8A=E5=86=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/tamguo/dao/CrawlerBookMapper.java | 8 +++ .../java/com/tamguo/model/BookEntity.java | 2 +- .../com/tamguo/model/CrawlerBookEntity.java | 51 +++++++++++++++ .../com/tamguo/model/vo/CrawlerBookVo.java | 20 ++++++ .../tamguo/service/ICrawlerBookService.java | 11 ++++ .../service/impl/CrawlerBookService.java | 62 +++++++++++++++++++ .../java/com/tamguo/CrawlerBookCrawler.java | 23 +++++++ 7 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java create mode 100644 tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java new file mode 100644 index 0000000..bf312b8 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerBookMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.CrawlerBookEntity; + +public interface CrawlerBookMapper extends SuperMapper { + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java index 0d3967b..8803011 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java @@ -6,7 +6,7 @@ import com.tamguo.config.dao.SuperEntity; import java.io.Serializable; /** - * The persistent class for the tiku_course database table. + * The persistent class for the tiku_book database table. */ @TableName(value = "tiku_book") public class BookEntity extends SuperEntity implements Serializable { diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java new file mode 100644 index 0000000..79c5041 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerBookEntity.java @@ -0,0 +1,51 @@ +package com.tamguo.model; + +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + +import java.io.Serializable; + +/** + * The persistent class for the crawler_book database table. + */ +@TableName(value = "crawler_book") +public class CrawlerBookEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + private String bookUrl; + + private String bookUid; + + private Integer orders; + + public CrawlerBookEntity() { + } + + public static long getSerialVersionUID() { + return serialVersionUID; + } + + public String getBookUrl() { + return bookUrl; + } + + public void setBookUrl(String bookUrl) { + this.bookUrl = bookUrl; + } + + public String getBookUid() { + return bookUid; + } + + public void setBookUid(String bookUid) { + this.bookUid = bookUid; + } + + public Integer getOrders() { + return orders; + } + + public void setOrders(Integer orders) { + this.orders = orders; + } +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java new file mode 100644 index 0000000..0c348a1 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/CrawlerBookVo.java @@ -0,0 +1,20 @@ +package com.tamguo.model.vo; + +import com.xuxueli.crawler.annotation.PageFieldSelect; +import com.xuxueli.crawler.annotation.PageSelect; +import com.xuxueli.crawler.conf.XxlCrawlerConf; + +@PageSelect(cssQuery = "body") +public class CrawlerBookVo { + + @PageFieldSelect(cssQuery = ".con .pic img", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:src") + private String bookImage; + + public String getBookImage() { + return bookImage; + } + + public void setBookImage(String bookImage) { + this.bookImage = bookImage; + } +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java new file mode 100644 index 0000000..f2c223f --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/service/ICrawlerBookService.java @@ -0,0 +1,11 @@ +package com.tamguo.service; + +public interface ICrawlerBookService { + + /** + * 爬取书本数据 + */ + void crawlerBook(); + + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java new file mode 100644 index 0000000..3a92131 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java @@ -0,0 +1,62 @@ +package com.tamguo.service.impl; + +import com.tamguo.config.redis.CacheService; +import com.tamguo.dao.CrawlerBookMapper; +import com.tamguo.model.CrawlerBookEntity; +import com.tamguo.model.vo.CrawlerBookVo; +import com.tamguo.service.ICrawlerBookService; +import com.xuxueli.crawler.XxlCrawler; +import com.xuxueli.crawler.parser.PageParser; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +@Service +public class CrawlerBookService implements ICrawlerBookService { + + + @Autowired + CrawlerBookMapper crawlerBookMapper; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private static final String FILES_NO_FORMAT = "000000"; + private static final String FILES_PREFIX = "FPIMAGE"; + private static final String DOMAIN = "http://www.tamguo.com"; + @Autowired + CacheService cacheService; + + + //一年级语文上册 + @Override + public void crawlerBook() { + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls("http://www.ruiwen.com/jiaocai/") + .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjishangce/shangce\\d+\\.html") + .setAllowSpread(true) + .setFailRetryCount(5) + .setThreadCount(20) + .setPageParser(new PageParser() { + @Override + public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) { + // 解析封装 PageVo 对象 + String img = crawlerBookVo.getBookImage(); + if (StringUtils.isNoneBlank(img)) { + CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); + crawlerBookEntity.setBookUid("1019238600753074178"); + crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); + crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); + crawlerBookMapper.insert(crawlerBookEntity); + } + } + }).build(); + + // 获取科目 + crawler.start(true); + } + +} diff --git a/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java new file mode 100644 index 0000000..0a0c2da --- /dev/null +++ b/tamguo-crawler/src/test/java/com/tamguo/CrawlerBookCrawler.java @@ -0,0 +1,23 @@ +package com.tamguo; + +import com.tamguo.service.IBookService; +import com.tamguo.service.ICrawlerBookService; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.junit4.SpringRunner; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class CrawlerBookCrawler { + + @Autowired + ICrawlerBookService crawlerBookService; + + @Test + public void crawlerBook() throws Exception { + crawlerBookService.crawlerBook(); + } + +} From 4fe94c1cf0cf7a03e756a1f79aed343755130ad7 Mon Sep 17 00:00:00 2001 From: cff <302959274@qq.com> Date: Tue, 17 Jul 2018 23:43:13 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E4=B8=80=E5=B9=B4=E7=BA=A7=E8=AF=AD?= =?UTF-8?q?=E6=96=87=E4=B8=8A=E5=86=8C+=E4=B8=80=E5=B9=B4=E7=BA=A7?= =?UTF-8?q?=E8=AF=AD=E6=96=87=E4=B8=8B=E5=86=8C+=E4=B8=80=E5=B9=B4?= =?UTF-8?q?=E7=BA=A7=E8=8B=B1=E8=AF=AD=E4=B8=8A=E5=86=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/com/tamguo/model/vo/BookVo.java | 2 +- .../java/com/tamguo/service/impl/CrawlerBookService.java | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 19f9dcd..8d337c0 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -9,7 +9,7 @@ import java.util.List; @PageSelect(cssQuery = "body") public class BookVo { - @PageFieldSelect(cssQuery = ".text") + @PageFieldSelect(cssQuery = ".pic_right .text") private List name; public List getName() { diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java index 3a92131..4d54fe7 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java @@ -36,7 +36,9 @@ public class CrawlerBookService implements ICrawlerBookService { public void crawlerBook() { XxlCrawler crawler = new XxlCrawler.Builder() .setUrls("http://www.ruiwen.com/jiaocai/") - .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjishangce/shangce\\d+\\.html") + .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjishangce/shangce\\d+\\.html")//一年级语文上册 +// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级语文下册 +// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yingyu/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级英语上册 .setAllowSpread(true) .setFailRetryCount(5) .setThreadCount(20) @@ -47,7 +49,9 @@ public class CrawlerBookService implements ICrawlerBookService { String img = crawlerBookVo.getBookImage(); if (StringUtils.isNoneBlank(img)) { CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); - crawlerBookEntity.setBookUid("1019238600753074178"); + crawlerBookEntity.setBookUid("1019244094196551682");//一年级语文上册 +// crawlerBookEntity.setBookUid("1019244094704062466");//一年级语文下册 +// crawlerBookEntity.setBookUid("1019244096797020162");//一年级英语上册 crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); crawlerBookMapper.insert(crawlerBookEntity); From 90bccde83ee15c6efc12ee92baf718fa054458b6 Mon Sep 17 00:00:00 2001 From: sh00859 <302959274@qq.com> Date: Wed, 18 Jul 2018 11:18:26 +0800 Subject: [PATCH 6/7] =?UTF-8?q?=E9=80=BB=E8=BE=91=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/tamguo/model/BookEntity.java | 10 +++ .../main/java/com/tamguo/model/vo/BookVo.java | 32 +++++--- .../com/tamguo/service/impl/BookService.java | 19 +++-- .../service/impl/CrawlerBookService.java | 77 +++++++++++-------- 4 files changed, 84 insertions(+), 54 deletions(-) diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java index 8803011..fb6a2c5 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java @@ -27,6 +27,8 @@ public class BookEntity extends SuperEntity implements Serializable private Integer orders; + private String reserveField1; + public BookEntity() { } @@ -89,4 +91,12 @@ public class BookEntity extends SuperEntity implements Serializable public void setOrders(Integer orders) { this.orders = orders; } + + public String getReserveField1() { + return reserveField1; + } + + public void setReserveField1(String reserveField1) { + this.reserveField1 = reserveField1; + } } \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 8d337c0..c37ca2b 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -4,19 +4,29 @@ import com.xuxueli.crawler.annotation.PageFieldSelect; import com.xuxueli.crawler.annotation.PageSelect; import com.xuxueli.crawler.conf.XxlCrawlerConf; -import java.util.List; - -@PageSelect(cssQuery = "body") +@PageSelect(cssQuery = ".pic_right li") public class BookVo { - @PageFieldSelect(cssQuery = ".pic_right .text") - private List name; + @PageFieldSelect(cssQuery = ".text") + private String name; + + @PageFieldSelect(cssQuery = "a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private String bookUrl; + + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } - public List getName() { - return name; - } + public String getBookUrl() { + return bookUrl; + } - public void setName(List name) { - this.name = name; - } + public void setBookUrl(String bookUrl) { + this.bookUrl = bookUrl; + } } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index 4c467a1..2b27c9c 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -13,8 +13,6 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.List; - @Service public class BookService implements IBookService { @@ -39,14 +37,15 @@ public class BookService implements IBookService { String pageUrl = html.baseUri(); if (pageUrl.equals("http://www.ruiwen.com/jiaocai/")) { logger.info("开始解析书本信息:{}", pageUrl); - List books = bookVo.getName(); - books.forEach(item -> { - BookEntity bookEntity = new BookEntity(); - bookEntity.setName(item); - bookEntity.setQuestionNum(0); - bookEntity.setPointNum(0); - bookMapper.insert(bookEntity); - }); + String name = bookVo.getName(); + String url = bookVo.getBookUrl(); + + BookEntity bookEntity = new BookEntity(); + bookEntity.setName(name); + bookEntity.setReserveField1(url); + bookEntity.setQuestionNum(0); + bookEntity.setPointNum(0); + bookMapper.insert(bookEntity); } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java index 4d54fe7..2558157 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/CrawlerBookService.java @@ -1,7 +1,9 @@ package com.tamguo.service.impl; -import com.tamguo.config.redis.CacheService; +import com.baomidou.mybatisplus.mapper.Condition; +import com.tamguo.dao.BookMapper; import com.tamguo.dao.CrawlerBookMapper; +import com.tamguo.model.BookEntity; import com.tamguo.model.CrawlerBookEntity; import com.tamguo.model.vo.CrawlerBookVo; import com.tamguo.service.ICrawlerBookService; @@ -15,52 +17,61 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import java.util.List; + @Service public class CrawlerBookService implements ICrawlerBookService { @Autowired CrawlerBookMapper crawlerBookMapper; + @Autowired + BookMapper bookMapper; private Logger logger = LoggerFactory.getLogger(getClass()); - private static final String FILES_NO_FORMAT = "000000"; - private static final String FILES_PREFIX = "FPIMAGE"; - private static final String DOMAIN = "http://www.tamguo.com"; - @Autowired - CacheService cacheService; - - //一年级语文上册 @Override public void crawlerBook() { - XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("http://www.ruiwen.com/jiaocai/") - .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjishangce/shangce\\d+\\.html")//一年级语文上册 -// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yuwen/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级语文下册 -// .setWhiteUrlRegexs("http://www\\.ruiwen\\.com/jiaocai/yingyu/bubianban/yinianjixiace/xiace\\d+\\.html")//一年级英语上册 - .setAllowSpread(true) - .setFailRetryCount(5) - .setThreadCount(20) - .setPageParser(new PageParser() { - @Override - public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) { - // 解析封装 PageVo 对象 - String img = crawlerBookVo.getBookImage(); - if (StringUtils.isNoneBlank(img)) { - CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); - crawlerBookEntity.setBookUid("1019244094196551682");//一年级语文上册 -// crawlerBookEntity.setBookUid("1019244094704062466");//一年级语文下册 -// crawlerBookEntity.setBookUid("1019244096797020162");//一年级英语上册 - crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); - crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); - crawlerBookMapper.insert(crawlerBookEntity); + List bookEntities = bookMapper.selectList(Condition.EMPTY); + for (BookEntity bookEntity : bookEntities) { + String url = bookEntity.getReserveField1(); + String bookId = bookEntity.getUid(); + String regexs = url.replaceAll("\\d+", "\\\\d+").replaceAll("\\.","\\\\."); + + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls("http://www.ruiwen.com/jiaocai/") + .setWhiteUrlRegexs(regexs)// + .setAllowSpread(true) + .setFailRetryCount(5) + .setThreadCount(20) + .setPageParser(new PageParser() { + @Override + public void parse(Document html, Element pageVoElement, CrawlerBookVo crawlerBookVo) { + String pageUrl = html.baseUri(); + // 解析封装 PageVo 对象 + String img = crawlerBookVo.getBookImage(); + if (StringUtils.isNoneBlank(img)) { + CrawlerBookEntity crawlerBookEntity = new CrawlerBookEntity(); + crawlerBookEntity.setBookUid(bookId); + crawlerBookEntity.setBookUrl(crawlerBookVo.getBookImage()); + crawlerBookEntity.setOrders(Integer.parseInt(img.substring(img.lastIndexOf("/") + 1, img.lastIndexOf(".")))); + crawlerBookMapper.insert(crawlerBookEntity); + } + + } - } - }).build(); + }).build(); + + // 获取科目 + crawler.start(true); + } - // 获取科目 - crawler.start(true); } +// public static void main(String[] args) { +// String url = "http://www.ruiwen.com/jiaocai/yuwen/renjiaoban/yinianjishangce/shangce1.html"; +// System.out.println(url.replaceAll("\\d+", "\\\\d+").replaceAll("\\.","\\\\.")); +// } + } From ab57a9ceb4eb4ae3bb246f22791fadd2f7c78441 Mon Sep 17 00:00:00 2001 From: sh00859 <302959274@qq.com> Date: Wed, 18 Jul 2018 17:39:39 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E7=AB=A0=E8=8A=82=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/tamguo/model/vo/ChapterVo.java | 32 +++++++++++ .../com/tamguo/service/impl/BookService.java | 56 ++++++++++++------- 2 files changed, 67 insertions(+), 21 deletions(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java new file mode 100644 index 0000000..70378fc --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/ChapterVo.java @@ -0,0 +1,32 @@ +package com.tamguo.model.vo; + +import com.xuxueli.crawler.annotation.PageFieldSelect; +import com.xuxueli.crawler.annotation.PageSelect; + +import java.util.List; + +@PageSelect(cssQuery = ".out-chapter") +public class ChapterVo { + + @PageFieldSelect(cssQuery = "h3") + private String name; + + @PageFieldSelect(cssQuery = ".out-list li") + private List sonChapters; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getSonChapters() { + return sonChapters; + } + + public void setSonChapters(List sonChapters) { + this.sonChapters = sonChapters; + } +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index 2b27c9c..4d37f18 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -1,8 +1,8 @@ package com.tamguo.service.impl; -import com.tamguo.dao.BookMapper; -import com.tamguo.model.BookEntity; -import com.tamguo.model.vo.BookVo; +import com.tamguo.dao.ChapterMapper; +import com.tamguo.model.ChapterEntity; +import com.tamguo.model.vo.ChapterVo; import com.tamguo.service.IBookService; import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.parser.PageParser; @@ -13,12 +13,15 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import java.util.List; +import java.util.UUID; + @Service public class BookService implements IBookService { @Autowired - BookMapper bookMapper; + ChapterMapper chapterMapper; private Logger logger = LoggerFactory.getLogger(getClass()); @@ -26,33 +29,44 @@ public class BookService implements IBookService { @Override public void crawlerBook() { XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("http://www.ruiwen.com/jiaocai/") + .setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-27-jiaocai-11") .setAllowSpread(false) .setFailRetryCount(5) .setThreadCount(20) - .setPageParser(new PageParser() { + .setPageParser(new PageParser() { @Override - public void parse(Document html, Element pageVoElement, BookVo bookVo) { + public void parse(Document html, Element pageVoElement, ChapterVo chapterVo) { // 解析封装 PageVo 对象 - String pageUrl = html.baseUri(); - if (pageUrl.equals("http://www.ruiwen.com/jiaocai/")) { - logger.info("开始解析书本信息:{}", pageUrl); - String name = bookVo.getName(); - String url = bookVo.getBookUrl(); - - BookEntity bookEntity = new BookEntity(); - bookEntity.setName(name); - bookEntity.setReserveField1(url); - bookEntity.setQuestionNum(0); - bookEntity.setPointNum(0); - bookMapper.insert(bookEntity); - } + String parentName = chapterVo.getName(); + ChapterEntity chapterEntity = new ChapterEntity(); + String uid = UUID.randomUUID().toString().replace("-", ""); + chapterEntity.setUid(uid); + chapterEntity.setName(parentName); + chapterEntity.setCourseId("0"); + chapterEntity.setCourseId("0"); + chapterEntity.setParentId("-1"); + chapterEntity.setQuestionNum(0); + chapterEntity.setPointNum(0); + chapterMapper.insert(chapterEntity); + List sonChapters = chapterVo.getSonChapters(); + sonChapters.forEach(s -> { + ChapterEntity sonChapterEntity = new ChapterEntity(); + sonChapterEntity.setName(s); + sonChapterEntity.setCourseId("0"); + sonChapterEntity.setCourseId("0"); + sonChapterEntity.setParentId(uid); + sonChapterEntity.setQuestionNum(0); + sonChapterEntity.setPointNum(0); + chapterMapper.insert(sonChapterEntity); + }); } + + +// } }).build(); -// runData = crawler.getRunData(); // 获取科目 crawler.start(true); }