From b1a9cdfc89f3d8217b1c5e144cb7c89bdd8a1ba1 Mon Sep 17 00:00:00 2001 From: cff <302959274@qq.com> Date: Tue, 17 Jul 2018 22:05:01 +0800 Subject: [PATCH] =?UTF-8?q?tiku=5Fbook=E7=AE=A1=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/com/tamguo/dao/BookMapper.java | 8 + .../java/com/tamguo/model/BookEntity.java | 92 ++++++++++ .../main/java/com/tamguo/model/vo/BookVo.java | 166 +----------------- .../com/tamguo/service/impl/BookService.java | 64 ++----- 4 files changed, 116 insertions(+), 214 deletions(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java new file mode 100644 index 0000000..2fcf33d --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/BookMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.BookEntity; + +public interface BookMapper extends SuperMapper { + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java new file mode 100644 index 0000000..0d3967b --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/BookEntity.java @@ -0,0 +1,92 @@ +package com.tamguo.model; + +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + +import java.io.Serializable; + +/** + * The persistent class for the tiku_book database table.
+ */ +@TableName(value = "tiku_book") +public class BookEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + + private String subjectId; + + private String courseId; + + private String name; + + private String publishingHouse; + + private Integer questionNum; + + private Integer pointNum; + + private Integer orders; + + public BookEntity() { + } + + public static long getSerialVersionUID() { + return serialVersionUID; + } + + public String getSubjectId() { + return subjectId; + } + + public void setSubjectId(String subjectId) { + this.subjectId = subjectId; + } + + public String getCourseId() { + return courseId; + } + + public void setCourseId(String courseId) { + this.courseId = courseId; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getPublishingHouse() { + return publishingHouse; + } + + public void setPublishingHouse(String publishingHouse) { + this.publishingHouse = publishingHouse; + } + + public Integer getQuestionNum() { + return questionNum; + } + + public void setQuestionNum(Integer questionNum) { + this.questionNum = questionNum; + } + + public Integer getPointNum() { + return pointNum; + } + + public void setPointNum(Integer pointNum) { + this.pointNum = pointNum; + } + + public Integer getOrders() { + return orders; + } + + public void setOrders(Integer orders) { + this.orders = orders; + } +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java index 2dc8257..9a80c74 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java @@ -9,59 +9,8 @@ import java.util.List; @PageSelect(cssQuery = "body") public class BookVo { - @PageFieldSelect(cssQuery = ".all-list-li") + @PageFieldSelect(cssQuery = ".ih3") private List name; - 
- // 类型名称 - @PageFieldSelect(cssQuery=".submenu-contain .contain-title") - private String subjectName; - - // 科目信息 - @PageFieldSelect(cssQuery=".course-list-container .course-list .course-item") - private List courseName; - - // 带采集的科目URLs - @PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List courseUrls; - - - @PageFieldSelect(cssQuery=".screening .selected a") - private String chapterPageCourseName; - - @PageFieldSelect(cssQuery=".screening .selected a") - private String chapterCurrName; - - // 带采集的章节URLs缓存 - @PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List chapterUrlsTemp; - - // 待采集的章节URLs - @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List chapterUrls; - - // 待采集的问题URLs - @PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List questionUrlsTemp; - - // 待采集问题URLs - @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") - private List questionUrls; - - // 单个题目数据 - @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) - private String content; - - @PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML) - private List answer; - - @PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML) - private String analysis; - - @PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT) - private String questionType; - - @PageFieldSelect(cssQuery=".que-title 
span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT) - private String score; public List getName() { return name; @@ -70,117 +19,4 @@ public class BookVo { public void setName(List name) { this.name = name; } - - public List getCourseName() { - return courseName; - } - - public void setCourseName(List courseName) { - this.courseName = courseName; - } - - public String getSubjectName() { - return subjectName; - } - - public void setSubjectName(String subjectName) { - this.subjectName = subjectName; - } - - public List getCourseUrls() { - return courseUrls; - } - - public void setCourseUrls(List courseUrls) { - this.courseUrls = courseUrls; - } - - public List getChapterUrls() { - return chapterUrls; - } - - public void setChapterUrls(List chapterUrls) { - this.chapterUrls = chapterUrls; - } - - public List getChapterUrlsTemp() { - return chapterUrlsTemp; - } - - public void setChapterUrlsTemp(List chapterUrlsTemp) { - this.chapterUrlsTemp = chapterUrlsTemp; - } - - public String getChapterPageCourseName() { - return chapterPageCourseName; - } - - public void setChapterPageCourseName(String chapterPageCourseName) { - this.chapterPageCourseName = chapterPageCourseName; - } - - public String getChapterCurrName() { - return chapterCurrName; - } - - public void setChapterCurrName(String chapterCurrName) { - this.chapterCurrName = chapterCurrName; - } - - public List getQuestionUrlsTemp() { - return questionUrlsTemp; - } - - public void setQuestionUrlsTemp(List questionUrlsTemp) { - this.questionUrlsTemp = questionUrlsTemp; - } - - public List getQuestionUrls() { - return questionUrls; - } - - public void setQuestionUrls(List questionUrls) { - this.questionUrls = questionUrls; - } - - public String getAnalysis() { - return analysis; - } - - public void setAnalysis(String analysis) { - this.analysis = analysis; - } - - public String getContent() { - return content; - } - - public void setContent(String content) { - this.content = content; - } - - public String 
getQuestionType() { - return questionType; - } - - public void setQuestionType(String questionType) { - this.questionType = questionType; - } - - public String getScore() { - return score; - } - - public void setScore(String score) { - this.score = score; - } - - public List getAnswer() { - return answer; - } - - public void setAnswer(List answer) { - this.answer = answer; - } - } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index c5b554f..5459b7f 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -1,14 +1,11 @@ package com.tamguo.service.impl; -import com.tamguo.dao.ChapterMapper; -import com.tamguo.dao.CourseMapper; -import com.tamguo.dao.CrawlerQuestionMapper; -import com.tamguo.dao.SubjectMapper; +import com.tamguo.dao.BookMapper; +import com.tamguo.model.BookEntity; import com.tamguo.model.vo.BookVo; import com.tamguo.service.IBookService; import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.parser.PageParser; -import com.xuxueli.crawler.rundata.RunData; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; @@ -16,33 +13,17 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.List; @Service public class BookService implements IBookService { @Autowired - SubjectMapper subjectMapper; - @Autowired - CourseMapper courseMapper; - @Autowired - ChapterMapper chapterMapper; - @Autowired - CrawlerQuestionMapper crawlerQuestionMapper; + BookMapper bookMapper; private Logger logger = LoggerFactory.getLogger(getClass()); - private Set urls = new HashSet<>(); - - private Set questionUrls = new 
HashSet(); - - private Map chapterQuestionListMap = new HashMap<>(); - - private RunData runData; @Override public void crawlerBook() { @@ -56,38 +37,23 @@ public class BookService implements IBookService { public void parse(Document html, Element pageVoElement, BookVo bookVo) { // 解析封装 PageVo 对象 String pageUrl = html.baseUri(); - if (pageUrl.equals("https://tiku.baidu.com/")) { - logger.info("开始解析考试分类:{}", pageUrl); -// for (int i = 0; i < subjectVo.getName().size(); i++) { -// String name = subjectVo.getName().get(i); -// -// SubjectEntity subject = subjectMapper.findByName(name); -// if (subject != null) { -// continue; -// } -// SubjectEntity entity = new SubjectEntity(); -// if (name.equals("高考")) { -// name = "高考"; -// entity.setName(name); -// subjectMapper.insert(entity); -// // 加入科目爬取数据 -// for (String url : subjectVo.getCourseUrls()) { -// runData.addUrl(url); -// } -// -// } -// -// -// } - - + if (pageUrl.equals("http://www.dzkbw.com")) { + logger.info("开始解析书本信息:{}", pageUrl); + List<String> books = bookVo.getName(); + books.forEach(item -> { + BookEntity bookEntity = new BookEntity(); + bookEntity.setName(item); + bookEntity.setQuestionNum(0); + bookEntity.setPointNum(0); + bookMapper.insert(bookEntity); + }); } } }).build(); - runData = crawler.getRunData(); +// runData = crawler.getRunData(); // 获取科目 crawler.start(true); }