From d328257550a313cad484585e8d55cd224ac7ee1d Mon Sep 17 00:00:00 2001
From: sh00859 <302959274@qq.com>
Date: Tue, 17 Jul 2018 19:17:23 +0800
Subject: [PATCH] 123

---
 .../main/java/com/tamguo/model/vo/BookVo.java | 186 ++++++++++++++++++
 .../java/com/tamguo/service/IBookService.java |  11 ++
 .../com/tamguo/service/impl/BookService.java  |  95 +++++++++
 .../src/test/java/com/tamguo/BookCrawler.java |  23 +++
 4 files changed, 315 insertions(+)
 create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java
 create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java
 create mode 100644 tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java
 create mode 100644 tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java

diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java
new file mode 100644
index 0000000..2dc8257
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/BookVo.java
@@ -0,0 +1,186 @@
+package com.tamguo.model.vo;
+
+import com.xuxueli.crawler.annotation.PageFieldSelect;
+import com.xuxueli.crawler.annotation.PageSelect;
+import com.xuxueli.crawler.conf.XxlCrawlerConf;
+
+import java.util.List;
+
+@PageSelect(cssQuery = "body")
+public class BookVo { // Page VO: every field carries a @PageFieldSelect CSS query (presumably filled in by xxl-crawler - confirm)
+
+    @PageFieldSelect(cssQuery = ".all-list-li")
+    private List name;
+
+    // Type (category) name
+    @PageFieldSelect(cssQuery=".submenu-contain .contain-title")
+    private String subjectName;
+
+    // Course information
+    @PageFieldSelect(cssQuery=".course-list-container .course-list .course-item")
+    private List courseName;
+
+    // Course URLs to be crawled
+    @PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List courseUrls;
+
+
+    @PageFieldSelect(cssQuery=".screening .selected a")
+    private String chapterPageCourseName;
+
+    @PageFieldSelect(cssQuery=".screening .selected a")
+    private String chapterCurrName; // NOTE(review): same cssQuery as chapterPageCourseName - confirm this is intended
+
+    // Temporary cache of chapter URLs to be crawled
+    @PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List chapterUrlsTemp;
+
+    // Chapter URLs to be crawled
+    @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List chapterUrls;
+
+    // Question URLs to be crawled (temporary list)
+    @PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List questionUrlsTemp;
+
+    // Question URLs to be crawled
+    @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List questionUrls;
+
+    // Data of a single question
+    @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML)
+    private String content;
+
+    @PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML)
+    private List answer;
+
+    @PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML)
+    private String analysis;
+
+    @PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT)
+    private String questionType;
+
+    @PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT)
+    private String score;
+
+    public List getName() {
+        return name;
+    }
+
+    public void setName(List name) {
+        this.name = name;
+    }
+
+    public List getCourseName() {
+        return courseName;
+    }
+
+    public void setCourseName(List courseName) {
+        this.courseName = courseName;
+    }
+
+    public String getSubjectName() {
+        return subjectName;
+    }
+
+    public void setSubjectName(String subjectName) {
+        this.subjectName = subjectName;
+    }
+
+    public List getCourseUrls() {
+        return courseUrls;
+    }
+
+    public void setCourseUrls(List courseUrls) {
+        this.courseUrls = courseUrls;
+    }
+
+    public List getChapterUrls() {
+        return chapterUrls;
+    }
+
+    public void setChapterUrls(List chapterUrls) {
+        this.chapterUrls = chapterUrls;
+    }
+
+    public List getChapterUrlsTemp() {
+        return chapterUrlsTemp;
+    }
+
+    public void setChapterUrlsTemp(List chapterUrlsTemp) {
+        this.chapterUrlsTemp = chapterUrlsTemp;
+    }
+
+    public String getChapterPageCourseName() {
+        return chapterPageCourseName;
+    }
+
+    public void setChapterPageCourseName(String chapterPageCourseName) {
+        this.chapterPageCourseName = chapterPageCourseName;
+    }
+
+    public String getChapterCurrName() {
+        return chapterCurrName;
+    }
+
+    public void setChapterCurrName(String chapterCurrName) {
+        this.chapterCurrName = chapterCurrName;
+    }
+
+    public List getQuestionUrlsTemp() {
+        return questionUrlsTemp;
+    }
+
+    public void setQuestionUrlsTemp(List questionUrlsTemp) {
+        this.questionUrlsTemp = questionUrlsTemp;
+    }
+
+    public List getQuestionUrls() {
+        return questionUrls;
+    }
+
+    public void setQuestionUrls(List questionUrls) {
+        this.questionUrls = questionUrls;
+    }
+
+    public String getAnalysis() {
+        return analysis;
+    }
+
+    public void setAnalysis(String analysis) {
+        this.analysis = analysis;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getQuestionType() {
+        return questionType;
+    }
+
+    public void setQuestionType(String questionType) {
+        this.questionType = questionType;
+    }
+
+    public String getScore() {
+        return score;
+    }
+
+    public void setScore(String score) {
+        this.score = score;
+    }
+
+    public List getAnswer() {
+        return answer;
+    }
+
+    public void setAnswer(List answer) {
+        this.answer = answer;
+    }
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java
new file mode 100644
index 0000000..3aeae12
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/IBookService.java
@@ -0,0 +1,11 @@
+package com.tamguo.service;
+
+public interface IBookService {
+
+    /**
+     * Crawls book data.
+     */
+    void crawlerBook();
+
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java
new file mode 100644
index 0000000..c5b554f
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java
@@ -0,0 +1,95 @@
+package com.tamguo.service.impl;
+
+import com.tamguo.dao.ChapterMapper;
+import com.tamguo.dao.CourseMapper;
+import com.tamguo.dao.CrawlerQuestionMapper;
+import com.tamguo.dao.SubjectMapper;
+import com.tamguo.model.vo.BookVo;
+import com.tamguo.service.IBookService;
+import com.xuxueli.crawler.XxlCrawler;
+import com.xuxueli.crawler.parser.PageParser;
+import com.xuxueli.crawler.rundata.RunData;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+@Service
+public class BookService implements IBookService { // Builds and starts an XxlCrawler whose parser receives BookVo pages
+
+
+    @Autowired
+    SubjectMapper subjectMapper;
+    @Autowired
+    CourseMapper courseMapper;
+    @Autowired
+    ChapterMapper chapterMapper;
+    @Autowired
+    CrawlerQuestionMapper crawlerQuestionMapper;
+
+    private Logger logger = LoggerFactory.getLogger(getClass());
+
+    private Set urls = new HashSet<>(); // NOTE(review): never referenced elsewhere in this class
+
+    private Set questionUrls = new HashSet(); // NOTE(review): never referenced elsewhere in this class
+
+    private Map chapterQuestionListMap = new HashMap<>(); // NOTE(review): never referenced elsewhere in this class
+
+    private RunData runData; // assigned in crawlerBook(); only the commented-out code in parse() reads it
+
+    @Override
+    public void crawlerBook() {
+        XxlCrawler crawler = new XxlCrawler.Builder()
+                .setUrls("http://www.dzkbw.com")
+                .setAllowSpread(false)
+                .setFailRetryCount(5)
+                .setThreadCount(20)
+                .setPageParser(new PageParser() {
+                    @Override
+                    public void parse(Document html, Element pageVoElement, BookVo bookVo) {
+                        // Parse and populate the PageVo object. NOTE(review): the URL compared below is a different site from the seed URL passed to setUrls() above - confirm which one is intended.
+                        String pageUrl = html.baseUri();
+                        if (pageUrl.equals("https://tiku.baidu.com/")) {
+                            logger.info("开始解析考试分类:{}", pageUrl);
+//                            for (int i = 0; i < subjectVo.getName().size(); i++) {
+//                                String name = subjectVo.getName().get(i);
+//
+//                                SubjectEntity subject = subjectMapper.findByName(name);
+//                                if (subject != null) {
+//                                    continue;
+//                                }
+//                                SubjectEntity entity = new SubjectEntity();
+//                                if (name.equals("高考")) {
+//                                    name = "高考";
+//                                    entity.setName(name);
+//                                    subjectMapper.insert(entity);
+//                                    // Add the course URLs to the crawl data
+//                                    for (String url : subjectVo.getCourseUrls()) {
+//                                        runData.addUrl(url);
+//                                    }
+//
+//                                }
+//
+//
+//                            }
+
+
+                        }
+
+
+                    }
+                }).build();
+
+        runData = crawler.getRunData();
+        // Fetch the subjects (start(true) presumably runs the crawl synchronously - confirm against xxl-crawler)
+        crawler.start(true);
+    }
+
+}
diff --git a/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java
new file mode 100644
index 0000000..7b496bf
--- /dev/null
+++ b/tamguo-crawler/src/test/java/com/tamguo/BookCrawler.java
@@ -0,0 +1,23 @@
+package com.tamguo;
+
+import com.tamguo.service.IBookService;
+import com.tamguo.service.ISubjectService; // NOTE(review): not used anywhere in this test class
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit4.SpringRunner;
+
+@RunWith(SpringRunner.class)
+@SpringBootTest
+public class BookCrawler { // Spring Boot test that simply invokes IBookService.crawlerBook(); it makes no assertions
+
+    @Autowired
+    IBookService bookService;
+
+    @Test
+    public void crawlerBook() throws Exception {
+        bookService.crawlerBook();
+    }
+
+}