From 0e5c1cd6915d7ad5cef33b5edd40c071c61e064c Mon Sep 17 00:00:00 2001 From: tamguo Date: Sat, 14 Jul 2018 21:58:37 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/tamguo/dao/CrawlerChapterMapper.java | 8 ++ .../tamguo/model/CrawlerChapterEntity.java | 50 +++++++++ .../com/tamguo/service/IChapterService.java | 5 + .../com/tamguo/service/ISubjectService.java | 1 + .../tamguo/service/impl/ChapterService.java | 100 ++++++++++++++++++ .../mappers/CrawlerChapterMapper.xml | 5 + .../test/java/com/tamguo/ChapterCrawler.java | 23 ++++ 7 files changed, 192 insertions(+) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerChapterMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/CrawlerChapterEntity.java create mode 100644 tamguo-crawler/src/main/resources/mappers/CrawlerChapterMapper.xml create mode 100644 tamguo-crawler/src/test/java/com/tamguo/ChapterCrawler.java diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerChapterMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerChapterMapper.java new file mode 100644 index 0000000..9fe25ea --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerChapterMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.CrawlerChapterEntity; + +public interface CrawlerChapterMapper extends SuperMapper{ + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerChapterEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerChapterEntity.java new file mode 100644 index 0000000..aa90258 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerChapterEntity.java @@ -0,0 +1,50 @@ +package com.tamguo.model; + +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + +@TableName(value="crawler_chapter") +public class CrawlerChapterEntity extends SuperEntity{ + + private static final long serialVersionUID = 1L; + + private String chapterUid; + + private String courseUid; + + private String chapterUrl; + + private String subjectUid; + + public String getChapterUid() { + return chapterUid; + } + + public void setChapterUid(String chapterUid) { + this.chapterUid = chapterUid; + } + + public String getCourseUid() { + return courseUid; + } + + public void setCourseUid(String courseUid) { + this.courseUid = courseUid; + } + + public String getChapterUrl() { + return chapterUrl; + } + + public void setChapterUrl(String chapterUrl) { + this.chapterUrl = chapterUrl; + } + + public String getSubjectUid() { + return subjectUid; + } + + public void setSubjectUid(String subjectUid) { + this.subjectUid = subjectUid; + } +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/IChapterService.java b/tamguo-crawler/src/main/java/com/tamguo/service/IChapterService.java index 1c4a500..332a810 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/IChapterService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/IChapterService.java @@ -6,4 +6,9 @@ public interface IChapterService { * 修改章节数量 */ void modifyQuestionNum(); + + /** + * 爬取章节数据 + */ + void crawlerChapter(); } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/ISubjectService.java b/tamguo-crawler/src/main/java/com/tamguo/service/ISubjectService.java index b6ad7ea..c5500b3 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/ISubjectService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/ISubjectService.java @@ -6,5 +6,6 @@ public interface ISubjectService { * 爬取考试数据 */ void crawlerSubject(); + } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/ChapterService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/ChapterService.java index faefc11..b021ba6 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/ChapterService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/ChapterService.java @@ -1,20 +1,52 @@ package com.tamguo.service.impl; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import com.alibaba.fastjson.JSONObject; import com.baomidou.mybatisplus.plugins.Page; import com.tamguo.dao.ChapterMapper; +import com.tamguo.dao.CourseMapper; +import com.tamguo.dao.CrawlerChapterMapper; +import com.tamguo.dao.CrawlerQuestionMapper; +import com.tamguo.dao.SubjectMapper; import com.tamguo.model.ChapterEntity; +import com.tamguo.model.CourseEntity; +import com.tamguo.model.CrawlerChapterEntity; +import com.tamguo.model.vo.SubjectVo; import com.tamguo.service.IChapterService; +import com.xuxueli.crawler.XxlCrawler; +import com.xuxueli.crawler.parser.PageParser; +import com.xuxueli.crawler.rundata.RunData; @Service public class ChapterService implements IChapterService{ + @Autowired + SubjectMapper subjectMapper; + @Autowired + CourseMapper courseMapper; @Autowired ChapterMapper chapterMapper; + @Autowired + CrawlerQuestionMapper crawlerQuestionMapper; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private Set urls = new HashSet<>(); + + private RunData runData; + + @Autowired + private CrawlerChapterMapper crawlerChapterMapper; @Override public void modifyQuestionNum() { @@ -37,4 +69,72 @@ public class ChapterService implements IChapterService{ } } + @Override + public void crawlerChapter() { + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls("https://tiku.baidu.com/") + .setAllowSpread(false) + .setFailRetryCount(5) + .setThreadCount(20) + .setPageParser(new PageParser() { + + @Override + public void parse(Document html, Element pageVoElement, SubjectVo subjectVo) { + // 解析封装 PageVo 对象 + String pageUrl = html.baseUri(); + if(pageUrl.equals("https://tiku.baidu.com/")) { + logger.info("开始解析考试分类:{}" , pageUrl); + // 加入科目爬取数据 + for(String url : subjectVo.getCourseUrls()) { + runData.addUrl(url); + } + } + if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) { + logger.info("开始解析科目分类:{}" , pageUrl); + // 加入科目爬取数据 + for(String url : subjectVo.getChapterUrlsTemp()) { + runData.addUrl(url); + } + } + + if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterlist/")) { + logger.info("开始解析章节:{}" , pageUrl); + ChapterEntity chapterCondition = new ChapterEntity(); + chapterCondition.setName(subjectVo.getChapterCurrName()); + ChapterEntity chapterEntity = chapterMapper.selectOne(chapterCondition); + CourseEntity course = courseMapper.selectById(chapterEntity.getCourseId()); + + CrawlerChapterEntity crawlerChapter = new CrawlerChapterEntity(); + crawlerChapter.setChapterUid(chapterEntity.getUid()); + crawlerChapter.setChapterUrl(pageUrl); + crawlerChapter.setCourseUid(chapterEntity.getCourseId()); + crawlerChapter.setSubjectUid(course.getSubjectId()); + crawlerChapterMapper.insert(crawlerChapter); + + logger.info("url:{}" ,pageUrl ); + logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) ); + if(subjectVo.getChapterUrls() != null) { + for(String url : subjectVo.getChapterUrls()) { + if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) { + continue; + } + if(!urls.contains(url)) { + crawlerChapter = new CrawlerChapterEntity(); + crawlerChapter.setChapterUid(chapterEntity.getUid()); + crawlerChapter.setChapterUrl(url); + crawlerChapter.setCourseUid(chapterEntity.getCourseId()); + crawlerChapter.setSubjectUid(course.getSubjectId()); + crawlerChapterMapper.insert(crawlerChapter); + } + } + } + } + } + }).build(); + + runData = crawler.getRunData(); + // 获取科目 + crawler.start(true); + } + } diff --git a/tamguo-crawler/src/main/resources/mappers/CrawlerChapterMapper.xml b/tamguo-crawler/src/main/resources/mappers/CrawlerChapterMapper.xml new file mode 100644 index 0000000..745dfe7 --- /dev/null +++ b/tamguo-crawler/src/main/resources/mappers/CrawlerChapterMapper.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/tamguo-crawler/src/test/java/com/tamguo/ChapterCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/ChapterCrawler.java new file mode 100644 index 0000000..ff1185f --- /dev/null +++ b/tamguo-crawler/src/test/java/com/tamguo/ChapterCrawler.java @@ -0,0 +1,23 @@ +package com.tamguo; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.junit4.SpringRunner; + +import com.tamguo.service.IChapterService; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class ChapterCrawler { + + @Autowired + IChapterService iChapterService; + + @Test + public void crawlerChapter() throws Exception { + iChapterService.crawlerChapter(); + } + +}