From 99406fab0693f22b98489ea9d4ed8c5ef5660672 Mon Sep 17 00:00:00 2001 From: tamguo Date: Thu, 5 Jul 2018 09:00:12 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E5=8F=96=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tamguo-crawler/pom.xml | 11 +++ .../com/tamguo/dao/CrawlerQuestionMapper.java | 9 +++ .../tamguo/model/CrawlerQuestionEntity.java | 56 +++++++++++++ .../java/com/tamguo/model/vo/SubjectVo.java | 80 +++++++++++++++++++ .../tamguo/service/impl/SubjectService.java | 65 ++++++++++++--- .../mappers/CrawlerQuestionMapper.xml | 6 ++ tamguo-crawler/src/main/resources/redis.xml | 9 +++ 7 files changed, 224 insertions(+), 12 deletions(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java create mode 100644 tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml create mode 100644 tamguo-crawler/src/main/resources/redis.xml diff --git a/tamguo-crawler/pom.xml b/tamguo-crawler/pom.xml index 1c82634..fb8ad7e 100644 --- a/tamguo-crawler/pom.xml +++ b/tamguo-crawler/pom.xml @@ -70,6 +70,17 @@ xxl-crawler 1.2.1 + + + net.sourceforge.htmlunit + htmlunit + provided + + + redis.clients + jedis + 2.9.0 + diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java new file mode 100644 index 0000000..1c645f6 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java @@ -0,0 +1,9 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.CrawlerQuestionEntity; + +public interface CrawlerQuestionMapper extends SuperMapper{ + + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java new file mode 100644 index 0000000..c1eba61 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java @@ -0,0 +1,56 @@ +package com.tamguo.model; + +import java.io.Serializable; +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + + + +/** + * The persistent class for the crawler_question database table. + * + */ +@TableName(value="crawler_question") +public class CrawlerQuestionEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + private String questionUrl; + + private String chapterId; + + private String status; + + public String getQuestionUrl() { + return questionUrl; + } + + + + public void setQuestionUrl(String questionUrl) { + this.questionUrl = questionUrl; + } + + + + public String getChapterId() { + return chapterId; + } + + public void setChapterId(String chapterId) { + this.chapterId = chapterId; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + + + public static long getSerialversionuid() { + return serialVersionUID; + } +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java index df3c05f..ac0be5f 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java @@ -39,6 +39,30 @@ public class SubjectVo { @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") private List chapterUrls; + // 待采集的问题URLs + @PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List questionUrlsTemp; + + // 待采集问题URLs + @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List questionUrls; + + // 单个题目数据 + @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) + private String content; + + @PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML) + private List answer; + + @PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML) + private String analysis; + + @PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT) + private String questionType; + + @PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT) + private String score; + public List getName() { return name; } @@ -103,4 +127,60 @@ public class SubjectVo { this.chapterCurrName = chapterCurrName; } + public List getQuestionUrlsTemp() { + return questionUrlsTemp; + } + + public void setQuestionUrlsTemp(List questionUrlsTemp) { + this.questionUrlsTemp = questionUrlsTemp; + } + + public List getQuestionUrls() { + return questionUrls; + } + + public void setQuestionUrls(List questionUrls) { + this.questionUrls = questionUrls; + } + + public String getAnalysis() { + return analysis; + } + + public void setAnalysis(String analysis) { + this.analysis = analysis; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getQuestionType() { + return questionType; + } + + public void setQuestionType(String questionType) { + this.questionType = questionType; + } + + public String getScore() { + return score; + } + + public void setScore(String score) { + this.score = score; + } + + public List getAnswer() { + return answer; + } + + public void setAnswer(List answer) { + this.answer = answer; + } + } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java index 06d7c24..6ef7ff3 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java @@ -1,8 +1,10 @@ package com.tamguo.service.impl; import java.math.BigInteger; -import java.util.ArrayList; -import java.util.List; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -14,9 +16,11 @@ import org.springframework.stereotype.Service; import com.tamguo.dao.ChapterMapper; import com.tamguo.dao.CourseMapper; +import com.tamguo.dao.CrawlerQuestionMapper; import com.tamguo.dao.SubjectMapper; import com.tamguo.model.ChapterEntity; import com.tamguo.model.CourseEntity; +import com.tamguo.model.CrawlerQuestionEntity; import com.tamguo.model.SubjectEntity; import com.tamguo.model.vo.SubjectVo; import com.tamguo.service.ISubjectService; @@ -33,20 +37,24 @@ public class SubjectService implements ISubjectService{ CourseMapper courseMapper; @Autowired ChapterMapper chapterMapper; + @Autowired + CrawlerQuestionMapper crawlerQuestionMapper; private Logger logger = LoggerFactory.getLogger(getClass()); - private List urls = new ArrayList<>(); + private Set urls = new HashSet<>(); + + private Set questionUrls = new HashSet(); + private Map chapterQuestionListMap = new HashMap<>(); + private RunData runData; @Override public void crawlerSubject() { XxlCrawler crawler = new XxlCrawler.Builder() .setUrls("https://tiku.baidu.com/") - .setWhiteUrlRegexs("https://tiku.baidu.com/tikupc/homepage/\\w+","https://tiku.baidu.com/tikupc/homepage/\\w+" - , "https://tiku.baidu.com/" - , "https://tiku.baidu.com/tikupc/chapterlist/.*") + .setAllowSpread(false) .setPageParser(new PageParser() { @Override @@ -75,7 +83,6 @@ public class SubjectService implements ISubjectService{ runData.addUrl(url); } } - if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) { logger.info("开始解析科目分类:{}" , pageUrl); for(int i=0 ; i 0) { + String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href"); + questionUrl = questionUrl.replace("1-5", "1-1000"); + chapterQuestionListMap.put(questionUrl, chapter2); + + runData.addUrl(questionUrl); + } } } } @@ -185,9 +200,8 @@ public class SubjectService implements ISubjectService{ // 剔除已经爬取的数据 urls.add(pageUrl); - // 加入科目爬取数据 for(String url : subjectVo.getChapterUrls()) { - if(url.equals("https://tiku.baidu.com"+pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("href"))) { + if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) { continue; } if(!urls.contains(url)) { @@ -195,13 +209,40 @@ public class SubjectService implements ISubjectService{ } } } + + if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) { + // 加入待解析题目列表 + for(String questionUrl : subjectVo.getQuestionUrls()) { + if(!questionUrls.contains(questionUrl)) { + // 处理URL + // runData.addUrl(questionUrl); + questionUrls.add(questionUrl); + + ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl); + + CrawlerQuestionEntity condition = new CrawlerQuestionEntity(); + condition.setQuestionUrl(questionUrl); + if(crawlerQuestionMapper.selectOne(condition) == null) { + CrawlerQuestionEntity crawlerQuestion = new CrawlerQuestionEntity(); + crawlerQuestion.setQuestionUrl(questionUrl); + crawlerQuestion.setChapterId(chapterEntity.getUid()); + crawlerQuestion.setStatus("0"); + crawlerQuestionMapper.insert(crawlerQuestion); + }else { + logger.info(questionUrl+"已经爬取"); + } + } + } + } + + /*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) { + ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl); + System.out.println(chapterEntity); + }*/ } }).build(); runData = crawler.getRunData(); - - - // 获取科目 crawler.start(true); } diff --git a/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml b/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml new file mode 100644 index 0000000..4c97165 --- /dev/null +++ b/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/tamguo-crawler/src/main/resources/redis.xml b/tamguo-crawler/src/main/resources/redis.xml new file mode 100644 index 0000000..4055e0d --- /dev/null +++ b/tamguo-crawler/src/main/resources/redis.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file