diff --git a/tamguo-crawler/pom.xml b/tamguo-crawler/pom.xml
index 1c82634..fb8ad7e 100644
--- a/tamguo-crawler/pom.xml
+++ b/tamguo-crawler/pom.xml
@@ -70,6 +70,17 @@
xxl-crawler
1.2.1
+
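+		<dependency>
+			<groupId>net.sourceforge.htmlunit</groupId>
+			<artifactId>htmlunit</artifactId>
+			<scope>provided</scope>
+		</dependency>
+		<dependency>
+			<groupId>redis.clients</groupId>
+			<artifactId>jedis</artifactId>
+			<version>2.9.0</version>
+		</dependency>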
diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java
new file mode 100644
index 0000000..1c645f6
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerQuestionMapper.java
@@ -0,0 +1,9 @@
+package com.tamguo.dao;
+
+import com.tamguo.config.dao.SuperMapper;
+import com.tamguo.model.CrawlerQuestionEntity;
+
+/** Data-access mapper for {@link CrawlerQuestionEntity}; it declares no methods of its own and only exposes what {@link SuperMapper} provides. */
+public interface CrawlerQuestionMapper extends SuperMapper<CrawlerQuestionEntity> {
+
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java
new file mode 100644
index 0000000..c1eba61
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerQuestionEntity.java
@@ -0,0 +1,56 @@
+package com.tamguo.model;
+
+import java.io.Serializable;
+import com.baomidou.mybatisplus.annotations.TableName;
+import com.tamguo.config.dao.SuperEntity;
+
+
+
+/**
+ * Entity mapped to the {@code crawler_question} table: one question URL
+ * recorded by the crawler together with the chapter it belongs to.
+ *
+ * NOTE(review): {@code SuperEntity} is referenced without a type argument
+ * here; confirm whether it is meant to be parameterized.
+ */
+@TableName(value="crawler_question")
+public class CrawlerQuestionEntity extends SuperEntity implements Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	/** URL of the question page. */
+	private String questionUrl;
+
+	/** Identifier of the chapter the question was found under. */
+	private String chapterId;
+
+	/** Processing status code kept as text. */
+	private String status;
+
+	/** @return the URL of the question page */
+	public String getQuestionUrl() {
+		return this.questionUrl;
+	}
+
+	/** @param questionUrl the URL of the question page */
+	public void setQuestionUrl(String questionUrl) {
+		this.questionUrl = questionUrl;
+	}
+
+	/** @return the identifier of the owning chapter */
+	public String getChapterId() {
+		return this.chapterId;
+	}
+
+	/** @param chapterId the identifier of the owning chapter */
+	public void setChapterId(String chapterId) {
+		this.chapterId = chapterId;
+	}
+
+	/** @return the processing status code */
+	public String getStatus() {
+		return this.status;
+	}
+
+	/** @param status the processing status code */
+	public void setStatus(String status) {
+		this.status = status;
+	}
+
+	/** @return the serialization version constant of this class */
+	public static long getSerialversionuid() {
+		return CrawlerQuestionEntity.serialVersionUID;
+	}
+}
\ No newline at end of file
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java
index df3c05f..ac0be5f 100644
--- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java
@@ -39,6 +39,30 @@ public class SubjectVo {
@PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List chapterUrls;
+	// Absolute hrefs of the knowledge-point links in the chapter tree (question URLs still to be collected)
+	@PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+	private List<String> questionUrlsTemp;
+
+	// Absolute hrefs of the "view analysis" links, i.e. the question pages still to be collected
+	@PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+	private List<String> questionUrls;
+
+	// Data of a single question: HTML of the question stem
+	@PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML)
+	private String content;
+
+	// HTML of every element matching .exam-answer-content
+	@PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML)
+	private List<String> answer;
+
+	// HTML of the analysis section
+	@PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML)
+	private String analysis;
+
+	// Text of the first span inside .que-title
+	@PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT)
+	private String questionType;
+
+	// Text of the second span inside .que-title
+	@PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT)
+	private String score;
+
public List getName() {
return name;
}
@@ -103,4 +127,60 @@ public class SubjectVo {
this.chapterCurrName = chapterCurrName;
}
+	/** @return the hrefs selected from the chapter tree links; typed as strings because they are attribute values */
+	public List<String> getQuestionUrlsTemp() {
+		return questionUrlsTemp;
+	}
+
+	/** @param questionUrlsTemp the hrefs selected from the chapter tree links */
+	public void setQuestionUrlsTemp(List<String> questionUrlsTemp) {
+		this.questionUrlsTemp = questionUrlsTemp;
+	}
+
+	/** @return the question page URLs; typed as strings so callers can iterate them as {@code String} */
+	public List<String> getQuestionUrls() {
+		return questionUrls;
+	}
+
+	/** @param questionUrls the question page URLs */
+	public void setQuestionUrls(List<String> questionUrls) {
+		this.questionUrls = questionUrls;
+	}
+
+	/** @return the HTML of the analysis section */
+	public String getAnalysis() {
+		return this.analysis;
+	}
+
+	/** @param analysis the HTML of the analysis section */
+	public void setAnalysis(String analysis) {
+		this.analysis = analysis;
+	}
+
+	/** @return the HTML of the question stem */
+	public String getContent() {
+		return this.content;
+	}
+
+	/** @param content the HTML of the question stem */
+	public void setContent(String content) {
+		this.content = content;
+	}
+
+	/** @return the text of the first span in the question title */
+	public String getQuestionType() {
+		return this.questionType;
+	}
+
+	/** @param questionType the text of the first span in the question title */
+	public void setQuestionType(String questionType) {
+		this.questionType = questionType;
+	}
+
+	/** @return the text of the second span in the question title */
+	public String getScore() {
+		return this.score;
+	}
+
+	/** @param score the text of the second span in the question title */
+	public void setScore(String score) {
+		this.score = score;
+	}
+
+	/** @return the HTML fragments selected as answers; typed as strings because each entry is selected HTML */
+	public List<String> getAnswer() {
+		return answer;
+	}
+
+	/** @param answer the HTML fragments selected as answers */
+	public void setAnswer(List<String> answer) {
+		this.answer = answer;
+	}
+
}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java
index 06d7c24..6ef7ff3 100644
--- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java
+++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java
@@ -1,8 +1,10 @@
package com.tamguo.service.impl;
import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -14,9 +16,11 @@ import org.springframework.stereotype.Service;
import com.tamguo.dao.ChapterMapper;
import com.tamguo.dao.CourseMapper;
+import com.tamguo.dao.CrawlerQuestionMapper;
import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity;
import com.tamguo.model.CourseEntity;
+import com.tamguo.model.CrawlerQuestionEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.vo.SubjectVo;
import com.tamguo.service.ISubjectService;
@@ -33,20 +37,24 @@ public class SubjectService implements ISubjectService{
CourseMapper courseMapper;
@Autowired
ChapterMapper chapterMapper;
+ @Autowired
+ CrawlerQuestionMapper crawlerQuestionMapper;
private Logger logger = LoggerFactory.getLogger(getClass());
- private List urls = new ArrayList<>();
+	// Page URLs already seen in this crawl; checked before another chapter URL is queued
+	private Set<String> urls = new HashSet<>();
+
+	// Question URLs already handled in this crawl
+	private Set<String> questionUrls = new HashSet<>();
+	// Question-list page URL -> chapter it was queued for; values stay Object because the reader casts them
+	private Map<String, Object> chapterQuestionListMap = new HashMap<>();
+
private RunData runData;
@Override
public void crawlerSubject() {
XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("https://tiku.baidu.com/")
- .setWhiteUrlRegexs("https://tiku.baidu.com/tikupc/homepage/\\w+","https://tiku.baidu.com/tikupc/homepage/\\w+"
- , "https://tiku.baidu.com/"
- , "https://tiku.baidu.com/tikupc/chapterlist/.*")
+ .setAllowSpread(false)
.setPageParser(new PageParser() {
@Override
@@ -75,7 +83,6 @@ public class SubjectService implements ISubjectService{
runData.addUrl(url);
}
}
-
if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) {
logger.info("开始解析科目分类:{}" , pageUrl);
for(int i=0 ; i 0) {
+ String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href");
+ questionUrl = questionUrl.replace("1-5", "1-1000");
+ chapterQuestionListMap.put(questionUrl, chapter2);
+
+ runData.addUrl(questionUrl);
+ }
}
}
}
@@ -185,9 +200,8 @@ public class SubjectService implements ISubjectService{
// 剔除已经爬取的数据
urls.add(pageUrl);
- // 加入科目爬取数据
for(String url : subjectVo.getChapterUrls()) {
- if(url.equals("https://tiku.baidu.com"+pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("href"))) {
+ if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
continue;
}
if(!urls.contains(url)) {
@@ -195,13 +209,40 @@ public class SubjectService implements ISubjectService{
}
}
}
+
+				if (pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
+					// Chapter stored for this list page when its URL was queued. The lookup depends
+					// only on pageUrl, so it is done once instead of once per question.
+					ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
+
+					// NOTE(review): the list is presumably left null when the selector matches
+					// nothing - confirm; the guard keeps such a page from throwing.
+					if (subjectVo.getQuestionUrls() != null) {
+						// Record every question URL not handled before as a pending task.
+						// The URL is only persisted here; it is not queued via runData.addUrl.
+						for (String questionUrl : subjectVo.getQuestionUrls()) {
+							// add() returns false when the URL was already handled in this run
+							if (!questionUrls.add(questionUrl)) {
+								continue;
+							}
+
+							CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
+							condition.setQuestionUrl(questionUrl);
+							if (crawlerQuestionMapper.selectOne(condition) != null) {
+								logger.info("{}已经爬取", questionUrl);
+								continue;
+							}
+
+							if (chapterEntity == null) {
+								// Without a chapter the row cannot be linked; skip instead of
+								// failing with a NullPointerException on getUid().
+								logger.warn("No chapter registered for page {}, skipping question {}", pageUrl, questionUrl);
+								continue;
+							}
+
+							CrawlerQuestionEntity crawlerQuestion = new CrawlerQuestionEntity();
+							crawlerQuestion.setQuestionUrl(questionUrl);
+							crawlerQuestion.setChapterId(chapterEntity.getUid());
+							crawlerQuestion.setStatus("0");
+							crawlerQuestionMapper.insert(crawlerQuestion);
+						}
+					}
+				}
+
+ /*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) {
+ ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl);
+ System.out.println(chapterEntity);
+ }*/
}
}).build();
runData = crawler.getRunData();
-
-
-
// 获取科目
crawler.start(true);
}
diff --git a/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml b/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml
new file mode 100644
index 0000000..4c97165
--- /dev/null
+++ b/tamguo-crawler/src/main/resources/mappers/CrawlerQuestionMapper.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tamguo-crawler/src/main/resources/redis.xml b/tamguo-crawler/src/main/resources/redis.xml
new file mode 100644
index 0000000..4055e0d
--- /dev/null
+++ b/tamguo-crawler/src/main/resources/redis.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file