diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java
new file mode 100644
index 0000000..f569ce6
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java
@@ -0,0 +1,8 @@
+package com.tamguo.dao;
+
+import com.tamguo.config.dao.SuperMapper;
+import com.tamguo.model.CrawlerPaperEntity;
+
+public interface CrawlerPaperMapper extends SuperMapper<CrawlerPaperEntity> {
+    // Mapper for CrawlerPaperEntity (crawler_paper); typed so inherited queries return the entity without casts.
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java
new file mode 100644
index 0000000..1f1b0a7
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java
@@ -0,0 +1,8 @@
+package com.tamguo.dao;
+
+import com.tamguo.config.dao.SuperMapper;
+import com.tamguo.model.PaperEntity;
+
+public interface PaperMapper extends SuperMapper<PaperEntity> {
+    // Mapper for PaperEntity (tiku_paper); typed so inherited queries return the entity without casts.
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java
new file mode 100644
index 0000000..8361180
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java
@@ -0,0 +1,34 @@
+package com.tamguo.model;
+
+import com.baomidou.mybatisplus.annotations.TableName;
+import com.tamguo.config.dao.SuperEntity;
+
+@TableName(value="crawler_paper")
+public class CrawlerPaperEntity extends SuperEntity{
+    // Row of the crawler_paper table: pairs a question page URL with a paper id.
+    private static final long serialVersionUID = 1L;
+    // URL of the question page.
+    private String questionUrl;
+    // Id of the paper this question URL is associated with.
+    private String paperId;
+
+    public static long getSerialversionuid() {
+        return serialVersionUID;
+    }
+
+    public String getQuestionUrl() {
+        return this.questionUrl;
+    }
+
+    public void setQuestionUrl(String questionUrl) {
+        this.questionUrl = questionUrl;
+    }
+
+    public String getPaperId() {
+        return this.paperId;
+    }
+
+    public void setPaperId(String paperId) {
+        this.paperId = paperId;
+    }
+}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java
new file mode 100644
index 0000000..c84b0c3
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java
@@ -0,0 +1,246 @@
+package com.tamguo.model;
+
+import java.io.Serializable;
+import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSONArray;
+import com.baomidou.mybatisplus.annotations.TableField;
+import com.baomidou.mybatisplus.annotations.TableName;
+import com.tamguo.config.dao.SuperEntity;
+
+
+/**
+ * Entity mapped to the {@code tiku_paper} table through {@code @TableName}.
+ * Fields marked {@code @TableField(exist=false)} are declared as not being table columns.
+ */
+@TableName(value="tiku_paper")
+public class PaperEntity extends SuperEntity implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    private String subjectId;
+
+    private String courseId;
+
+    private String schoolId;
+
+    private String areaId;
+
+    private String createrId;
+
+    private String name;
+    // JSON array text; getQueInfo() exposes it parsed.
+    private String questionInfo;
+
+    private String type;
+
+    private String year;
+
+    private Integer downHits;
+
+    private Integer openHits;
+
+    private String seoTitle;
+
+    private String seoKeywords;
+
+    private String seoDescription;
+
+    @TableField(value="free")
+    private String free;
+
+    private String point;
+
+    private String money;
+
+    @TableField(exist=false)
+    private String subjectName;
+
+    @TableField(exist=false)
+    private String courseName;
+
+    @TableField(exist=false)
+    private String areaName;
+
+    @TableField(exist=false)
+    private String schoolName;
+
+    public String getSubjectId() {
+        return this.subjectId;
+    }
+
+    public void setSubjectId(String subjectId) {
+        this.subjectId = subjectId;
+    }
+
+    public String getCourseId() {
+        return this.courseId;
+    }
+
+    public void setCourseId(String courseId) {
+        this.courseId = courseId;
+    }
+
+    public String getSchoolId() {
+        return this.schoolId;
+    }
+
+    public void setSchoolId(String schoolId) {
+        this.schoolId = schoolId;
+    }
+
+    public String getAreaId() {
+        return this.areaId;
+    }
+
+    public void setAreaId(String areaId) {
+        this.areaId = areaId;
+    }
+
+    public String getCreaterId() {
+        return this.createrId;
+    }
+
+    public void setCreaterId(String createrId) {
+        this.createrId = createrId;
+    }
+
+    public String getName() {
+        return this.name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getQuestionInfo() {
+        return this.questionInfo;
+    }
+
+    public void setQuestionInfo(String questionInfo) {
+        this.questionInfo = questionInfo;
+    }
+
+    public JSONArray getQueInfo() {
+        // Null (not an empty array) is returned when no question info is stored.
+        return StringUtils.isEmpty(getQuestionInfo())
+                ? null
+                : JSONArray.parseArray(getQuestionInfo());
+    }
+
+    public String getType() {
+        return this.type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public String getYear() {
+        return this.year;
+    }
+
+    public void setYear(String year) {
+        this.year = year;
+    }
+
+    public Integer getDownHits() {
+        return this.downHits;
+    }
+
+    public void setDownHits(Integer downHits) {
+        this.downHits = downHits;
+    }
+
+    public Integer getOpenHits() {
+        return this.openHits;
+    }
+
+    public void setOpenHits(Integer openHits) {
+        this.openHits = openHits;
+    }
+
+    public String getSeoTitle() {
+        return this.seoTitle;
+    }
+
+    public void setSeoTitle(String seoTitle) {
+        this.seoTitle = seoTitle;
+    }
+
+    public String getSeoKeywords() {
+        return this.seoKeywords;
+    }
+
+    public void setSeoKeywords(String seoKeywords) {
+        this.seoKeywords = seoKeywords;
+    }
+
+    public String getSeoDescription() {
+        return this.seoDescription;
+    }
+
+    public void setSeoDescription(String seoDescription) {
+        this.seoDescription = seoDescription;
+    }
+
+    public String getFree() {
+        return this.free;
+    }
+
+    public void setFree(String free) {
+        this.free = free;
+    }
+
+    public String getPoint() {
+        return this.point;
+    }
+
+    public void setPoint(String point) {
+        this.point = point;
+    }
+
+    public String getMoney() {
+        return this.money;
+    }
+
+    public void setMoney(String money) {
+        this.money = money;
+    }
+
+    public String getSubjectName() {
+        return this.subjectName;
+    }
+
+    public void setSubjectName(String subjectName) {
+        this.subjectName = subjectName;
+    }
+
+    public String getCourseName() {
+        return this.courseName;
+    }
+
+    public void setCourseName(String courseName) {
+        this.courseName = courseName;
+    }
+
+    public String getAreaName() {
+        return this.areaName;
+    }
+
+    public void setAreaName(String areaName) {
+        this.areaName = areaName;
+    }
+
+    public String getSchoolName() {
+        return this.schoolName;
+    }
+
+    public void setSchoolName(String schoolName) {
+        this.schoolName = schoolName;
+    }
+
+    public static long getSerialversionuid() {
+        return serialVersionUID;
+    }
+
+}
\ No newline at end of file
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java b/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java
index 3939e6b..79804ef 100644
--- a/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java
@@ -36,6 +36,8 @@ public enum QuestionType {
return WENDATI;
}else if("选择题".equals(value)) {
return DANXUANTI;
+ }else if("简答题(综合题)".equals(value)) {
+ return WENDATI;
}
return WENDATI;
}
diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java
new file mode 100644
index 0000000..7b0d085
--- /dev/null
+++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java
@@ -0,0 +1,68 @@
+package com.tamguo.model.vo;
+
+import java.util.List;
+
+import com.xuxueli.crawler.annotation.PageFieldSelect;
+import com.xuxueli.crawler.annotation.PageSelect;
+import com.xuxueli.crawler.conf.XxlCrawlerConf;
+
+@PageSelect(cssQuery = "body")
+public class PaperVo {
+    // Crawl model: each annotated field receives what its cssQuery matches inside the selected body element.
+    // Paper URLs: absolute href of every paper link matched on a paper-list page.
+    @PageFieldSelect(cssQuery = ".paperlist .paper-title a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List<String> paperUrls;
+    // Paper title taken from the title bar.
+    @PageFieldSelect(cssQuery = ".title-bar .title-inner .paper-title .title")
+    private String paperName;
+    // One entry per matched ".que-type" element.
+    @PageFieldSelect(cssQuery = ".quelist-wrap .ques-container .ques-info-wrap .que-type")
+    private List<String> questionInfoTypes;
+    // One entry per matched ".que-info" element.
+    @PageFieldSelect(cssQuery = ".quelist-wrap .ques-container .ques-info-wrap .que-info")
+    private List<String> questionInfoTitles;
+    // Absolute href of every ".view-link" found under ".view-analyse".
+    @PageFieldSelect(cssQuery = ".view-analyse .view-link", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
+    private List<String> questionUrls;
+
+    public String getPaperName() {
+        return paperName;
+    }
+
+    public void setPaperName(String paperName) {
+        this.paperName = paperName;
+    }
+
+    public List<String> getQuestionInfoTypes() {
+        return questionInfoTypes;
+    }
+
+    public void setQuestionInfoTypes(List<String> questionInfoTypes) {
+        this.questionInfoTypes = questionInfoTypes;
+    }
+
+    public List<String> getPaperUrls() {
+        return paperUrls;
+    }
+
+    public void setPaperUrls(List<String> paperUrls) {
+        this.paperUrls = paperUrls;
+    }
+
+    public List<String> getQuestionInfoTitles() {
+        return questionInfoTitles;
+    }
+
+    public void setQuestionInfoTitles(List<String> questionInfoTitles) {
+        this.questionInfoTitles = questionInfoTitles;
+    }
+
+    public List<String> getQuestionUrls() {
+        return questionUrls;
+    }
+
+    public void setQuestionUrls(List<String> questionUrls) {
+        this.questionUrls = questionUrls;
+    }
+
+}
diff --git a/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java
new file mode 100644
index 0000000..8213882
--- /dev/null
+++ b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java
@@ -0,0 +1,114 @@
+package com.tamguo;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit4.SpringRunner;
+
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.tamguo.dao.CrawlerPaperMapper;
+import com.tamguo.dao.PaperMapper;
+import com.tamguo.model.CrawlerPaperEntity;
+import com.tamguo.model.PaperEntity;
+import com.tamguo.model.enums.QuestionType;
+import com.tamguo.model.vo.PaperVo;
+import com.xuxueli.crawler.XxlCrawler;
+import com.xuxueli.crawler.parser.PageParser;
+import com.xuxueli.crawler.rundata.RunData;
+
+@RunWith(SpringRunner.class)
+@SpringBootTest
+public class PaperCrawler {
+
+ // 高考
+ private final String SUBJECT_ID = "gaokao";
+ // 科目
+ private final String COURSE_ID = "likeshuxue";
+ // 110000 北京
+ private final String AREA_ID = "110000";
+ // 年份
+ private final String YEAR = "2018";
+ // 真题试卷 类型(1:真题试卷,2:模拟试卷,3:押题预测,4:名校精品)
+ private final String PAPER_TYPE = "1";
+ // 开始采集的URL
+ private final String START_URL = "https://tiku.baidu.com/tikupc/paperlist/1bfd700abb68a98271fefa04-16-0-2018-37-1-download";
+
+ private RunData runData;
+
+ @Autowired
+ private PaperMapper paperMapper;
+ @Autowired
+ private CrawlerPaperMapper crawlerPaperMapper;
+
+ @Test
+ public void crawler() {
+ XxlCrawler crawler = new XxlCrawler.Builder()
+ .setUrls(START_URL)
+ .setAllowSpread(false)
+ .setFailRetryCount(5)
+ .setThreadCount(1)
+ .setPageParser(new PageParser() {
+
+ @Override
+ public void parse(Document html, Element pageVoElement, PaperVo paperVo) {
+ // 解析封装 PageVo 对象
+ String pageUrl = html.baseUri();
+ if(pageUrl.contains("https://tiku.baidu.com/tikupc/paperdetail")) {
+ System.out.println(paperVo.getPaperName());
+
+ PaperEntity paper = new PaperEntity();
+ paper.setSubjectId(SUBJECT_ID);
+ paper.setCourseId(COURSE_ID);
+ paper.setSchoolId("");
+ paper.setAreaId(AREA_ID);
+ paper.setCreaterId("system");
+ paper.setName(paperVo.getPaperName());
+ paper.setYear(YEAR);
+ paper.setFree("0");
+ paper.setSeoTitle(paperVo.getPaperName());
+ paper.setSeoKeywords("");
+ paper.setSeoDescription("");
+ paper.setType(PAPER_TYPE);
+ JSONArray entitys = new JSONArray();
+ // 处理类型问题
+ for(int i=0 ; i() {
+
+ /**
+  * Turns one crawled question page into a QuestionEntity and inserts it.
+  * The owning paper is found through the crawler_paper row whose questionUrl
+  * equals the page URL; course and subject are then loaded from that paper.
+  * Images referenced by the answer, analysis and content are downloaded and
+  * their URLs rewritten before the question is stored.
+  *
+  * @param html          parsed page; its baseUri() is used as the question URL
+  * @param pageVoElement element the page model was extracted from (not used here)
+  * @param questionVo    extracted question fields
+  */
+ @Override
+ public void parse(Document html, Element pageVoElement, QuestionVo questionVo) {
+ // No content extracted: hand the URL back to runData (presumably so the page is fetched again - confirm) and stop.
+ if(StringUtils.isEmpty(questionVo.getContent())) {
+ runData.addUrl(html.baseUri());
+ return;
+ }
+ CrawlerPaperEntity condition = new CrawlerPaperEntity();
+ condition.setQuestionUrl(html.baseUri());
+ System.out.println(html.baseUri());
+ // NOTE(review): if no crawler_paper row matches this URL, selectOne presumably returns null and the next line throws - confirm.
+ CrawlerPaperEntity crawlerPaper = crawlerPaperMapper.selectOne(condition);
+ PaperEntity paper = paperMapper.selectById(crawlerPaper.getPaperId());
+ CourseEntity course = courseMapper.selectById(paper.getCourseId());
+ SubjectEntity subject = subjectMapper.selectById(paper.getSubjectId());
+
+ QuestionType questionType = QuestionType.getQuestionType(questionVo.getQuestionType());
+
+
+ QuestionEntity question = new QuestionEntity();
+ // For DANXUANTI questions the options are appended to the content when present.
+ if(questionType == QuestionType.DANXUANTI) {
+ if(!StringUtils.isEmpty(questionVo.getQueoptions())) {
+ question.setContent(questionVo.getContent() + questionVo.getQueoptions());
+ }else {
+ question.setContent(questionVo.getContent());
+ }
+ }else {
+ question.setContent(questionVo.getContent());
+ }
+ question.setAnalysis(questionVo.getAnalysis());
+ // Fall back to a placeholder analysis when none was extracted.
+ if(StringUtils.isEmpty(question.getAnalysis())) {
+ question.setAnalysis(" 略
");
+ }
+ question.setAnswer(questionVo.getAnswer());
+ question.setAuditStatus("1");
+ question.setChapterId("");
+ question.setCourseId(course.getId());
+ question.setPaperId(paper.getId());
+ question.setQuestionType(questionType.getValue().toString());
+ if(questionVo.getReviewPoint() != null && questionVo.getReviewPoint().size() > 0) {
+ question.setReviewPoint(StringUtils.join(questionVo.getReviewPoint().toArray(), ","));
+ }
+ // Handle the score: the extracted "score" text is stored as score when it contains "分" and as year when it contains "年".
+ if(questionVo.getScore() != null) {
+ if(questionVo.getScore().contains("分")) {
+ question.setScore(questionVo.getScore());
+ }
+ if(questionVo.getScore().contains("年")) {
+ question.setYear(questionVo.getScore());
+ }
+ }
+ // An extracted year containing "年" overrides whatever was set from the score text above.
+ if(questionVo.getYear() != null) {
+ if(questionVo.getYear().contains("年")) {
+ question.setYear(questionVo.getYear());
+ }
+ }
+ question.setSubjectId(subject.getId());
+
+ // Answer images: copied into a HashSet so each distinct URL is handled once.
+ if (questionVo.getAnswerImages()!=null && questionVo.getAnswerImages().size() > 0) {
+ Set imagesSet = new HashSet<>(questionVo.getAnswerImages());
+ for (String img: imagesSet) {
+
+ // Download the image file
+ String fileName = getFileName(img);
+ String filePath = getFilePath();
+ String fileDatePath = getFileDatePath();
+
+ File dir = new File(filePath +fileDatePath+ "/");
+ if (!dir.exists())
+ dir.mkdirs();
+ boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName);
+ System.out.println("down images " + (ret?"success":"fail") + ":" + img);
+
+ // Replace the URL (done even when the download reported failure)
+ // NOTE(review): throws if getAnswer() is null while answer images exist - confirm whether that can happen.
+ question.setAnswer(question.getAnswer().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName));
+ }
+ // Re-sets the same value; has no effect beyond the setter call.
+ question.setAnswer(question.getAnswer());
+ }
+
+ // Analysis images: same download-and-rewrite steps applied to the analysis text.
+ if (questionVo.getAnalysisImages()!=null && questionVo.getAnalysisImages().size() > 0) {
+ Set imagesSet = new HashSet<>(questionVo.getAnalysisImages());
+ for (String img: imagesSet) {
+
+ // Download the image file
+ String fileName = getFileName(img);
+ String filePath = getFilePath();
+ String fileDatePath = getFileDatePath();
+
+ File dir = new File(filePath +fileDatePath+ "/");
+ if (!dir.exists())
+ dir.mkdirs();
+ boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName);
+ System.out.println("down images " + (ret?"success":"fail") + ":" + img);
+
+ // Replace the URL
+ question.setAnalysis(question.getAnalysis().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName));
+ }
+ question.setAnalysis(question.getAnalysis());
+ }
+
+ // Content images: same download-and-rewrite steps applied to the question content.
+ if (questionVo.getContentImages()!=null && questionVo.getContentImages().size() > 0) {
+ Set imagesSet = new HashSet<>(questionVo.getContentImages());
+ for (String img: imagesSet) {
+
+ // Download the image file
+ String fileName = getFileName(img);
+ String filePath = getFilePath();
+ String fileDatePath = getFileDatePath();
+ File dir = new File(filePath +fileDatePath+ "/");
+ if (!dir.exists()) {
+ dir.mkdirs();
+ }
+ boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName);
+ System.out.println("down images " + (ret?"success":"fail") + ":" + img);
+
+ // Replace the URL
+ question.setContent(question.getContent().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName));
+ }
+ question.setContent(question.getContent());
+ }
+
+
+ // Handle images (done above); record where the question came from and persist it.
+ question.setSourceType("baidu");
+ question.setSourceUrl(html.baseUri());
+ questionMapper.insert(question);
+ }
+
+ /**
+  * Builds the local file name for a downloaded image: a fresh number from
+  * getFileNo() followed by the extension taken from the image URL.
+  *
+  * @param img image URL as found in the crawled page
+  * @return generated file name; no extension is appended when the part of the
+  *         URL after the last '/' contains no '.'
+  */
+ public String getFileName(String img) {
+     int dot = img.lastIndexOf('.');
+     // A '.' before the last '/' belongs to the host or a directory, not to the
+     // file, and a missing '.' (-1) previously made substring() throw.
+     String extension = dot > img.lastIndexOf('/') ? img.substring(dot) : "";
+     return getFileNo() + extension;
+ }
+
+ // Base directory for downloaded images: fixed root, then COURSE_ID, then a trailing slash.
+ private String getFilePath() {
+     StringBuilder path = new StringBuilder("/home/webdata/files/paper/");
+     path.append(COURSE_ID).append("/");
+     return path.toString();
+ }
+
+ // Current time formatted as day-of-month, hour and minute ("ddHHmm"); used as a directory name.
+ private String getFileDatePath() {
+     return new SimpleDateFormat("ddHHmm").format(new Date());
+ }
+
+ // Produces FILES_PREFIX followed by a counter value formatted with FILES_NO_FORMAT.
+ // The counter key is FILES_PREFIX plus the current "ddHHmm" stamp, so it changes every minute.
+ private String getFileNo() {
+     String minuteStamp = new SimpleDateFormat("ddHHmm").format(new Date());
+     DecimalFormat numberFormat = new DecimalFormat(FILES_NO_FORMAT);
+     // cacheService.incr presumably increments and returns the counter stored under the key - confirm.
+     Long sequence = cacheService.incr(FILES_PREFIX + minuteStamp);
+     return FILES_PREFIX + numberFormat.format(sequence);
+ }
+ }).build();
+
+ runData = crawler.getRunData();
+ int page = 1;
+ int pageSize = 1000;
+ while(true) {
+ Page questionPage = new Page(page , pageSize);
+ List questionList = crawlerPaperMapper.selectPage(questionPage, Condition.create().orderDesc(Arrays.asList("id")));
+ for(int i=0 ;i vars = new HashMap<>();
vars.put("domainName", env.getProperty("domain.name"));
vars.put("adminDomain", env.getProperty("admin.domain.name"));
- vars.put("PAPER_TYPE_ZHENTI", "1");
- vars.put("PAPER_TYPE_MONI", "2");
- vars.put("PAPER_TYPE_YATI", "3");
- vars.put("PAPER_TYPE_MINGXIAO", "4");
+ vars.put("PAPER_TYPE_ZHENTI", SystemConstant.ZHENGTI_PAPER_ID);
+ vars.put("PAPER_TYPE_MONI", SystemConstant.MONI_PAPER_ID);
+ vars.put("PAPER_TYPE_YATI", SystemConstant.YATI_PAPER_ID);
+ vars.put("PAPER_TYPE_MINGXIAO", SystemConstant.MINGXIAO_PAPER_ID);
viewResolver.setStaticVariables(vars);
}
}
diff --git a/tamguo-tms/src/main/resources/templates/index.html b/tamguo-tms/src/main/resources/templates/index.html
index 9032233..1750402 100644
--- a/tamguo-tms/src/main/resources/templates/index.html
+++ b/tamguo-tms/src/main/resources/templates/index.html
@@ -107,7 +107,7 @@
试卷资源
一考知底,高分必刷,全面提分
当前位置:
- 更多地区 >
+ 更多地区 >
@@ -136,11 +136,11 @@
@@ -151,7 +151,7 @@
-
-
+
2017年高考真题 语文 (北京卷)
@@ -183,7 +183,7 @@
-
-
+
语文 海淀区2016年高三期末试卷
diff --git a/tamguo-tms/src/main/resources/templates/paper.html b/tamguo-tms/src/main/resources/templates/paper.html
index 384491d..083ba1c 100644
--- a/tamguo-tms/src/main/resources/templates/paper.html
+++ b/tamguo-tms/src/main/resources/templates/paper.html
@@ -89,13 +89,13 @@
- 单选题
- 本大题共15小题,每小题1分,共15分。在每小题给出的4个选项中,有且只有一项是符合题目要求。
+ 单选题
+