diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java new file mode 100644 index 0000000..f569ce6 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/CrawlerPaperMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.CrawlerPaperEntity; +/** Mapper interface for {@link CrawlerPaperEntity}; it declares no methods of its own and relies on what SuperMapper provides. */ +public interface CrawlerPaperMapper extends SuperMapper{ + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java new file mode 100644 index 0000000..1f1b0a7 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/PaperMapper.java @@ -0,0 +1,8 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.PaperEntity; +/** Mapper interface for {@link PaperEntity}; it declares no methods of its own and relies on what SuperMapper provides. */ +public interface PaperMapper extends SuperMapper { + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java new file mode 100644 index 0000000..8361180 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java @@ -0,0 +1,34 @@ +package com.tamguo.model; + +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; +/** Entity bound to the crawler_paper table; each instance holds a question URL together with a paper id. */ +@TableName(value="crawler_paper") +public class CrawlerPaperEntity extends SuperEntity{ + + private static final long serialVersionUID = 1L; +/* Question page URL (the crawler test looks rows up by this value). */ + private String questionUrl; +/* Id of the associated paper; presumably a tiku_paper primary key - confirm against the schema. */ + private String paperId; + + public String getQuestionUrl() { + return questionUrl; + } + + public void setQuestionUrl(String questionUrl) { + this.questionUrl = questionUrl; + } +/** Returns the serialVersionUID constant through a static getter. */ + public static long getSerialversionuid() { + return serialVersionUID; + } + + public String getPaperId() { + return paperId; + } + + public void setPaperId(String paperId) { + this.paperId = paperId; + } +} diff --git 
a/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java new file mode 100644 index 0000000..c84b0c3 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/PaperEntity.java @@ -0,0 +1,246 @@ +package com.tamguo.model; + +import java.io.Serializable; +import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSONArray; +import com.baomidou.mybatisplus.annotations.TableField; +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + + +/** + * The persistent class for the tiku_paper database table. + * + */ +@TableName(value="tiku_paper") +public class PaperEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + private String subjectId; + + private String courseId; + + private String schoolId; + + private String areaId; + + private String createrId; + + private String name; + + private String questionInfo; + + private String type; + + private String year; + + private Integer downHits; + + private Integer openHits; + + private String seoTitle; + + private String seoKeywords; + + private String seoDescription; + + @TableField(value="free") + private String free; + + private String point; + + private String money; +/* subjectName, courseName, areaName and schoolName are annotated exist=false; presumably display-only values with no tiku_paper column - confirm against the MyBatis-Plus TableField contract. */ + @TableField(exist=false) + private String subjectName; + + @TableField(exist=false) + private String courseName; + + @TableField(exist=false) + private String areaName; + + @TableField(exist=false) + private String schoolName; +/** Parses questionInfo into a JSONArray; returns null (not an empty array) when questionInfo is null or empty, so callers must null-check. */ + public JSONArray getQueInfo(){ + if(StringUtils.isEmpty(getQuestionInfo())){ + return null; + } + return JSONArray.parseArray(getQuestionInfo()); + } + + public String getCourseId() { + return courseId; + } + + public void setCourseId(String courseId) { + this.courseId = courseId; + } + + public String getAreaId() { + return areaId; + } + + public void setAreaId(String areaId) { + this.areaId = areaId; + } + + public String getName() 
{ + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getYear() { + return year; + } + + public void setYear(String year) { + this.year = year; + } + + public static long getSerialversionuid() { + return serialVersionUID; + } + + public String getSchoolId() { + return schoolId; + } + + public void setSchoolId(String schoolId) { + this.schoolId = schoolId; + } + + public Integer getDownHits() { + return downHits; + } + + public void setDownHits(Integer downHits) { + this.downHits = downHits; + } + + public Integer getOpenHits() { + return openHits; + } + + public void setOpenHits(Integer openHits) { + this.openHits = openHits; + } + + public String getQuestionInfo() { + return questionInfo; + } + + public void setQuestionInfo(String questionInfo) { + this.questionInfo = questionInfo; + } + + public String getCreaterId() { + return createrId; + } + + public void setCreaterId(String createrId) { + this.createrId = createrId; + } + + public String getSeoTitle() { + return seoTitle; + } + + public void setSeoTitle(String seoTitle) { + this.seoTitle = seoTitle; + } + + public String getSeoKeywords() { + return seoKeywords; + } + + public void setSeoKeywords(String seoKeywords) { + this.seoKeywords = seoKeywords; + } + + public String getSeoDescription() { + return seoDescription; + } + + public void setSeoDescription(String seoDescription) { + this.seoDescription = seoDescription; + } + + public String getCourseName() { + return courseName; + } + + public void setCourseName(String courseName) { + this.courseName = courseName; + } + + public String getAreaName() { + return areaName; + } + + public void setAreaName(String areaName) { + this.areaName = areaName; + } + + public String getSchoolName() { + return schoolName; + } + + public void setSchoolName(String schoolName) { + this.schoolName = schoolName; + } + + 
public String getSubjectId() { + return subjectId; + } + + public void setSubjectId(String subjectId) { + this.subjectId = subjectId; + } + + public String getSubjectName() { + return subjectName; + } + + public void setSubjectName(String subjectName) { + this.subjectName = subjectName; + } + + public String getPoint() { + return point; + } + + public void setPoint(String point) { + this.point = point; + } + + public String getMoney() { + return money; + } + + public void setMoney(String money) { + this.money = money; + } + + public String getFree() { + return free; + } + + public void setFree(String free) { + this.free = free; + } + +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java b/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java index 3939e6b..79804ef 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/enums/QuestionType.java @@ -36,6 +36,8 @@ public enum QuestionType { return WENDATI; }else if("选择题".equals(value)) { return DANXUANTI; + }else if("简答题(综合题)".equals(value)) { + return WENDATI; } return WENDATI; } diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java new file mode 100644 index 0000000..7b0d085 --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java @@ -0,0 +1,68 @@ +package com.tamguo.model.vo; + +import java.util.List; + +import com.xuxueli.crawler.annotation.PageFieldSelect; +import com.xuxueli.crawler.annotation.PageSelect; +import com.xuxueli.crawler.conf.XxlCrawlerConf; +/** Crawler page value object selected from the page body; its fields are bound by CSS query to paper links, the paper title, question type and info headings, and question links. */ +@PageSelect(cssQuery = "body") +public class PaperVo { + + // Paper URLs + @PageFieldSelect(cssQuery = ".paperlist .paper-title a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List paperUrls; + + @PageFieldSelect(cssQuery = ".title-bar .title-inner .paper-title .title") + 
private String paperName; + + @PageFieldSelect(cssQuery = ".quelist-wrap .ques-container .ques-info-wrap .que-type") + private List questionInfoTypes; + + @PageFieldSelect(cssQuery = ".quelist-wrap .ques-container .ques-info-wrap .que-info") + private List questionInfoTitles; +/* Question links: the abs:href attribute of every element matching the view-link selector below. */ + @PageFieldSelect(cssQuery = ".view-analyse .view-link", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List questionUrls; + + public String getPaperName() { + return paperName; + } + + public void setPaperName(String paperName) { + this.paperName = paperName; + } + + public List getQuestionInfoTypes() { + return questionInfoTypes; + } + + public void setQuestionInfoTypes(List questionInfoTypes) { + this.questionInfoTypes = questionInfoTypes; + } + + public List getPaperUrls() { + return paperUrls; + } + + public void setPaperUrls(List paperUrls) { + this.paperUrls = paperUrls; + } + + public List getQuestionInfoTitles() { + return questionInfoTitles; + } + + public void setQuestionInfoTitles(List questionInfoTitles) { + this.questionInfoTitles = questionInfoTitles; + } + + public List getQuestionUrls() { + return questionUrls; + } + + public void setQuestionUrls(List questionUrls) { + this.questionUrls = questionUrls; + } + +} diff --git a/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java new file mode 100644 index 0000000..8213882 --- /dev/null +++ b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java @@ -0,0 +1,114 @@ +package com.tamguo; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.junit4.SpringRunner; + +import com.alibaba.fastjson.JSONArray; +import com.alibaba.fastjson.JSONObject; +import com.tamguo.dao.CrawlerPaperMapper; +import 
com.tamguo.dao.PaperMapper; +import com.tamguo.model.CrawlerPaperEntity; +import com.tamguo.model.PaperEntity; +import com.tamguo.model.enums.QuestionType; +import com.tamguo.model.vo.PaperVo; +import com.xuxueli.crawler.XxlCrawler; +import com.xuxueli.crawler.parser.PageParser; +import com.xuxueli.crawler.rundata.RunData; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class PaperCrawler { + + // 高考 + private final String SUBJECT_ID = "gaokao"; + // 科目 + private final String COURSE_ID = "likeshuxue"; + // 110000 北京 + private final String AREA_ID = "110000"; + // 年份 + private final String YEAR = "2018"; + // 真题试卷 类型(1:真题试卷,2:模拟试卷,3:押题预测,4:名校精品) + private final String PAPER_TYPE = "1"; + // 开始采集的URL + private final String START_URL = "https://tiku.baidu.com/tikupc/paperlist/1bfd700abb68a98271fefa04-16-0-2018-37-1-download"; + + private RunData runData; + + @Autowired + private PaperMapper paperMapper; + @Autowired + private CrawlerPaperMapper crawlerPaperMapper; + + @Test + public void crawler() { + XxlCrawler crawler = new XxlCrawler.Builder() + .setUrls(START_URL) + .setAllowSpread(false) + .setFailRetryCount(5) + .setThreadCount(1) + .setPageParser(new PageParser() { + + @Override + public void parse(Document html, Element pageVoElement, PaperVo paperVo) { + // 解析封装 PageVo 对象 + String pageUrl = html.baseUri(); + if(pageUrl.contains("https://tiku.baidu.com/tikupc/paperdetail")) { + System.out.println(paperVo.getPaperName()); + + PaperEntity paper = new PaperEntity(); + paper.setSubjectId(SUBJECT_ID); + paper.setCourseId(COURSE_ID); + paper.setSchoolId(""); + paper.setAreaId(AREA_ID); + paper.setCreaterId("system"); + paper.setName(paperVo.getPaperName()); + paper.setYear(YEAR); + paper.setFree("0"); + paper.setSeoTitle(paperVo.getPaperName()); + paper.setSeoKeywords(""); + paper.setSeoDescription(""); + paper.setType(PAPER_TYPE); + JSONArray entitys = new JSONArray(); + // 处理类型问题 + for(int i=0 ; i() { + + @Override + public void parse(Document html, 
Element pageVoElement, QuestionVo questionVo) { + if(StringUtils.isEmpty(questionVo.getContent())) { + runData.addUrl(html.baseUri()); + return; + } + CrawlerPaperEntity condition = new CrawlerPaperEntity(); + condition.setQuestionUrl(html.baseUri()); + System.out.println(html.baseUri()); + CrawlerPaperEntity crawlerPaper = crawlerPaperMapper.selectOne(condition); + PaperEntity paper = paperMapper.selectById(crawlerPaper.getPaperId()); + CourseEntity course = courseMapper.selectById(paper.getCourseId()); + SubjectEntity subject = subjectMapper.selectById(paper.getSubjectId()); + + QuestionType questionType = QuestionType.getQuestionType(questionVo.getQuestionType()); + + + QuestionEntity question = new QuestionEntity(); + if(questionType == QuestionType.DANXUANTI) { + if(!StringUtils.isEmpty(questionVo.getQueoptions())) { + question.setContent(questionVo.getContent() + questionVo.getQueoptions()); + }else { + question.setContent(questionVo.getContent()); + } + }else { + question.setContent(questionVo.getContent()); + } + question.setAnalysis(questionVo.getAnalysis()); + if(StringUtils.isEmpty(question.getAnalysis())) { + question.setAnalysis("


"); + } + question.setAnswer(questionVo.getAnswer()); + question.setAuditStatus("1"); + question.setChapterId(""); + question.setCourseId(course.getId()); + question.setPaperId(paper.getId()); + question.setQuestionType(questionType.getValue().toString()); + if(questionVo.getReviewPoint() != null && questionVo.getReviewPoint().size() > 0) { + question.setReviewPoint(StringUtils.join(questionVo.getReviewPoint().toArray(), ",")); + } + // 处理分数 + if(questionVo.getScore() != null) { + if(questionVo.getScore().contains("分")) { + question.setScore(questionVo.getScore()); + } + if(questionVo.getScore().contains("年")) { + question.setYear(questionVo.getScore()); + } + } + if(questionVo.getYear() != null) { + if(questionVo.getYear().contains("年")) { + question.setYear(questionVo.getYear()); + } + } + question.setSubjectId(subject.getId()); + + if (questionVo.getAnswerImages()!=null && questionVo.getAnswerImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getAnswerImages()); + for (String img: imagesSet) { + + // 下载图片文件 + String fileName = getFileName(img); + String filePath = getFilePath(); + String fileDatePath = getFileDatePath(); + + File dir = new File(filePath +fileDatePath+ "/"); + if (!dir.exists()) + dir.mkdirs(); + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + question.setAnswer(question.getAnswer().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName)); + } + question.setAnswer(question.getAnswer()); + } + + if (questionVo.getAnalysisImages()!=null && questionVo.getAnalysisImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getAnalysisImages()); + for (String img: imagesSet) { + + // 下载图片文件 + String fileName = getFileName(img); + String filePath = getFilePath(); + String fileDatePath = getFileDatePath(); + + File dir = new File(filePath 
+fileDatePath+ "/"); + if (!dir.exists()) + dir.mkdirs(); + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + question.setAnalysis(question.getAnalysis().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName)); + } + question.setAnalysis(question.getAnalysis()); + } + + if (questionVo.getContentImages()!=null && questionVo.getContentImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getContentImages()); + for (String img: imagesSet) { + + // 下载图片文件 + String fileName = getFileName(img); + String filePath = getFilePath(); + String fileDatePath = getFileDatePath(); + File dir = new File(filePath +fileDatePath+ "/"); + if (!dir.exists()) { + dir.mkdirs(); + } + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath +fileDatePath+ "/", fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + question.setContent(question.getContent().replace(img, DOMAIN + "/files/paper/" + COURSE_ID + '/' + fileDatePath + "/" + fileName)); + } + question.setContent(question.getContent()); + } + + + // 处理图片 + question.setSourceType("baidu"); + question.setSourceUrl(html.baseUri()); + questionMapper.insert(question); + } + + public String getFileName(String img) { + return getFileNo() + img.substring(img.lastIndexOf(".")); + } + + private String getFilePath() { + return "/home/webdata/files/paper/" + COURSE_ID + "/"; + } + + private String getFileDatePath() { + SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + String format = sdf.format(new Date()); + return format; + } + + private String getFileNo() { + SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + String format = sdf.format(new Date()); + DecimalFormat df = new DecimalFormat(FILES_NO_FORMAT); + String key = FILES_PREFIX + format; + Long incr = 
cacheService.incr(key); + String avatorNo = FILES_PREFIX + df.format(incr); + return avatorNo; + } + }).build(); + + runData = crawler.getRunData(); + int page = 1; + int pageSize = 1000; + while(true) { + Page questionPage = new Page(page , pageSize); + List questionList = crawlerPaperMapper.selectPage(questionPage, Condition.create().orderDesc(Arrays.asList("id"))); + for(int i=0 ;i vars = new HashMap<>(); vars.put("domainName", env.getProperty("domain.name")); vars.put("adminDomain", env.getProperty("admin.domain.name")); - vars.put("PAPER_TYPE_ZHENTI", "1"); - vars.put("PAPER_TYPE_MONI", "2"); - vars.put("PAPER_TYPE_YATI", "3"); - vars.put("PAPER_TYPE_MINGXIAO", "4"); + vars.put("PAPER_TYPE_ZHENTI", SystemConstant.ZHENGTI_PAPER_ID); + vars.put("PAPER_TYPE_MONI", SystemConstant.MONI_PAPER_ID); + vars.put("PAPER_TYPE_YATI", SystemConstant.YATI_PAPER_ID); + vars.put("PAPER_TYPE_MINGXIAO", SystemConstant.MINGXIAO_PAPER_ID); viewResolver.setStaticVariables(vars); } } diff --git a/tamguo-tms/src/main/resources/templates/index.html b/tamguo-tms/src/main/resources/templates/index.html index 9032233..1750402 100644 --- a/tamguo-tms/src/main/resources/templates/index.html +++ b/tamguo-tms/src/main/resources/templates/index.html @@ -107,7 +107,7 @@

试卷资源 一考知底,高分必刷,全面提分 当前位置: - 更多地区 > + 更多地区 >

@@ -136,11 +136,11 @@ @@ -151,7 +151,7 @@