From 8d4d11bc9a3ca9df6007b7d49f2e1d902b5c08ee Mon Sep 17 00:00:00 2001 From: tamguo Date: Sat, 11 Aug 2018 17:38:25 +0800 Subject: [PATCH] =?UTF-8?q?seo=20=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/tamguo/model/CrawlerPaperEntity.java | 10 ++++++++++ .../src/main/java/com/tamguo/model/vo/PaperVo.java | 11 +++++++++++ .../src/test/java/com/tamguo/PaperCrawler.java | 10 +++++++--- .../test/java/com/tamguo/PaperQuestionCrawler.java | 8 ++++---- .../java/com/tamguo/config/web/ThymeleafConfig.java | 1 + tamguo-tms/src/main/resources/templates/chapter.html | 5 +++-- tamguo-tms/src/main/resources/templates/index.html | 2 +- tamguo-tms/src/main/resources/templates/paper.html | 4 ++-- .../src/main/resources/templates/paperlist.html | 6 +++--- .../src/main/resources/templates/questionList.html | 2 +- 10 files changed, 43 insertions(+), 16 deletions(-) diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java index 8361180..3c6ad09 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CrawlerPaperEntity.java @@ -12,6 +12,8 @@ public class CrawlerPaperEntity extends SuperEntity{ private String paperId; + private Integer queindex; + public String getQuestionUrl() { return questionUrl; } @@ -31,4 +33,12 @@ public class CrawlerPaperEntity extends SuperEntity{ public void setPaperId(String paperId) { this.paperId = paperId; } + + public Integer getQueindex() { + return queindex; + } + + public void setQueindex(Integer queindex) { + this.queindex = queindex; + } } diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java index 7b0d085..ab87279 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/PaperVo.java @@ -24,6 +24,9 @@ public class PaperVo { @PageFieldSelect(cssQuery = ".view-analyse .view-link", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") private List questionUrls; + + @PageFieldSelect(cssQuery = ".question-box-inner .queindex-wrap .queindex") + private List queindexs; public String getPaperName() { return paperName; @@ -64,5 +67,13 @@ public class PaperVo { public void setQuestionUrls(List questionUrls) { this.questionUrls = questionUrls; } + + public List getQueindexs() { + return queindexs; + } + + public void setQueindexs(List queindexs) { + this.queindexs = queindexs; + } } diff --git a/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java index 8213882..4fe79d1 100644 --- a/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java +++ b/tamguo-crawler/src/test/java/com/tamguo/PaperCrawler.java @@ -18,6 +18,7 @@ import com.tamguo.model.enums.QuestionType; import com.tamguo.model.vo.PaperVo; import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.parser.PageParser; +import com.xuxueli.crawler.parser.strategy.HtmlUnitPageLoader; import com.xuxueli.crawler.rundata.RunData; @RunWith(SpringRunner.class) @@ -31,11 +32,11 @@ public class PaperCrawler { // 110000 北京 private final String AREA_ID = "110000"; // 年份 - private final String YEAR = "2018"; + private final String YEAR = "2017"; // 真题试卷 类型(1:真题试卷,2:模拟试卷,3:押题预测,4:名校精品) private final String PAPER_TYPE = "1"; // 开始采集的URL - private final String START_URL = "https://tiku.baidu.com/tikupc/paperlist/1bfd700abb68a98271fefa04-16-0-2018-37-1-download"; + private final String START_URL = "https://tiku.baidu.com/tikupc/paperlist/1bfd700abb68a98271fefa04-16-1-2017-37-1-download"; private RunData runData; @@ -51,6 +52,7 @@ public class PaperCrawler { .setAllowSpread(false) .setFailRetryCount(5) .setThreadCount(1) + .setPageLoader(new HtmlUnitPageLoader()) .setPageParser(new PageParser() { @Override @@ -89,11 +91,13 @@ public class PaperCrawler { paper.setQuestionInfo(entitys.toJSONString()); paperMapper.insert(paper); - // 插入图片 + // 插入 for(int i=0 ; i() { @@ -215,13 +215,13 @@ public class PaperQuestionCrawler { } private String getFileDatePath() { - SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmm"); String format = sdf.format(new Date()); return format; } private String getFileNo() { - SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmm"); String format = sdf.format(new Date()); DecimalFormat df = new DecimalFormat(FILES_NO_FORMAT); String key = FILES_PREFIX + format; @@ -236,7 +236,7 @@ public class PaperQuestionCrawler { int pageSize = 1000; while(true) { Page questionPage = new Page(page , pageSize); - List questionList = crawlerPaperMapper.selectPage(questionPage, Condition.create().orderDesc(Arrays.asList("id"))); + List questionList = crawlerPaperMapper.selectPage(questionPage, Condition.create().orderAsc(Arrays.asList("queindex"))); for(int i=0 ;i
1.1 集合的含义
-

+

531 道题

-