From f9fc006a7bf19eaaff695cfff20818f9eb3cdb22 Mon Sep 17 00:00:00 2001 From: tamguo Date: Tue, 3 Jul 2018 18:29:19 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E5=8F=96=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/tamguo/dao/ChapterMapper.java | 9 ++ .../java/com/tamguo/model/ChapterEntity.java | 79 ++++++++++++ .../java/com/tamguo/model/CourseEntity.java | 10 -- .../java/com/tamguo/model/vo/SubjectVo.java | 66 +++++++++- .../tamguo/service/impl/SubjectService.java | 122 ++++++++++++++++-- .../src/main/resources/application.properties | 2 +- .../main/resources/mappers/ChapterMapper.xml | 6 + .../src/main/resources/application.properties | 2 +- 8 files changed, 270 insertions(+), 26 deletions(-) create mode 100644 tamguo-crawler/src/main/java/com/tamguo/dao/ChapterMapper.java create mode 100644 tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java create mode 100644 tamguo-crawler/src/main/resources/mappers/ChapterMapper.xml diff --git a/tamguo-crawler/src/main/java/com/tamguo/dao/ChapterMapper.java b/tamguo-crawler/src/main/java/com/tamguo/dao/ChapterMapper.java new file mode 100644 index 0000000..cc10c3d --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/dao/ChapterMapper.java @@ -0,0 +1,9 @@ +package com.tamguo.dao; + +import com.tamguo.config.dao.SuperMapper; +import com.tamguo.model.ChapterEntity; + +public interface ChapterMapper extends SuperMapper{ + + +} diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java new file mode 100644 index 0000000..31fe3bc --- /dev/null +++ b/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java @@ -0,0 +1,79 @@ +package com.tamguo.model; + +import java.io.Serializable; +import com.baomidou.mybatisplus.annotations.TableName; +import com.tamguo.config.dao.SuperEntity; + + +/** + * The persistent class for the tiku_chapter database table. + * + */ +@TableName(value="tiku_chapter") +public class ChapterEntity extends SuperEntity implements Serializable { + private static final long serialVersionUID = 1L; + + private String courseId; + + private String name; + + private String parentId; + + private Integer questionNum; + + private Integer pointNum; + + private Integer orders; + + public ChapterEntity() { + } + + public String getCourseId() { + return this.courseId; + } + + public void setCourseId(String courseId) { + this.courseId = courseId; + } + + public String getName() { + return this.name; + } + + public void setName(String name) { + this.name = name; + } + + public String getParentId() { + return this.parentId; + } + + public void setParentId(String parentId) { + this.parentId = parentId; + } + + public Integer getQuestionNum() { + return questionNum; + } + + public void setQuestionNum(Integer questionNum) { + this.questionNum = questionNum; + } + + public Integer getPointNum() { + return pointNum; + } + + public void setPointNum(Integer pointNum) { + this.pointNum = pointNum; + } + + public Integer getOrders() { + return orders; + } + + public void setOrders(Integer orders) { + this.orders = orders; + } + +} \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/CourseEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/CourseEntity.java index e0a6dc5..fd4ff6a 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/CourseEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/CourseEntity.java @@ -23,8 +23,6 @@ public class CourseEntity extends SuperEntity implements Serializa private BigInteger questionNum; - private String icon; - private Integer orders; private String seoTitle; @@ -68,14 +66,6 @@ public class CourseEntity extends SuperEntity implements Serializa this.pointNum = pointNum; } - public String getIcon() { - return icon; - } - - public void setIcon(String icon) { - this.icon = icon; - } - public Integer getOrders() { return orders; } diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java index d259152..df3c05f 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java @@ -4,6 +4,7 @@ import java.util.List; import com.xuxueli.crawler.annotation.PageFieldSelect; import com.xuxueli.crawler.annotation.PageSelect; +import com.xuxueli.crawler.conf.XxlCrawlerConf; @PageSelect(cssQuery = "body") public class SubjectVo { @@ -11,11 +12,32 @@ public class SubjectVo { @PageFieldSelect(cssQuery = ".all-list-li") private List name; + // 类型名称 + @PageFieldSelect(cssQuery=".submenu-contain .contain-title") + private String subjectName; + + // 科目信息 @PageFieldSelect(cssQuery=".course-list-container .course-list .course-item") private List courseName; - @PageFieldSelect(cssQuery=".submenu-contain .contain-title") - private String subjectName; + // 带采集的科目URLs + @PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List courseUrls; + + + @PageFieldSelect(cssQuery=".screening .selected a") + private String chapterPageCourseName; + + @PageFieldSelect(cssQuery=".screening .selected a") + private String chapterCurrName; + + // 带采集的章节URLs缓存 + @PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List chapterUrlsTemp; + + // 待采集的章节URLs + @PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private List chapterUrls; public List getName() { return name; @@ -41,4 +63,44 @@ public class SubjectVo { this.subjectName = subjectName; } + public List getCourseUrls() { + return courseUrls; + } + + public void setCourseUrls(List courseUrls) { + this.courseUrls = courseUrls; + } + + public List getChapterUrls() { + return chapterUrls; + } + + public void setChapterUrls(List chapterUrls) { + this.chapterUrls = chapterUrls; + } + + public List getChapterUrlsTemp() { + return chapterUrlsTemp; + } + + public void setChapterUrlsTemp(List chapterUrlsTemp) { + this.chapterUrlsTemp = chapterUrlsTemp; + } + + public String getChapterPageCourseName() { + return chapterPageCourseName; + } + + public void setChapterPageCourseName(String chapterPageCourseName) { + this.chapterPageCourseName = chapterPageCourseName; + } + + public String getChapterCurrName() { + return chapterCurrName; + } + + public void setChapterCurrName(String chapterCurrName) { + this.chapterCurrName = chapterCurrName; + } + } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java index e7241fd..63fb6c2 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java @@ -1,8 +1,9 @@ package com.tamguo.service.impl; import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; -import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -11,8 +12,10 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import com.tamguo.dao.ChapterMapper; import com.tamguo.dao.CourseMapper; import com.tamguo.dao.SubjectMapper; +import com.tamguo.model.ChapterEntity; import com.tamguo.model.CourseEntity; import com.tamguo.model.SubjectEntity; import com.tamguo.model.vo.SubjectVo; @@ -28,16 +31,22 @@ public class SubjectService implements ISubjectService{ SubjectMapper subjectMapper; @Autowired CourseMapper courseMapper; + @Autowired + ChapterMapper chapterMapper; private Logger logger = LoggerFactory.getLogger(getClass()); + private List urls = new ArrayList<>(); + private RunData runData; @Override public void crawlerSubject() { XxlCrawler crawler = new XxlCrawler.Builder() .setUrls("https://tiku.baidu.com/") - .setWhiteUrlRegexs("https://tiku\\.baidu\\.com/tikupc/homepage/\\w+" , "https://tiku.baidu.com/") + .setWhiteUrlRegexs("https://tiku.baidu.com/tikupc/homepage/\\w+","https://tiku.baidu.com/tikupc/homepage/\\w+" + , "https://tiku.baidu.com/" + , "https://tiku.baidu.com/tikupc/chapterlist/.*") .setPageParser(new PageParser() { @Override @@ -59,15 +68,12 @@ public class SubjectService implements ISubjectService{ } entity.setName(name); subjectMapper.insert(entity); - - // 获取Course - Elements elements = pageVoElement.getElementsByClass("all-list-li"); - for(int k=0 ; k + + + + + \ No newline at end of file diff --git a/tamguo/src/main/resources/application.properties b/tamguo/src/main/resources/application.properties index ae28523..fa403f2 100644 --- a/tamguo/src/main/resources/application.properties +++ b/tamguo/src/main/resources/application.properties @@ -12,7 +12,7 @@ spring.datasource.maxPoolPreparedStatementPerConnectionSize=20 spring.datasource.maxWait=60000 spring.datasource.minEvictableIdleTimeMillis=300000 spring.datasource.minIdle=5 -spring.datasource.password= +spring.datasource.password=Tanguo spring.datasource.poolPreparedStatements=true spring.datasource.testOnBorrow=false spring.datasource.testOnReturn=false