爬取数据

main
tamguo 7 years ago
parent c8fd2f945e
commit f9fc006a7b

@ -0,0 +1,9 @@
package com.tamguo.dao;
import com.tamguo.config.dao.SuperMapper;
import com.tamguo.model.ChapterEntity;
/**
 * Mapper for {@link ChapterEntity} records.
 *
 * <p>Declares no methods of its own; every operation callers invoke on it
 * (for example {@code selectOne} and {@code insert}) is inherited from
 * {@link SuperMapper}.
 */
public interface ChapterMapper extends SuperMapper<ChapterEntity>{
}

@ -0,0 +1,79 @@
package com.tamguo.model;
import java.io.Serializable;
import com.baomidou.mybatisplus.annotations.TableName;
import com.tamguo.config.dao.SuperEntity;
/**
 * Persistent entity for one row of the {@code tiku_chapter} table.
 *
 * <p>Chapters reference their course through {@code courseId} and their
 * parent chapter through {@code parentId}, so rows can be arranged as a tree.
 * All properties are plain mutable bean properties without validation.
 */
@TableName("tiku_chapter")
public class ChapterEntity extends SuperEntity<ChapterEntity> implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Identifier of the course this chapter belongs to. */
    private String courseId;

    /** Display name of the chapter. */
    private String name;

    /** Identifier of the parent chapter. */
    private String parentId;

    /** Counter kept with the chapter; presumably the number of questions (TODO confirm). */
    private Integer questionNum;

    /** Counter kept with the chapter; presumably the number of knowledge points (TODO confirm). */
    private Integer pointNum;

    /** Ordering value of the chapter. */
    private Integer orders;

    /** Creates an entity with every property unset. */
    public ChapterEntity() {
    }

    /** @return the identifier of the owning course */
    public String getCourseId() {
        return this.courseId;
    }

    /** @param courseId the identifier of the owning course */
    public void setCourseId(String courseId) {
        this.courseId = courseId;
    }

    /** @return the chapter name */
    public String getName() {
        return this.name;
    }

    /** @param name the chapter name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the identifier of the parent chapter */
    public String getParentId() {
        return this.parentId;
    }

    /** @param parentId the identifier of the parent chapter */
    public void setParentId(String parentId) {
        this.parentId = parentId;
    }

    /** @return the stored {@code questionNum} value, possibly {@code null} */
    public Integer getQuestionNum() {
        return this.questionNum;
    }

    /** @param questionNum the new {@code questionNum} value */
    public void setQuestionNum(Integer questionNum) {
        this.questionNum = questionNum;
    }

    /** @return the stored {@code pointNum} value, possibly {@code null} */
    public Integer getPointNum() {
        return this.pointNum;
    }

    /** @param pointNum the new {@code pointNum} value */
    public void setPointNum(Integer pointNum) {
        this.pointNum = pointNum;
    }

    /** @return the stored ordering value, possibly {@code null} */
    public Integer getOrders() {
        return this.orders;
    }

    /** @param orders the new ordering value */
    public void setOrders(Integer orders) {
        this.orders = orders;
    }
}

@ -23,8 +23,6 @@ public class CourseEntity extends SuperEntity<CourseEntity> implements Serializa
private BigInteger questionNum;
private String icon;
private Integer orders;
private String seoTitle;
@ -68,14 +66,6 @@ public class CourseEntity extends SuperEntity<CourseEntity> implements Serializa
this.pointNum = pointNum;
}
/**
 * Returns the course icon value.
 *
 * @return the current {@code icon} field, unchanged
 */
public String getIcon() {
    return this.icon;
}
/**
 * Replaces the course icon value.
 *
 * @param icon the value to store in the {@code icon} field; stored as given
 */
public void setIcon(String icon) {
this.icon = icon;
}
/**
 * Returns the ordering value of the course.
 *
 * @return the current {@code orders} field, unchanged
 */
public Integer getOrders() {
    return this.orders;
}

@ -4,6 +4,7 @@ import java.util.List;
import com.xuxueli.crawler.annotation.PageFieldSelect;
import com.xuxueli.crawler.annotation.PageSelect;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
@PageSelect(cssQuery = "body")
public class SubjectVo {
@ -11,11 +12,32 @@ public class SubjectVo {
@PageFieldSelect(cssQuery = ".all-list-li")
private List<String> name;
// 类型名称
@PageFieldSelect(cssQuery=".submenu-contain .contain-title")
private String subjectName;
// 科目信息
@PageFieldSelect(cssQuery=".course-list-container .course-list .course-item")
private List<String> courseName;
@PageFieldSelect(cssQuery=".submenu-contain .contain-title")
private String subjectName;
// Course URLs waiting to be crawled
@PageFieldSelect(cssQuery = ".all-list-li a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> courseUrls;
@PageFieldSelect(cssQuery=".screening .selected a")
private String chapterPageCourseName;
@PageFieldSelect(cssQuery=".screening .selected a")
private String chapterCurrName;
// Temporary cache of chapter URLs waiting to be crawled
@PageFieldSelect(cssQuery = ".main-submenu .contain-ul .contain-li:eq(1) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> chapterUrlsTemp;
// 待采集的章节URLs
@PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> chapterUrls;
public List<String> getName() {
return name;
@ -41,4 +63,44 @@ public class SubjectVo {
this.subjectName = subjectName;
}
/**
 * Returns the course URLs held by this page object.
 *
 * <p>The backing field is annotated to select the absolute {@code href} of
 * every {@code .all-list-li a} element.
 *
 * @return the {@code courseUrls} list itself, not a copy
 */
public List<String> getCourseUrls() {
    return this.courseUrls;
}
/**
 * Replaces the course URL list.
 *
 * @param courseUrls the list to store; kept by reference, not copied
 */
public void setCourseUrls(List<String> courseUrls) {
this.courseUrls = courseUrls;
}
/**
 * Returns the chapter URLs held by this page object.
 *
 * <p>The backing field is annotated to select the absolute {@code href} of
 * {@code .screening .sc-subject li:not(.selected) a} elements, i.e. the
 * entries that are not marked as selected.
 *
 * @return the {@code chapterUrls} list itself, not a copy
 */
public List<String> getChapterUrls() {
    return this.chapterUrls;
}
/**
 * Replaces the chapter URL list.
 *
 * @param chapterUrls the list to store; kept by reference, not copied
 */
public void setChapterUrls(List<String> chapterUrls) {
this.chapterUrls = chapterUrls;
}
/**
 * Returns the temporary chapter URL list held by this page object.
 *
 * <p>The backing field is annotated to select the absolute {@code href} of
 * {@code .main-submenu .contain-ul .contain-li:eq(1) a} elements.
 *
 * @return the {@code chapterUrlsTemp} list itself, not a copy
 */
public List<String> getChapterUrlsTemp() {
    return this.chapterUrlsTemp;
}
/**
 * Replaces the temporary chapter URL list.
 *
 * @param chapterUrlsTemp the list to store; kept by reference, not copied
 */
public void setChapterUrlsTemp(List<String> chapterUrlsTemp) {
this.chapterUrlsTemp = chapterUrlsTemp;
}
/**
 * Returns the course name read from a chapter page.
 *
 * <p>The backing field is annotated with the selector
 * {@code .screening .selected a}.
 * NOTE(review): {@code chapterCurrName} uses the very same selector, so both
 * properties are expected to carry the same text; confirm this is intended.
 *
 * @return the current {@code chapterPageCourseName} value
 */
public String getChapterPageCourseName() {
    return this.chapterPageCourseName;
}
/**
 * Replaces the course name read from a chapter page.
 *
 * @param chapterPageCourseName the value to store, as given
 */
public void setChapterPageCourseName(String chapterPageCourseName) {
this.chapterPageCourseName = chapterPageCourseName;
}
/**
 * Returns the name of the currently selected chapter entry.
 *
 * <p>The backing field is annotated with the selector
 * {@code .screening .selected a}.
 * NOTE(review): {@code chapterPageCourseName} uses the very same selector;
 * confirm the two properties are meant to be identical.
 *
 * @return the current {@code chapterCurrName} value
 */
public String getChapterCurrName() {
    return this.chapterCurrName;
}
/**
 * Replaces the name of the currently selected chapter entry.
 *
 * @param chapterCurrName the value to store, as given
 */
public void setChapterCurrName(String chapterCurrName) {
this.chapterCurrName = chapterCurrName;
}
}

@ -1,8 +1,9 @@
package com.tamguo.service.impl;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
@ -11,8 +12,10 @@ import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.tamguo.dao.ChapterMapper;
import com.tamguo.dao.CourseMapper;
import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity;
import com.tamguo.model.CourseEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.vo.SubjectVo;
@ -28,16 +31,22 @@ public class SubjectService implements ISubjectService{
SubjectMapper subjectMapper;
@Autowired
CourseMapper courseMapper;
@Autowired
ChapterMapper chapterMapper;
private Logger logger = LoggerFactory.getLogger(getClass());
private List<String> urls = new ArrayList<>();
private RunData runData;
@Override
public void crawlerSubject() {
XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("https://tiku.baidu.com/")
.setWhiteUrlRegexs("https://tiku\\.baidu\\.com/tikupc/homepage/\\w+" , "https://tiku.baidu.com/")
.setWhiteUrlRegexs("https://tiku.baidu.com/tikupc/homepage/\\w+","https://tiku.baidu.com/tikupc/homepage/\\w+"
, "https://tiku.baidu.com/"
, "https://tiku.baidu.com/tikupc/chapterlist/.*")
.setPageParser(new PageParser<SubjectVo>() {
@Override
@ -59,15 +68,12 @@ public class SubjectService implements ISubjectService{
}
entity.setName(name);
subjectMapper.insert(entity);
// 获取Course
Elements elements = pageVoElement.getElementsByClass("all-list-li");
for(int k=0 ; k<elements.size() ; k++) {
Element element = elements.get(k);
String url = element.child(0).attr("href");
runData.addUrl(url);
}
}
// 加入科目爬取数据
for(String url : subjectVo.getCourseUrls()) {
runData.addUrl(url);
}
}
if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) {
@ -77,7 +83,6 @@ public class SubjectService implements ISubjectService{
SubjectEntity subject = subjectMapper.findByName(subjectVo.getSubjectName());
CourseEntity course = new CourseEntity();
course.setIcon(StringUtils.EMPTY);
course.setName(subjectVo.getCourseName().get(i));
course.setOrders(i+1);
course.setPointNum(BigInteger.ZERO);
@ -89,9 +94,102 @@ public class SubjectService implements ISubjectService{
courseMapper.insert(course);
}
// Queue the chapter-list URLs found on this course page for crawling
for(String url : subjectVo.getChapterUrlsTemp()) {
runData.addUrl(url);
}
}
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterlist/")) {
logger.info("开始解析章节:{}" , pageUrl);
CourseEntity courseCondition = new CourseEntity();
courseCondition.setName(subjectVo.getChapterPageCourseName());
CourseEntity c = courseMapper.selectOne(courseCondition);
if(c == null) {
runData.addUrl(pageUrl);
return;
}
ChapterEntity chapterCondition = new ChapterEntity();
chapterCondition.setName(subjectVo.getChapterCurrName());
ChapterEntity chapterEntity = chapterMapper.selectOne(chapterCondition);
if(chapterEntity != null) {
return;
}
ChapterEntity rootChapter = new ChapterEntity();
rootChapter.setCourseId(c.getUid());
rootChapter.setParentId("-1");
rootChapter.setName(subjectVo.getChapterCurrName());
rootChapter.setQuestionNum(0);
rootChapter.setPointNum(0);
rootChapter.setOrders(0);
chapterMapper.insert(rootChapter);
Elements elements = pageVoElement.getElementsByClass("detail-chapter");
for(int n=0 ; n<elements.size() ; n++) {
Element element = elements.get(n);
String chapterName = element.getElementsByClass("detail-chapter-title").get(0).getElementsByTag("h3").text();
logger.info(chapterName);
ChapterEntity chapter = new ChapterEntity();
chapter.setCourseId(c.getUid());
chapter.setParentId(rootChapter.getUid());
chapter.setName(chapterName);
chapter.setQuestionNum(0);
chapter.setPointNum(0);
chapter.setOrders(n+1);
chapterMapper.insert(chapter);
Elements detailKpoint1s = element.getElementsByClass("detail-kpoint-1");
for(Element detailKpoint1 : detailKpoint1s) {
Elements kpoint1Titles = detailKpoint1.getElementsByClass("kpoint-1-title");
for(int i=0 ; i<kpoint1Titles.size() ; i++) {
Element kpoint1Title = kpoint1Titles.get(i);
String chapterName1 = kpoint1Title.getElementsByTag("h4").text();
logger.info(chapterName1);
ChapterEntity chapter1 = new ChapterEntity();
chapter1.setCourseId(c.getUid());
chapter1.setParentId(chapter.getUid());
chapter1.setName(chapterName1);
chapter1.setQuestionNum(0);
chapter1.setPointNum(0);
chapter1.setOrders(i+1);
chapterMapper.insert(chapter1);
Elements detailKpoint2s = detailKpoint1.getElementsByClass("detail-kpoint-2");
for(int k=0 ; k<detailKpoint2s.size() ; k++) {
Element detailKpoint = detailKpoint2s.get(k);
String chapterName2 = detailKpoint.getElementsByTag("h5").text();
logger.info(chapterName2);
ChapterEntity chapter2 = new ChapterEntity();
chapter2.setCourseId(c.getUid());
chapter2.setParentId(chapter1.getUid());
chapter2.setName(chapterName2);
chapter2.setQuestionNum(0);
chapter2.setPointNum(0);
chapter2.setOrders(k+1);
chapterMapper.insert(chapter2);
}
}
}
}
// Remember this page as already crawled so it is not queued again below
urls.add(pageUrl);
// Queue the remaining chapter URLs that have not been crawled yet
for(String url : subjectVo.getChapterUrls()) {
if(url.equals("https://tiku.baidu.com"+pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("href"))) {
continue;
}
if(!urls.contains(url)) {
runData.addUrl(url);
}
}
}
}
}).build();
runData = crawler.getRunData();

@ -7,7 +7,7 @@ spring.datasource.maxPoolPreparedStatementPerConnectionSize=20
spring.datasource.maxWait=60000
spring.datasource.minEvictableIdleTimeMillis=300000
spring.datasource.minIdle=5
spring.datasource.password=
spring.datasource.password=Tanguo
spring.datasource.poolPreparedStatements=true
spring.datasource.testOnBorrow=false
spring.datasource.testOnReturn=false

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.tamguo.dao.ChapterMapper">
<!-- Bound to the com.tamguo.dao.ChapterMapper interface. No custom SQL statements are declared here. -->
</mapper>

@ -12,7 +12,7 @@ spring.datasource.maxPoolPreparedStatementPerConnectionSize=20
spring.datasource.maxWait=60000
spring.datasource.minEvictableIdleTimeMillis=300000
spring.datasource.minIdle=5
spring.datasource.password=
spring.datasource.password=Tanguo
spring.datasource.poolPreparedStatements=true
spring.datasource.testOnBorrow=false
spring.datasource.testOnReturn=false

Loading…
Cancel
Save