爬取数据

main
tamguo 7 years ago
parent 6e23c61910
commit 99406fab06

@ -70,6 +70,17 @@
<artifactId>xxl-crawler</artifactId>
<version>1.2.1</version>
</dependency>
<!-- htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
</dependencies>
<dependencyManagement>

@ -0,0 +1,9 @@
package com.tamguo.dao;
import com.tamguo.config.dao.SuperMapper;
import com.tamguo.model.CrawlerQuestionEntity;
/**
 * MyBatis mapper for {@link CrawlerQuestionEntity}.
 *
 * <p>Declares no methods of its own; every operation available on this mapper
 * comes from {@link SuperMapper}.
 */
public interface CrawlerQuestionMapper extends SuperMapper<CrawlerQuestionEntity>{
}

@ -0,0 +1,56 @@
package com.tamguo.model;
import java.io.Serializable;
import com.baomidou.mybatisplus.annotations.TableName;
import com.tamguo.config.dao.SuperEntity;
/**
 * Entity mapped to the {@code crawler_question} table.
 *
 * <p>Holds a question URL, the id of the chapter the URL belongs to, and a
 * status string.
 */
@TableName(value = "crawler_question")
public class CrawlerQuestionEntity extends SuperEntity<CrawlerQuestionEntity> implements Serializable {

    private static final long serialVersionUID = 1L;

    /** URL of the question page. */
    private String questionUrl;

    /** Id of the chapter this question URL is attached to. */
    private String chapterId;

    /** Status marker, stored as a string. */
    private String status;

    /**
     * Exposes the serialization version constant.
     *
     * @return the value of {@code serialVersionUID}
     */
    public static long getSerialversionuid() {
        return CrawlerQuestionEntity.serialVersionUID;
    }

    /**
     * @return the question URL, as last assigned
     */
    public String getQuestionUrl() {
        return this.questionUrl;
    }

    /**
     * @param questionUrl the question URL to store
     */
    public void setQuestionUrl(String questionUrl) {
        this.questionUrl = questionUrl;
    }

    /**
     * @return the chapter id, as last assigned
     */
    public String getChapterId() {
        return this.chapterId;
    }

    /**
     * @param chapterId the chapter id to store
     */
    public void setChapterId(String chapterId) {
        this.chapterId = chapterId;
    }

    /**
     * @return the status string, as last assigned
     */
    public String getStatus() {
        return this.status;
    }

    /**
     * @param status the status string to store
     */
    public void setStatus(String status) {
        this.status = status;
    }
}

@ -39,6 +39,30 @@ public class SubjectVo {
@PageFieldSelect(cssQuery = ".screening .sc-subject li:not(.selected) a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> chapterUrls;
// Question URLs still to be collected: absolute hrefs of the ".mask a" links
// nested under .list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2
@PageFieldSelect(cssQuery = ".list-right .detail-chapter .detail-kpoint-1 .detail-kpoint-2 .mask a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> questionUrlsTemp;
// Question URLs still to be collected: absolute hrefs of the ".view-analyse a"
// links inside .bd-content .question-box .question-box-inner
@PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> questionUrls;
// Data of a single question
// HTML selected by ".question-box-inner .questem-inner"
@PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML)
private String content;
// HTML of every ".exam-answer-content" match (a list, so several matches can be kept)
@PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML)
private List<String> answer;
// HTML selected by ".exam-analysis .exam-analysis-content"
@PageFieldSelect(cssQuery=".exam-analysis .exam-analysis-content", selectType = XxlCrawlerConf.SelectType.HTML)
private String analysis;
// Text of the ".que-title span:eq(0)" match
@PageFieldSelect(cssQuery=".que-title span:eq(0)",selectType = XxlCrawlerConf.SelectType.TEXT)
private String questionType;
// Text of the ".que-title span:eq(1)" match
@PageFieldSelect(cssQuery=".que-title span:eq(1)",selectType = XxlCrawlerConf.SelectType.TEXT)
private String score;
/**
 * @return the list held in the {@code name} field, as last assigned
 */
public List<String> getName() {
    return this.name;
}
@ -103,4 +127,60 @@ public class SubjectVo {
this.chapterCurrName = chapterCurrName;
}
/**
 * @return the {@code questionUrlsTemp} list, as last assigned
 */
public List<String> getQuestionUrlsTemp() {
    return this.questionUrlsTemp;
}
/**
 * @param questionUrlsTemp the list to store in {@code questionUrlsTemp}
 */
public void setQuestionUrlsTemp(List<String> questionUrlsTemp) {
    this.questionUrlsTemp = questionUrlsTemp;
}
/**
 * @return the {@code questionUrls} list, as last assigned
 */
public List<String> getQuestionUrls() {
    return this.questionUrls;
}
/**
 * @param questionUrls the list to store in {@code questionUrls}
 */
public void setQuestionUrls(List<String> questionUrls) {
    this.questionUrls = questionUrls;
}
/**
 * @return the {@code analysis} value, as last assigned
 */
public String getAnalysis() {
    return this.analysis;
}
/**
 * @param analysis the value to store in {@code analysis}
 */
public void setAnalysis(String analysis) {
    this.analysis = analysis;
}
/**
 * @return the {@code content} value, as last assigned
 */
public String getContent() {
    return this.content;
}
/**
 * @param content the value to store in {@code content}
 */
public void setContent(String content) {
    this.content = content;
}
/**
 * @return the {@code questionType} value, as last assigned
 */
public String getQuestionType() {
    return this.questionType;
}
/**
 * @param questionType the value to store in {@code questionType}
 */
public void setQuestionType(String questionType) {
    this.questionType = questionType;
}
/**
 * @return the {@code score} value, as last assigned
 */
public String getScore() {
    return this.score;
}
/**
 * @param score the value to store in {@code score}
 */
public void setScore(String score) {
    this.score = score;
}
/**
 * @return the {@code answer} list, as last assigned
 */
public List<String> getAnswer() {
    return this.answer;
}
/**
 * @param answer the list to store in {@code answer}
 */
public void setAnswer(List<String> answer) {
    this.answer = answer;
}
}

@ -1,8 +1,10 @@
package com.tamguo.service.impl;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@ -14,9 +16,11 @@ import org.springframework.stereotype.Service;
import com.tamguo.dao.ChapterMapper;
import com.tamguo.dao.CourseMapper;
import com.tamguo.dao.CrawlerQuestionMapper;
import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity;
import com.tamguo.model.CourseEntity;
import com.tamguo.model.CrawlerQuestionEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.vo.SubjectVo;
import com.tamguo.service.ISubjectService;
@ -33,10 +37,16 @@ public class SubjectService implements ISubjectService{
CourseMapper courseMapper;
@Autowired
ChapterMapper chapterMapper;
@Autowired
CrawlerQuestionMapper crawlerQuestionMapper;
private Logger logger = LoggerFactory.getLogger(getClass());
private List<String> urls = new ArrayList<>();
private Set<String> urls = new HashSet<>();
private Set<String> questionUrls = new HashSet<String>();
private Map<String, Object> chapterQuestionListMap = new HashMap<>();
private RunData runData;
@ -44,9 +54,7 @@ public class SubjectService implements ISubjectService{
public void crawlerSubject() {
XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("https://tiku.baidu.com/")
.setWhiteUrlRegexs("https://tiku.baidu.com/tikupc/homepage/\\w+","https://tiku.baidu.com/tikupc/homepage/\\w+"
, "https://tiku.baidu.com/"
, "https://tiku.baidu.com/tikupc/chapterlist/.*")
.setAllowSpread(false)
.setPageParser(new PageParser<SubjectVo>() {
@Override
@ -75,7 +83,6 @@ public class SubjectService implements ISubjectService{
runData.addUrl(url);
}
}
if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) {
logger.info("开始解析科目分类:{}" , pageUrl);
for(int i=0 ; i<subjectVo.getCourseName().size() ; i++) {
@ -178,6 +185,14 @@ public class SubjectService implements ISubjectService{
chapter2.setPointNum(0);
chapter2.setOrders(k+1);
chapterMapper.insert(chapter2);
// Register the question-list link of this knowledge point, if it has one.
Elements maskList = detailKpoint.getElementsByClass("mask");
if(maskList.size() > 0) {
// Only the first "mask" element is used; the link is read as an absolute URL.
String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href");
// Every occurrence of "1-5" in the URL becomes "1-1000".
// NOTE(review): presumably this widens a paging range so one request lists more
// questions - confirm, and confirm "1-5" cannot appear elsewhere in the URL.
questionUrl = questionUrl.replace("1-5", "1-1000");
// Remember which chapter the URL belongs to; the chapter-detail handler looks
// the chapter up again by page URL.
chapterQuestionListMap.put(questionUrl, chapter2);
// Queue the rewritten URL for crawling.
runData.addUrl(questionUrl);
}
}
}
}
@ -185,9 +200,8 @@ public class SubjectService implements ISubjectService{
// 剔除已经爬取的数据
urls.add(pageUrl);
// 加入科目爬取数据
for(String url : subjectVo.getChapterUrls()) {
if(url.equals("https://tiku.baidu.com"+pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("href"))) {
if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
continue;
}
if(!urls.contains(url)) {
@ -195,13 +209,40 @@ public class SubjectService implements ISubjectService{
}
}
}
}
}).build();
runData = crawler.getRunData();
// Chapter-detail pages: record every question link found on the page.
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
// Add to the list of questions waiting to be parsed
for(String questionUrl : subjectVo.getQuestionUrls()) {
// Skip URLs already handled by this service instance.
if(!questionUrls.contains(questionUrl)) {
// Process the URL (queuing it on the crawler is currently disabled)
// runData.addUrl(questionUrl);
questionUrls.add(questionUrl);
// Chapter registered for this page URL when the chapter list was parsed.
// NOTE(review): Map.get returns null when pageUrl was never registered, and
// chapterEntity.getUid() below would then throw a NullPointerException -
// confirm every chapterdetail URL reaching this point was put into the map.
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
// NOTE(review): presumably selectOne matches on the non-null fields of the
// condition entity (here only questionUrl) - confirm against SuperMapper.
CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
condition.setQuestionUrl(questionUrl);
if(crawlerQuestionMapper.selectOne(condition) == null) {
// No existing row for this URL: insert one with status "0".
CrawlerQuestionEntity crawlerQuestion = new CrawlerQuestionEntity();
crawlerQuestion.setQuestionUrl(questionUrl);
crawlerQuestion.setChapterId(chapterEntity.getUid());
crawlerQuestion.setStatus("0");
crawlerQuestionMapper.insert(crawlerQuestion);
}else {
// A row for this URL already exists; only log it.
logger.info(questionUrl+"已经爬取");
}
}
}
}
/*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) {
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl);
System.out.println(chapterEntity);
}*/
}
}).build();
runData = crawler.getRunData();
// 获取科目
crawler.start(true);
}

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.tamguo.dao.CrawlerQuestionMapper">
<!-- No statements are declared here; the namespace ties this file to the CrawlerQuestionMapper interface. -->
</mapper>

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<redis>
<!-- NOTE(review): presumably a prefix applied to every key - confirm against the code that reads this file -->
<preKey value="TAMGUO:" />
<!-- NOTE(review): connection pool limits; the unit of maxWait is not stated here (presumably milliseconds) - confirm -->
<pool maxActive="50" maxIdle="20" maxWait="1000" />
<servers>
<!-- test -->
<!-- NOTE(review): a concrete server address is committed here; consider supplying it per environment instead -->
<server ip="47.100.175.14" port="6379"/>
</servers>
</redis>
Loading…
Cancel
Save