main
tamguo 7 years ago
parent 08b94e54f2
commit a2f1303e11

@ -79,4 +79,7 @@ public class SystemConstant {
/** ALIYUN*/ /** ALIYUN*/
public static final String ALIYUN_ACCESS_KEY_SECRET = "ONUKuCz85kU4In07y4dvpM28mfWOGa"; public static final String ALIYUN_ACCESS_KEY_SECRET = "ONUKuCz85kU4In07y4dvpM28mfWOGa";
/** 默认的章节根目录*/
public static final String CHAPTER_DEFAULT_ROOT_UID = "-1";
} }

@ -11,7 +11,7 @@ public class SuperEntity<T extends Model<?>> extends Model<T> {
private static final long serialVersionUID = 1L; private static final long serialVersionUID = 1L;
@TableId("uid") @TableId("id")
private String uid; private String uid;
@Override @Override

@ -19,6 +19,14 @@ public class ChapterEntity extends SuperEntity<ChapterEntity> implements Seriali
private String parentId; private String parentId;
private String parentIds;
private Integer treeLevel;
private Boolean treeLeaf;
private String bookId;
private Integer questionNum; private Integer questionNum;
private Integer pointNum; private Integer pointNum;
@ -76,4 +84,36 @@ public class ChapterEntity extends SuperEntity<ChapterEntity> implements Seriali
this.orders = orders; this.orders = orders;
} }
/**
 * Returns the id of the book this chapter is attached to.
 *
 * @return the stored book id, or {@code null} if none has been set
 */
public String getBookId() {
    return this.bookId;
}

/**
 * Stores the id of the book this chapter is attached to.
 *
 * @param bookId book identifier to keep on this chapter
 */
public void setBookId(String bookId) {
    this.bookId = bookId;
}
/**
 * Returns the ancestor path of this chapter.
 *
 * @return the stored path string, or {@code null} if none has been set
 */
public String getParentIds() {
    return this.parentIds;
}

/**
 * Stores the ancestor path of this chapter.
 *
 * @param parentIds path string to keep; the crawler writes it as
 *                  comma-terminated ids (for example {@code "-1,"} for a root chapter)
 */
public void setParentIds(String parentIds) {
    this.parentIds = parentIds;
}
/**
 * Returns the depth of this chapter in the chapter tree.
 *
 * @return the stored level, or {@code null} if none has been set
 */
public Integer getTreeLevel() {
    return this.treeLevel;
}

/**
 * Stores the depth of this chapter in the chapter tree.
 *
 * @param treeLevel level to keep; the crawler uses 0 for the root chapter
 *                  and increments it by one per nesting level
 */
public void setTreeLevel(Integer treeLevel) {
    this.treeLevel = treeLevel;
}
/**
 * Returns the leaf flag of this chapter.
 *
 * @return the stored flag, or {@code null} if none has been set
 */
public Boolean getTreeLeaf() {
    return this.treeLeaf;
}

/**
 * Stores the leaf flag of this chapter.
 *
 * @param treeLeaf flag to keep; the crawler sets it to {@code true} only on
 *                 the deepest chapters it creates
 */
public void setTreeLeaf(Boolean treeLeaf) {
    this.treeLeaf = treeLeaf;
}
} }

@ -47,6 +47,9 @@ public class SubjectVo {
@PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private List<String> questionUrls; private List<String> questionUrls;
@PageFieldSelect(cssQuery = ".nexttolearn .next-inner .tolearn", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href")
private String nextQuestionPage;
// 单个题目数据 // 单个题目数据
@PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML)
private String content; private String content;
@ -183,4 +186,12 @@ public class SubjectVo {
this.answer = answer; this.answer = answer;
} }
/**
 * Returns the link to the next question-list page extracted from the crawled page.
 *
 * @return the stored URL, or {@code null} if none has been set
 */
public String getNextQuestionPage() {
    return this.nextQuestionPage;
}

/**
 * Stores the link to the next question-list page.
 *
 * @param nextQuestionPage URL of the following page
 */
public void setNextQuestionPage(String nextQuestionPage) {
    this.nextQuestionPage = nextQuestionPage;
}
} }

@ -1,74 +1,224 @@
package com.tamguo.service.impl; package com.tamguo.service.impl;
import com.baomidou.mybatisplus.plugins.Page;
import com.tamguo.config.redis.CacheService;
import com.tamguo.dao.ChapterMapper; import com.tamguo.dao.ChapterMapper;
import com.tamguo.dao.CourseMapper;
import com.tamguo.dao.CrawlerQuestionMapper;
import com.tamguo.dao.QuestionMapper;
import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity; import com.tamguo.model.ChapterEntity;
import com.tamguo.model.vo.ChapterVo; import com.tamguo.model.CourseEntity;
import com.tamguo.model.CrawlerQuestionEntity;
import com.tamguo.model.QuestionEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.enums.QuestionType;
import com.tamguo.model.vo.QuestionVo;
import com.tamguo.service.IBookService; import com.tamguo.service.IBookService;
import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.parser.PageParser; import com.xuxueli.crawler.parser.PageParser;
import com.xuxueli.crawler.parser.strategy.HtmlUnitPageLoader;
import com.xuxueli.crawler.rundata.RunData;
import com.xuxueli.crawler.util.FileUtil;
import java.io.File;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.util.List;
import java.util.UUID;
@Service @Service
public class BookService implements IBookService { public class BookService implements IBookService {
private RunData runData;
@Autowired @Autowired
ChapterMapper chapterMapper; QuestionMapper questionMapper;
@Autowired
private Logger logger = LoggerFactory.getLogger(getClass()); CrawlerQuestionMapper crawlerQuestionMapper;
@Autowired
ChapterMapper chapterMapper;
@Autowired
CourseMapper courseMapper;
@Autowired
SubjectMapper subjectMapper;
@Autowired
CacheService cacheService;
private static final String FILES_NO_FORMAT = "0000000";
private static final String FILES_PREFIX = "LIKESHUXUE";
private static final String DOMAIN = "http://www.tamguo.com";
@Override @Override
public void crawlerBook() { public void crawlerBook() {
XxlCrawler crawler = new XxlCrawler.Builder() XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-27-jiaocai-11") .setAllowSpread(false)
.setAllowSpread(false) .setThreadCount(20)
.setFailRetryCount(5) .setFailRetryCount(5)
.setThreadCount(20) .setPageLoader(new HtmlUnitPageLoader())
.setPageParser(new PageParser<ChapterVo>() { .setPageParser(new PageParser<QuestionVo>() {
@Override
public void parse(Document html, Element pageVoElement, ChapterVo chapterVo) { @Override
// 解析封装 PageVo 对象 public void parse(Document html, Element pageVoElement, QuestionVo questionVo) {
String parentName = chapterVo.getName(); if(StringUtils.isEmpty(questionVo.getContent())) {
ChapterEntity chapterEntity = new ChapterEntity(); runData.addUrl(html.baseUri());
String uid = UUID.randomUUID().toString().replace("-", ""); return;
chapterEntity.setUid(uid); }
chapterEntity.setName(parentName); CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
chapterEntity.setCourseId("0"); condition.setQuestionUrl(html.baseUri());
chapterEntity.setCourseId("0"); CrawlerQuestionEntity crawlerQuestion = crawlerQuestionMapper.selectOne(condition);
chapterEntity.setParentId("-1"); ChapterEntity chapter = chapterMapper.selectById(crawlerQuestion.getChapterId());
chapterEntity.setQuestionNum(0); CourseEntity course = courseMapper.selectById(chapter.getCourseId());
chapterEntity.setPointNum(0); SubjectEntity subject = subjectMapper.selectById(course.getSubjectId());
chapterMapper.insert(chapterEntity);
QuestionType questionType = QuestionType.getQuestionType(questionVo.getQuestionType());
List<String> sonChapters = chapterVo.getSonChapters(); QuestionEntity question = new QuestionEntity();
sonChapters.forEach(s -> { if(questionType == QuestionType.DANXUANTI) {
ChapterEntity sonChapterEntity = new ChapterEntity(); if(!StringUtils.isEmpty(questionVo.getQueoptions())) {
sonChapterEntity.setName(s); question.setContent(questionVo.getContent() + questionVo.getQueoptions());
sonChapterEntity.setCourseId("0"); }else {
sonChapterEntity.setCourseId("0"); question.setContent(questionVo.getContent());
sonChapterEntity.setParentId(uid); }
sonChapterEntity.setQuestionNum(0); }else {
sonChapterEntity.setPointNum(0); question.setContent(questionVo.getContent());
chapterMapper.insert(sonChapterEntity); }
}); question.setAnalysis(questionVo.getAnalysis());
// Fall back to the placeholder only when the crawled page carried no analysis.
// The previous guard inspected question.getAnswer(), which is still unset at this
// point (the answer is assigned right after this block), so the placeholder
// replaced every real analysis.
if(StringUtils.isEmpty(questionVo.getAnalysis())) {
    question.setAnalysis("<p> <span> 略 </span> <br> </p>");
}
question.setAnswer(questionVo.getAnswer());
question.setAuditStatus("1");
question.setChapterId(chapter.getUid());
question.setCourseId(course.getUid());
question.setPaperId(null);
question.setQuestionType(questionType.getValue().toString());
if(questionVo.getReviewPoint() != null && questionVo.getReviewPoint().size() > 0) {
question.setReviewPoint(StringUtils.join(questionVo.getReviewPoint().toArray(), ","));
}
// 处理分数
if(questionVo.getScore() != null) {
if(questionVo.getScore().contains("分")) {
question.setScore(questionVo.getScore());
}
if(questionVo.getScore().contains("年")) {
question.setYear(questionVo.getScore());
}
}
if(questionVo.getYear() != null) {
if(questionVo.getYear().contains("年")) {
question.setYear(questionVo.getYear());
}
}
question.setSubjectId(subject.getUid());
if (questionVo.getAnswerImages()!=null && questionVo.getAnswerImages().size() > 0) {
Set<String> imagesSet = new HashSet<>(questionVo.getAnswerImages());
for (String img: imagesSet) {
} // 下载图片文件
String fileName = getFileName(img);
File dir = new File(getFilePath());
if (!dir.exists())
dir.mkdirs();
boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName);
System.out.println("down images " + (ret?"success":"fail") + "" + img);
// 替换URL
questionVo.setAnswer(questionVo.getAnswer().replace(img, DOMAIN + getFilePath() + fileName));
}
question.setAnswer(questionVo.getAnswer());
}
if (questionVo.getAnalysisImages()!=null && questionVo.getAnalysisImages().size() > 0) {
Set<String> imagesSet = new HashSet<>(questionVo.getAnalysisImages());
for (String img: imagesSet) {
// 下载图片文件
String fileName = getFileName(img);
File dir = new File(getFilePath());
if (!dir.exists())
dir.mkdirs();
boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName);
System.out.println("down images " + (ret?"success":"fail") + "" + img);
// 替换URL
questionVo.setAnalysis(questionVo.getAnalysis().replace(img, DOMAIN + getFilePath() + fileName));
}
question.setAnalysis(questionVo.getAnalysis());
}
if (questionVo.getContentImages()!=null && questionVo.getContentImages().size() > 0) {
Set<String> imagesSet = new HashSet<>(questionVo.getContentImages());
for (String img: imagesSet) {
// } // 下载图片文件
}).build(); String fileName = getFileName(img);
File dir = new File(getFilePath());
if (!dir.exists())
dir.mkdirs();
boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName);
System.out.println("down images " + (ret?"success":"fail") + "" + img);
// 替换URL
questionVo.setContent(questionVo.getContent().replace(img, DOMAIN + getFilePath() + fileName));
}
question.setContent(questionVo.getContent());
}
// 处理图片
question.setSourceType("baidu");
question.setSourceUrl(html.baseUri());
questionMapper.insert(question);
}
public String getFileName(String img) {
return getFileNo() + img.substring(img.lastIndexOf("."));
}
private String getFilePath() {
SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm");
String format = sdf.format(new Date());
return "/images/question/" + format + "/";
}
// 获取科目 private String getFileNo() {
crawler.start(true); SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm");
} String format = sdf.format(new Date());
DecimalFormat df = new DecimalFormat(FILES_NO_FORMAT);
String key = FILES_PREFIX + format;
Long incr = cacheService.incr(key);
String avatorNo = FILES_PREFIX + df.format(incr);
return avatorNo;
}
}).build();
runData = crawler.getRunData();
// Seed the crawler with every stored question URL, reading the table page by page.
int page = 1;
int pageSize = 1000;
while(true) {
    Page<CrawlerQuestionEntity> questionPage = new Page<CrawlerQuestionEntity>(page , pageSize);
    List<CrawlerQuestionEntity> questionList = crawlerQuestionMapper.queryPageOrderUid(questionPage);
    for(CrawlerQuestionEntity crawlerQuestion : questionList) {
        runData.addUrl(crawlerQuestion.getQuestionUrl());
    }
    page++;
    // A page shorter than the requested size is the last one. The previous check
    // used a literal 100 that did not match pageSize, which cost an extra empty
    // query whenever the final page held between 100 and 999 rows.
    if(questionList.size() < pageSize) {
        break;
    }
}
// 获取科目
crawler.start(true);
}
} }

@ -5,6 +5,7 @@ import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
@ -19,9 +20,7 @@ import com.tamguo.dao.CourseMapper;
import com.tamguo.dao.CrawlerQuestionMapper; import com.tamguo.dao.CrawlerQuestionMapper;
import com.tamguo.dao.SubjectMapper; import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity; import com.tamguo.model.ChapterEntity;
import com.tamguo.model.CourseEntity;
import com.tamguo.model.CrawlerQuestionEntity; import com.tamguo.model.CrawlerQuestionEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.vo.SubjectVo; import com.tamguo.model.vo.SubjectVo;
import com.tamguo.service.ISubjectService; import com.tamguo.service.ISubjectService;
import com.xuxueli.crawler.XxlCrawler; import com.xuxueli.crawler.XxlCrawler;
@ -31,6 +30,8 @@ import com.xuxueli.crawler.rundata.RunData;
@Service @Service
public class SubjectService implements ISubjectService{ public class SubjectService implements ISubjectService{
private final static String COURSE_ID = "likeshuxue";
private final static String BOOK_ID = "1025976567395184642";
@Autowired @Autowired
SubjectMapper subjectMapper; SubjectMapper subjectMapper;
@Autowired @Autowired
@ -42,8 +43,6 @@ public class SubjectService implements ISubjectService{
private Logger logger = LoggerFactory.getLogger(getClass()); private Logger logger = LoggerFactory.getLogger(getClass());
private Set<String> urls = new HashSet<>();
private Set<String> questionUrls = new HashSet<String>(); private Set<String> questionUrls = new HashSet<String>();
private Map<String, Object> chapterQuestionListMap = new HashMap<>(); private Map<String, Object> chapterQuestionListMap = new HashMap<>();
@ -53,7 +52,7 @@ public class SubjectService implements ISubjectService{
@Override @Override
public void crawlerSubject() { public void crawlerSubject() {
XxlCrawler crawler = new XxlCrawler.Builder() XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls("https://tiku.baidu.com/") .setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-16-knowpoint-11")
.setAllowSpread(false) .setAllowSpread(false)
.setFailRetryCount(5) .setFailRetryCount(5)
.setThreadCount(20) .setThreadCount(20)
@ -63,68 +62,8 @@ public class SubjectService implements ISubjectService{
public void parse(Document html, Element pageVoElement, SubjectVo subjectVo) { public void parse(Document html, Element pageVoElement, SubjectVo subjectVo) {
// 解析封装 PageVo 对象 // 解析封装 PageVo 对象
String pageUrl = html.baseUri(); String pageUrl = html.baseUri();
if(pageUrl.equals("https://tiku.baidu.com/")) {
logger.info("开始解析考试分类:{}" , pageUrl);
for(int i=0 ; i<subjectVo.getName().size() ; i++) {
String name = subjectVo.getName().get(i);
SubjectEntity subject = subjectMapper.findByName(name);
if(subject != null) {
continue;
}
SubjectEntity entity = new SubjectEntity();
if(name.equals("国考")) {
name = "公务员(国考)";
}
entity.setName(name);
subjectMapper.insert(entity);
}
// 加入科目爬取数据
for(String url : subjectVo.getCourseUrls()) {
runData.addUrl(url);
}
}
if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) {
logger.info("开始解析科目分类:{}" , pageUrl);
for(int i=0 ; i<subjectVo.getCourseName().size() ; i++) {
logger.info("科目名称:{}" , subjectVo.getCourseName().get(i));
SubjectEntity subject = subjectMapper.findByName(subjectVo.getSubjectName());
if(subject == null) {
continue;
}
CourseEntity course = new CourseEntity();
course.setName(subjectVo.getCourseName().get(i));
CourseEntity courseEntity = courseMapper.selectOne(course);
if(courseEntity != null) {
continue;
}
course.setOrders(i+1);
course.setPointNum(0);
course.setQuestionNum(0);
course.setSeoDescription(subjectVo.getCourseName().get(i));
course.setSeoKeywords(subjectVo.getCourseName().get(i));
course.setSeoTitle(subjectVo.getCourseName().get(i));
course.setSubjectId(subject.getUid());
courseMapper.insert(course);
}
// 加入科目爬取数据
for(String url : subjectVo.getChapterUrlsTemp()) {
runData.addUrl(url);
}
}
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterlist/")) { if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterlist/")) {
logger.info("开始解析章节:{}" , pageUrl); logger.info("开始解析书籍:{}" , pageUrl);
CourseEntity courseCondition = new CourseEntity();
courseCondition.setName(subjectVo.getChapterPageCourseName());
CourseEntity c = courseMapper.selectOne(courseCondition);
if(c == null) {
runData.addUrl(pageUrl);
return;
}
ChapterEntity chapterCondition = new ChapterEntity(); ChapterEntity chapterCondition = new ChapterEntity();
chapterCondition.setName(subjectVo.getChapterCurrName()); chapterCondition.setName(subjectVo.getChapterCurrName());
ChapterEntity chapterEntity = chapterMapper.selectOne(chapterCondition); ChapterEntity chapterEntity = chapterMapper.selectOne(chapterCondition);
@ -133,12 +72,16 @@ public class SubjectService implements ISubjectService{
} }
ChapterEntity rootChapter = new ChapterEntity(); ChapterEntity rootChapter = new ChapterEntity();
rootChapter.setCourseId(c.getUid()); rootChapter.setCourseId(COURSE_ID);
rootChapter.setParentId("-1"); rootChapter.setParentId("-1");
rootChapter.setName(subjectVo.getChapterCurrName()); rootChapter.setName(subjectVo.getChapterCurrName());
rootChapter.setQuestionNum(0); rootChapter.setQuestionNum(0);
rootChapter.setPointNum(0); rootChapter.setPointNum(0);
rootChapter.setOrders(0); rootChapter.setOrders(0);
rootChapter.setTreeLeaf(false);
rootChapter.setTreeLevel(0);
rootChapter.setParentIds("-1,");
rootChapter.setBookId(BOOK_ID);
chapterMapper.insert(rootChapter); chapterMapper.insert(rootChapter);
Elements elements = pageVoElement.getElementsByClass("detail-chapter"); Elements elements = pageVoElement.getElementsByClass("detail-chapter");
@ -147,13 +90,18 @@ public class SubjectService implements ISubjectService{
String chapterName = element.getElementsByClass("detail-chapter-title").get(0).getElementsByTag("h3").text(); String chapterName = element.getElementsByClass("detail-chapter-title").get(0).getElementsByTag("h3").text();
logger.info(chapterName); logger.info(chapterName);
ChapterEntity chapter = new ChapterEntity(); ChapterEntity chapter = new ChapterEntity();
chapter.setCourseId(c.getUid()); chapter.setCourseId(COURSE_ID);
chapter.setParentId(rootChapter.getUid()); chapter.setParentId(rootChapter.getUid());
chapter.setName(chapterName); chapter.setName(chapterName);
chapter.setQuestionNum(0); chapter.setQuestionNum(0);
chapter.setPointNum(0); chapter.setPointNum(0);
chapter.setOrders(n+1); chapter.setOrders(n+1);
chapter.setBookId(BOOK_ID);
chapter.setTreeLeaf(false);
chapter.setTreeLevel(1);
chapterMapper.insert(chapter); chapterMapper.insert(chapter);
chapter.setParentIds(rootChapter.getParentIds() + chapter.getUid() + ",");
chapterMapper.updateById(chapter);
Elements detailKpoint1s = element.getElementsByClass("detail-kpoint-1"); Elements detailKpoint1s = element.getElementsByClass("detail-kpoint-1");
for(Element detailKpoint1 : detailKpoint1s) { for(Element detailKpoint1 : detailKpoint1s) {
@ -164,13 +112,19 @@ public class SubjectService implements ISubjectService{
logger.info(chapterName1); logger.info(chapterName1);
ChapterEntity chapter1 = new ChapterEntity(); ChapterEntity chapter1 = new ChapterEntity();
chapter1.setCourseId(c.getUid()); chapter1.setCourseId(COURSE_ID);
chapter1.setBookId(BOOK_ID);
chapter1.setParentId(chapter.getUid()); chapter1.setParentId(chapter.getUid());
chapter1.setName(chapterName1); chapter1.setName(chapterName1);
chapter1.setQuestionNum(0); chapter1.setQuestionNum(0);
chapter1.setPointNum(0); chapter1.setPointNum(0);
chapter1.setOrders(i+1); chapter1.setOrders(i+1);
chapter1.setTreeLeaf(false);
chapter1.setTreeLevel(2);
chapterMapper.insert(chapter1); chapterMapper.insert(chapter1);
chapter1.setParentIds(chapter.getParentIds() + chapter1.getUid() + ",");
chapterMapper.updateById(chapter1);
Elements detailKpoint2s = detailKpoint1.getElementsByClass("detail-kpoint-2"); Elements detailKpoint2s = detailKpoint1.getElementsByClass("detail-kpoint-2");
@ -180,17 +134,23 @@ public class SubjectService implements ISubjectService{
logger.info(chapterName2); logger.info(chapterName2);
ChapterEntity chapter2 = new ChapterEntity(); ChapterEntity chapter2 = new ChapterEntity();
chapter2.setCourseId(c.getUid()); chapter2.setCourseId(COURSE_ID);
chapter2.setBookId(BOOK_ID);
chapter2.setParentId(chapter1.getUid()); chapter2.setParentId(chapter1.getUid());
chapter2.setName(chapterName2); chapter2.setName(chapterName2);
chapter2.setQuestionNum(0); chapter2.setQuestionNum(0);
chapter2.setPointNum(0); chapter2.setPointNum(0);
chapter2.setOrders(k+1); chapter2.setOrders(k+1);
chapter2.setTreeLeaf(true);
chapter2.setTreeLevel(3);
chapterMapper.insert(chapter2); chapterMapper.insert(chapter2);
chapter2.setParentIds(chapter1.getParentIds() + chapter2.getUid() + ",");
chapterMapper.updateById(chapter2);
Elements maskList = detailKpoint.getElementsByClass("mask"); Elements maskList = detailKpoint.getElementsByClass("mask");
if(maskList.size() > 0) { if(maskList.size() > 0) {
String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href"); String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href");
questionUrl = questionUrl.replace("1-5", "1-1000"); // questionUrl = questionUrl.replace("1-5", "1-20");
chapterQuestionListMap.put(questionUrl, chapter2); chapterQuestionListMap.put(questionUrl, chapter2);
runData.addUrl(questionUrl); runData.addUrl(questionUrl);
@ -200,34 +160,19 @@ public class SubjectService implements ISubjectService{
} }
} }
// 剔除已经爬取的数据
urls.add(pageUrl);
logger.info("url:{}" ,pageUrl );
logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) );
if(subjectVo.getChapterUrls() != null) {
for(String url : subjectVo.getChapterUrls()) {
if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
continue;
}
if(!urls.contains(url)) {
runData.addUrl(url);
}
}
}
} }
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) { if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
// 加入待解析题目列表 // 加入待解析题目列表
logger.info("url : {}" , pageUrl); logger.info("url : {}" , pageUrl);
logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo)); logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo));
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
for(String questionUrl : subjectVo.getQuestionUrls()) { for(String questionUrl : subjectVo.getQuestionUrls()) {
if(!questionUrls.contains(questionUrl)) { if(!questionUrls.contains(questionUrl)) {
// 处理URL // 处理URL
// runData.addUrl(questionUrl); // runData.addUrl(questionUrl);
questionUrls.add(questionUrl); questionUrls.add(questionUrl);
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
CrawlerQuestionEntity condition = new CrawlerQuestionEntity(); CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
condition.setQuestionUrl(questionUrl); condition.setQuestionUrl(questionUrl);
if(crawlerQuestionMapper.selectOne(condition) == null) { if(crawlerQuestionMapper.selectOne(condition) == null) {
@ -241,12 +186,12 @@ public class SubjectService implements ISubjectService{
} }
} }
} }
if(!StringUtils.isEmpty(subjectVo.getNextQuestionPage())) {
runData.addUrl(subjectVo.getNextQuestionPage());
chapterQuestionListMap.put(subjectVo.getNextQuestionPage(), chapterEntity);
}
} }
/*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) {
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl);
System.out.println(chapterEntity);
}*/
} }
}).build(); }).build();

@ -15,7 +15,7 @@ spring.datasource.testOnReturn=false
spring.datasource.testWhileIdle=true spring.datasource.testWhileIdle=true
spring.datasource.timeBetweenEvictionRunsMillis=60000 spring.datasource.timeBetweenEvictionRunsMillis=60000
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tiku?useUnicode=true&characterEncoding=UTF-8&useSSL=false spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tamguo?useUnicode=true&characterEncoding=UTF-8&useSSL=false
spring.datasource.username=root spring.datasource.username=root
spring.datasource.validationQuery=SELECT 1 FROM DUAL spring.datasource.validationQuery=SELECT 1 FROM DUAL

@ -4,6 +4,6 @@
<pool maxActive="50" maxIdle="20" maxWait="1000" /> <pool maxActive="50" maxIdle="20" maxWait="1000" />
<servers> <servers>
<!-- test --> <!-- test -->
<server ip="47.100.175.14" port="6379"/> <server ip="127.0.0.1" port="6379"/>
</servers> </servers>
</redis> </redis>

@ -1,8 +1,13 @@
package com.tamguo.modules.tiku.service; package com.tamguo.modules.tiku.service;
import java.util.List;
import com.baomidou.mybatisplus.service.IService; import com.baomidou.mybatisplus.service.IService;
import com.tamguo.modules.tiku.model.ChapterEntity; import com.tamguo.modules.tiku.model.ChapterEntity;
public interface IChapterService extends IService<ChapterEntity>{ public interface IChapterService extends IService<ChapterEntity>{
// 获取科目章节
public List<ChapterEntity> findChapterTree(String bookId);
} }

@ -1,12 +1,70 @@
package com.tamguo.modules.tiku.service.impl; package com.tamguo.modules.tiku.service.impl;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import com.baomidou.mybatisplus.mapper.Condition;
import com.baomidou.mybatisplus.service.impl.ServiceImpl; import com.baomidou.mybatisplus.service.impl.ServiceImpl;
import com.tamguo.common.utils.SystemConstant;
import com.tamguo.modules.tiku.dao.ChapterMapper; import com.tamguo.modules.tiku.dao.ChapterMapper;
import com.tamguo.modules.tiku.model.ChapterEntity; import com.tamguo.modules.tiku.model.ChapterEntity;
import com.tamguo.modules.tiku.service.IChapterService; import com.tamguo.modules.tiku.service.IChapterService;
@Service @Service
public class ChapterServiceImpl extends ServiceImpl<ChapterMapper, ChapterEntity> implements IChapterService{ public class ChapterServiceImpl extends ServiceImpl<ChapterMapper, ChapterEntity> implements IChapterService{
/**
 * Loads all chapters of a book and arranges them into a tree.
 *
 * <p>The root chapter is the one whose parent id equals
 * {@link SystemConstant#CHAPTER_DEFAULT_ROOT_UID}; if several match, the last one
 * in query order wins. The returned list holds the root's direct children. Each
 * of them gets its child list populated, and so does each of those children, so
 * three levels below the root are reachable from the result. Deeper chapters are
 * not attached.
 *
 * @param bookId id of the book whose chapters are queried (column {@code book_id})
 * @return the chapters directly below the root chapter, with nested child lists;
 *         empty when nothing matches
 */
// Only SELECTs are issued here, so the transaction is marked read-only.
@Transactional(readOnly=true)
@SuppressWarnings("unchecked")
@Override
public List<ChapterEntity> findChapterTree(String bookId) {
    List<ChapterEntity> chapterList = baseMapper.selectList(Condition.create().eq("book_id", bookId));

    // Locate the root chapter. The constant is the receiver of equals() so that a
    // row with a null parent id is skipped instead of raising a NullPointerException.
    String rootUid = StringUtils.EMPTY;
    for (ChapterEntity chapter : chapterList) {
        if (SystemConstant.CHAPTER_DEFAULT_ROOT_UID.equals(chapter.getParentId())) {
            rootUid = chapter.getId();
        }
    }

    // First level: direct children of the root.
    List<ChapterEntity> entitys = findChildren(chapterList, rootUid);
    for (ChapterEntity entity : entitys) {
        // Second level.
        List<ChapterEntity> childs = findChildren(chapterList, entity.getId());
        entity.setChildChapterList(childs);
        // Third level.
        for (ChapterEntity child : childs) {
            child.setChildChapterList(findChildren(chapterList, child.getId()));
        }
    }
    return entitys;
}

/**
 * Collects, in list order, every chapter whose parent id equals {@code parentId}.
 *
 * @param chapterList chapters to scan
 * @param parentId    parent id to match; {@code null} matches nothing
 * @return a new mutable list of the matching chapters, possibly empty
 */
private static List<ChapterEntity> findChildren(List<ChapterEntity> chapterList, String parentId) {
    List<ChapterEntity> children = new ArrayList<>();
    if (parentId == null) {
        return children;
    }
    for (ChapterEntity chapter : chapterList) {
        if (parentId.equals(chapter.getParentId())) {
            children.add(chapter);
        }
    }
    return children;
}
} }

@ -45,7 +45,7 @@ public class CourseController {
BookEntity book = null; BookEntity book = null;
if(bookList.size() > 0) { if(bookList.size() > 0) {
book = bookList.get(0); book = bookList.get(0);
chapterList = iChapterService.selectList(Condition.create().eq("book_id", book.getId())); chapterList = iChapterService.findChapterTree(book.getId());
} }
SubjectEntity subject = iSubjectService.selectById(course.getSubjectId()); SubjectEntity subject = iSubjectService.selectById(course.getSubjectId());
List<CourseEntity> courseList = iCourseService.selectList(Condition.create().eq("subject_id", course.getSubjectId())); List<CourseEntity> courseList = iCourseService.selectList(Condition.create().eq("subject_id", course.getSubjectId()));

Loading…
Cancel
Save