|
|
|
@ -5,6 +5,7 @@ import java.util.HashSet;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.Set;
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
|
import org.jsoup.nodes.Element;
|
|
|
|
|
import org.jsoup.select.Elements;
|
|
|
|
@ -19,9 +20,7 @@ import com.tamguo.dao.CourseMapper;
|
|
|
|
|
import com.tamguo.dao.CrawlerQuestionMapper;
|
|
|
|
|
import com.tamguo.dao.SubjectMapper;
|
|
|
|
|
import com.tamguo.model.ChapterEntity;
|
|
|
|
|
import com.tamguo.model.CourseEntity;
|
|
|
|
|
import com.tamguo.model.CrawlerQuestionEntity;
|
|
|
|
|
import com.tamguo.model.SubjectEntity;
|
|
|
|
|
import com.tamguo.model.vo.SubjectVo;
|
|
|
|
|
import com.tamguo.service.ISubjectService;
|
|
|
|
|
import com.xuxueli.crawler.XxlCrawler;
|
|
|
|
@ -31,6 +30,8 @@ import com.xuxueli.crawler.rundata.RunData;
|
|
|
|
|
@Service
|
|
|
|
|
public class SubjectService implements ISubjectService{
|
|
|
|
|
|
|
|
|
|
// Course identifier passed to setCourseId(...) on every chapter row built while
// parsing a chapter-list page (the root chapter and each nested chapter level).
// NOTE(review): hard-coded value; presumably the uid of an existing course record
// that matches the seed chapter-list URL — confirm before reusing for other courses.
private final static String COURSE_ID = "likeshuxue";
|
|
|
|
|
// Book identifier passed to setBookId(...) on the same chapter rows.
// NOTE(review): hard-coded value; presumably the uid of an existing book record — confirm.
private final static String BOOK_ID = "1025976567395184642";
|
|
|
|
|
@Autowired
|
|
|
|
|
SubjectMapper subjectMapper;
|
|
|
|
|
@Autowired
|
|
|
|
@ -42,8 +43,6 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
|
|
|
|
|
// Logger named after the runtime class of this service.
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
|
|
|
|
|
|
|
|
// Page URLs recorded once a chapter-list page has been processed; consulted before
// queueing further chapter-list URLs so an already-recorded page is not added again.
// NOTE(review): plain HashSet, while the crawler is built with setThreadCount(20) —
// confirm whether parse callbacks can run concurrently and need a thread-safe set.
private Set<String> urls = new HashSet<>();
|
|
|
|
|
|
|
|
|
|
// Question URLs already encountered on chapter-detail pages; a URL is added the first
// time it is seen and skipped on later occurrences.
// NOTE(review): same thread-safety question as for the URL set above.
private Set<String> questionUrls = new HashSet<String>();
|
|
|
|
|
|
|
|
|
|
// Maps a question-list page URL to the chapter it was discovered under. Values are
// stored as Object and cast back to ChapterEntity on lookup; the entry for a
// follow-up ("next") question page is registered with the same chapter.
// NOTE(review): plain HashMap shared across parse callbacks — confirm thread-safety needs.
private Map<String, Object> chapterQuestionListMap = new HashMap<>();
|
|
|
|
@ -53,7 +52,7 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
@Override
|
|
|
|
|
public void crawlerSubject() {
|
|
|
|
|
XxlCrawler crawler = new XxlCrawler.Builder()
|
|
|
|
|
.setUrls("https://tiku.baidu.com/")
|
|
|
|
|
.setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-16-knowpoint-11")
|
|
|
|
|
.setAllowSpread(false)
|
|
|
|
|
.setFailRetryCount(5)
|
|
|
|
|
.setThreadCount(20)
|
|
|
|
@ -63,68 +62,8 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
public void parse(Document html, Element pageVoElement, SubjectVo subjectVo) {
|
|
|
|
|
// 解析封装 PageVo 对象
|
|
|
|
|
String pageUrl = html.baseUri();
|
|
|
|
|
if(pageUrl.equals("https://tiku.baidu.com/")) {
|
|
|
|
|
logger.info("开始解析考试分类:{}" , pageUrl);
|
|
|
|
|
for(int i=0 ; i<subjectVo.getName().size() ; i++) {
|
|
|
|
|
String name = subjectVo.getName().get(i);
|
|
|
|
|
|
|
|
|
|
SubjectEntity subject = subjectMapper.findByName(name);
|
|
|
|
|
if(subject != null) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
SubjectEntity entity = new SubjectEntity();
|
|
|
|
|
if(name.equals("国考")) {
|
|
|
|
|
name = "公务员(国考)";
|
|
|
|
|
}
|
|
|
|
|
entity.setName(name);
|
|
|
|
|
subjectMapper.insert(entity);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 加入科目爬取数据
|
|
|
|
|
for(String url : subjectVo.getCourseUrls()) {
|
|
|
|
|
runData.addUrl(url);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if(pageUrl.contains("https://tiku.baidu.com/tikupc/homepage/")) {
|
|
|
|
|
logger.info("开始解析科目分类:{}" , pageUrl);
|
|
|
|
|
for(int i=0 ; i<subjectVo.getCourseName().size() ; i++) {
|
|
|
|
|
logger.info("科目名称:{}" , subjectVo.getCourseName().get(i));
|
|
|
|
|
SubjectEntity subject = subjectMapper.findByName(subjectVo.getSubjectName());
|
|
|
|
|
if(subject == null) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
CourseEntity course = new CourseEntity();
|
|
|
|
|
course.setName(subjectVo.getCourseName().get(i));
|
|
|
|
|
CourseEntity courseEntity = courseMapper.selectOne(course);
|
|
|
|
|
if(courseEntity != null) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
course.setOrders(i+1);
|
|
|
|
|
course.setPointNum(0);
|
|
|
|
|
course.setQuestionNum(0);
|
|
|
|
|
course.setSeoDescription(subjectVo.getCourseName().get(i));
|
|
|
|
|
course.setSeoKeywords(subjectVo.getCourseName().get(i));
|
|
|
|
|
course.setSeoTitle(subjectVo.getCourseName().get(i));
|
|
|
|
|
course.setSubjectId(subject.getUid());
|
|
|
|
|
|
|
|
|
|
courseMapper.insert(course);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 加入科目爬取数据
|
|
|
|
|
for(String url : subjectVo.getChapterUrlsTemp()) {
|
|
|
|
|
runData.addUrl(url);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterlist/")) {
|
|
|
|
|
logger.info("开始解析章节:{}" , pageUrl);
|
|
|
|
|
CourseEntity courseCondition = new CourseEntity();
|
|
|
|
|
courseCondition.setName(subjectVo.getChapterPageCourseName());
|
|
|
|
|
CourseEntity c = courseMapper.selectOne(courseCondition);
|
|
|
|
|
if(c == null) {
|
|
|
|
|
runData.addUrl(pageUrl);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
logger.info("开始解析书籍:{}" , pageUrl);
|
|
|
|
|
ChapterEntity chapterCondition = new ChapterEntity();
|
|
|
|
|
chapterCondition.setName(subjectVo.getChapterCurrName());
|
|
|
|
|
ChapterEntity chapterEntity = chapterMapper.selectOne(chapterCondition);
|
|
|
|
@ -133,12 +72,16 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ChapterEntity rootChapter = new ChapterEntity();
|
|
|
|
|
rootChapter.setCourseId(c.getUid());
|
|
|
|
|
rootChapter.setCourseId(COURSE_ID);
|
|
|
|
|
rootChapter.setParentId("-1");
|
|
|
|
|
rootChapter.setName(subjectVo.getChapterCurrName());
|
|
|
|
|
rootChapter.setQuestionNum(0);
|
|
|
|
|
rootChapter.setPointNum(0);
|
|
|
|
|
rootChapter.setOrders(0);
|
|
|
|
|
rootChapter.setTreeLeaf(false);
|
|
|
|
|
rootChapter.setTreeLevel(0);
|
|
|
|
|
rootChapter.setParentIds("-1,");
|
|
|
|
|
rootChapter.setBookId(BOOK_ID);
|
|
|
|
|
chapterMapper.insert(rootChapter);
|
|
|
|
|
|
|
|
|
|
Elements elements = pageVoElement.getElementsByClass("detail-chapter");
|
|
|
|
@ -147,13 +90,18 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
String chapterName = element.getElementsByClass("detail-chapter-title").get(0).getElementsByTag("h3").text();
|
|
|
|
|
logger.info(chapterName);
|
|
|
|
|
ChapterEntity chapter = new ChapterEntity();
|
|
|
|
|
chapter.setCourseId(c.getUid());
|
|
|
|
|
chapter.setCourseId(COURSE_ID);
|
|
|
|
|
chapter.setParentId(rootChapter.getUid());
|
|
|
|
|
chapter.setName(chapterName);
|
|
|
|
|
chapter.setQuestionNum(0);
|
|
|
|
|
chapter.setPointNum(0);
|
|
|
|
|
chapter.setOrders(n+1);
|
|
|
|
|
chapter.setBookId(BOOK_ID);
|
|
|
|
|
chapter.setTreeLeaf(false);
|
|
|
|
|
chapter.setTreeLevel(1);
|
|
|
|
|
chapterMapper.insert(chapter);
|
|
|
|
|
chapter.setParentIds(rootChapter.getParentIds() + chapter.getUid() + ",");
|
|
|
|
|
chapterMapper.updateById(chapter);
|
|
|
|
|
|
|
|
|
|
Elements detailKpoint1s = element.getElementsByClass("detail-kpoint-1");
|
|
|
|
|
for(Element detailKpoint1 : detailKpoint1s) {
|
|
|
|
@ -164,13 +112,19 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
logger.info(chapterName1);
|
|
|
|
|
|
|
|
|
|
ChapterEntity chapter1 = new ChapterEntity();
|
|
|
|
|
chapter1.setCourseId(c.getUid());
|
|
|
|
|
chapter1.setCourseId(COURSE_ID);
|
|
|
|
|
chapter1.setBookId(BOOK_ID);
|
|
|
|
|
chapter1.setParentId(chapter.getUid());
|
|
|
|
|
chapter1.setName(chapterName1);
|
|
|
|
|
chapter1.setQuestionNum(0);
|
|
|
|
|
chapter1.setPointNum(0);
|
|
|
|
|
chapter1.setOrders(i+1);
|
|
|
|
|
|
|
|
|
|
chapter1.setTreeLeaf(false);
|
|
|
|
|
chapter1.setTreeLevel(2);
|
|
|
|
|
chapterMapper.insert(chapter1);
|
|
|
|
|
chapter1.setParentIds(chapter.getParentIds() + chapter1.getUid() + ",");
|
|
|
|
|
chapterMapper.updateById(chapter1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Elements detailKpoint2s = detailKpoint1.getElementsByClass("detail-kpoint-2");
|
|
|
|
@ -180,17 +134,23 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
logger.info(chapterName2);
|
|
|
|
|
|
|
|
|
|
ChapterEntity chapter2 = new ChapterEntity();
|
|
|
|
|
chapter2.setCourseId(c.getUid());
|
|
|
|
|
chapter2.setCourseId(COURSE_ID);
|
|
|
|
|
chapter2.setBookId(BOOK_ID);
|
|
|
|
|
chapter2.setParentId(chapter1.getUid());
|
|
|
|
|
chapter2.setName(chapterName2);
|
|
|
|
|
chapter2.setQuestionNum(0);
|
|
|
|
|
chapter2.setPointNum(0);
|
|
|
|
|
chapter2.setOrders(k+1);
|
|
|
|
|
chapter2.setTreeLeaf(true);
|
|
|
|
|
chapter2.setTreeLevel(3);
|
|
|
|
|
chapterMapper.insert(chapter2);
|
|
|
|
|
chapter2.setParentIds(chapter1.getParentIds() + chapter2.getUid() + ",");
|
|
|
|
|
chapterMapper.updateById(chapter2);
|
|
|
|
|
|
|
|
|
|
Elements maskList = detailKpoint.getElementsByClass("mask");
|
|
|
|
|
if(maskList.size() > 0) {
|
|
|
|
|
String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href");
|
|
|
|
|
questionUrl = questionUrl.replace("1-5", "1-1000");
|
|
|
|
|
// questionUrl = questionUrl.replace("1-5", "1-20");
|
|
|
|
|
chapterQuestionListMap.put(questionUrl, chapter2);
|
|
|
|
|
|
|
|
|
|
runData.addUrl(questionUrl);
|
|
|
|
@ -200,34 +160,19 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 剔除已经爬取的数据
|
|
|
|
|
urls.add(pageUrl);
|
|
|
|
|
logger.info("url:{}" ,pageUrl );
|
|
|
|
|
logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) );
|
|
|
|
|
if(subjectVo.getChapterUrls() != null) {
|
|
|
|
|
for(String url : subjectVo.getChapterUrls()) {
|
|
|
|
|
if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if(!urls.contains(url)) {
|
|
|
|
|
runData.addUrl(url);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
|
|
|
|
|
// 加入待解析题目列表
|
|
|
|
|
logger.info("url : {}" , pageUrl);
|
|
|
|
|
logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo));
|
|
|
|
|
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
|
|
|
|
|
for(String questionUrl : subjectVo.getQuestionUrls()) {
|
|
|
|
|
if(!questionUrls.contains(questionUrl)) {
|
|
|
|
|
// 处理URL
|
|
|
|
|
// runData.addUrl(questionUrl);
|
|
|
|
|
questionUrls.add(questionUrl);
|
|
|
|
|
|
|
|
|
|
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl);
|
|
|
|
|
|
|
|
|
|
CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
|
|
|
|
|
condition.setQuestionUrl(questionUrl);
|
|
|
|
|
if(crawlerQuestionMapper.selectOne(condition) == null) {
|
|
|
|
@ -241,12 +186,12 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if(!StringUtils.isEmpty(subjectVo.getNextQuestionPage())) {
|
|
|
|
|
runData.addUrl(subjectVo.getNextQuestionPage());
|
|
|
|
|
chapterQuestionListMap.put(subjectVo.getNextQuestionPage(), chapterEntity);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) {
|
|
|
|
|
ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl);
|
|
|
|
|
System.out.println(chapterEntity);
|
|
|
|
|
}*/
|
|
|
|
|
}
|
|
|
|
|
}).build();
|
|
|
|
|
|
|
|
|
|