|
|
@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
|
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
|
import com.tamguo.dao.ChapterMapper;
|
|
|
|
import com.tamguo.dao.ChapterMapper;
|
|
|
|
import com.tamguo.dao.CourseMapper;
|
|
|
|
import com.tamguo.dao.CourseMapper;
|
|
|
|
import com.tamguo.dao.CrawlerQuestionMapper;
|
|
|
|
import com.tamguo.dao.CrawlerQuestionMapper;
|
|
|
@ -55,6 +56,8 @@ public class SubjectService implements ISubjectService{
|
|
|
|
XxlCrawler crawler = new XxlCrawler.Builder()
|
|
|
|
XxlCrawler crawler = new XxlCrawler.Builder()
|
|
|
|
.setUrls("https://tiku.baidu.com/")
|
|
|
|
.setUrls("https://tiku.baidu.com/")
|
|
|
|
.setAllowSpread(false)
|
|
|
|
.setAllowSpread(false)
|
|
|
|
|
|
|
|
.setFailRetryCount(5)
|
|
|
|
|
|
|
|
.setThreadCount(20)
|
|
|
|
.setPageParser(new PageParser<SubjectVo>() {
|
|
|
|
.setPageParser(new PageParser<SubjectVo>() {
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
@ -200,18 +203,24 @@ public class SubjectService implements ISubjectService{
|
|
|
|
|
|
|
|
|
|
|
|
// 剔除已经爬取的数据
|
|
|
|
// 剔除已经爬取的数据
|
|
|
|
urls.add(pageUrl);
|
|
|
|
urls.add(pageUrl);
|
|
|
|
for(String url : subjectVo.getChapterUrls()) {
|
|
|
|
logger.info("url:{}" ,pageUrl );
|
|
|
|
if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
|
|
|
|
logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) );
|
|
|
|
continue;
|
|
|
|
if(subjectVo.getChapterUrls() != null) {
|
|
|
|
}
|
|
|
|
for(String url : subjectVo.getChapterUrls()) {
|
|
|
|
if(!urls.contains(url)) {
|
|
|
|
if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) {
|
|
|
|
runData.addUrl(url);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(!urls.contains(url)) {
|
|
|
|
|
|
|
|
runData.addUrl(url);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
|
|
|
|
if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) {
|
|
|
|
// 加入待解析题目列表
|
|
|
|
// 加入待解析题目列表
|
|
|
|
|
|
|
|
logger.info("url : {}" , pageUrl);
|
|
|
|
|
|
|
|
logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo));
|
|
|
|
for(String questionUrl : subjectVo.getQuestionUrls()) {
|
|
|
|
for(String questionUrl : subjectVo.getQuestionUrls()) {
|
|
|
|
if(!questionUrls.contains(questionUrl)) {
|
|
|
|
if(!questionUrls.contains(questionUrl)) {
|
|
|
|
// 处理URL
|
|
|
|
// 处理URL
|
|
|
|