diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java index 6ef7ff3..b72a7b0 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/SubjectService.java @@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import com.alibaba.fastjson.JSONObject; import com.tamguo.dao.ChapterMapper; import com.tamguo.dao.CourseMapper; import com.tamguo.dao.CrawlerQuestionMapper; @@ -55,6 +56,8 @@ public class SubjectService implements ISubjectService{ XxlCrawler crawler = new XxlCrawler.Builder() .setUrls("https://tiku.baidu.com/") .setAllowSpread(false) + .setFailRetryCount(5) + .setThreadCount(20) .setPageParser(new PageParser() { @Override @@ -200,18 +203,24 @@ public class SubjectService implements ISubjectService{ // 剔除已经爬取的数据 urls.add(pageUrl); - for(String url : subjectVo.getChapterUrls()) { - if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) { - continue; - } - if(!urls.contains(url)) { - runData.addUrl(url); - } + logger.info("url:{}" ,pageUrl ); + logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) ); + if(subjectVo.getChapterUrls() != null) { + for(String url : subjectVo.getChapterUrls()) { + if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) { + continue; + } + if(!urls.contains(url)) { + runData.addUrl(url); + } + } } } if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) { // 加入待解析题目列表 + logger.info("url : {}" , pageUrl); + logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo)); for(String questionUrl : subjectVo.getQuestionUrls()) { if(!questionUrls.contains(questionUrl)) { // 处理URL diff --git a/tamguo-crawler/src/main/resources/application.properties b/tamguo-crawler/src/main/resources/application.properties index 6c38d28..ad5ea06 100644 --- a/tamguo-crawler/src/main/resources/application.properties +++ b/tamguo-crawler/src/main/resources/application.properties @@ -15,7 +15,7 @@ spring.datasource.testOnReturn=false spring.datasource.testWhileIdle=true spring.datasource.timeBetweenEvictionRunsMillis=60000 spring.datasource.type=com.alibaba.druid.pool.DruidDataSource -spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tamguo?useUnicode=true&characterEncoding=UTF-8&useSSL=false +spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tiku?useUnicode=true&characterEncoding=UTF-8&useSSL=false spring.datasource.username=root spring.datasource.validationQuery=SELECT 1 FROM DUAL @@ -33,4 +33,10 @@ mybatis-plus.global-config.sql-injector=com.baomidou.mybatisplus.mapper.LogicSql mybatis-plus.global-config.meta-object-handler=com.tamguo.config.dao.MyMetaObjectHandler mybatis-plus.global-config.sql-parser-cache=true mybatis-plus.configuration.map-underscore-to-camel-case=true -mybatis-plus.configuration.cache-enabled=false \ No newline at end of file +mybatis-plus.configuration.cache-enabled=false + +logging.level.root=INFO +logging.level.org.springframework.web=INFO +logging.file=/home/webdata/log/tamguo-crawler.log +logging.pattern.console=%d{yyyy/MM/dd-HH:mm:ss} [%thread] %-5level %logger- %msg%n +logging.pattern.file=%d{yyyy/MM/dd-HH:mm} [%thread] %-5level %logger- %msg%n \ No newline at end of file