From a2f1303e11e942f159b383788deda4064f780c9c Mon Sep 17 00:00:00 2001 From: tamguo Date: Sun, 5 Aug 2018 14:41:09 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=9F=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tamguo/common/utils/SystemConstant.java | 3 + .../com/tamguo/config/dao/SuperEntity.java | 2 +- .../java/com/tamguo/model/ChapterEntity.java | 40 +++ .../java/com/tamguo/model/vo/SubjectVo.java | 11 + .../com/tamguo/service/impl/BookService.java | 246 ++++++++++++++---- .../tamguo/service/impl/SubjectService.java | 127 +++------ .../src/main/resources/application.properties | 2 +- tamguo-crawler/src/main/resources/redis.xml | 2 +- .../modules/tiku/service/IChapterService.java | 5 + .../tiku/service/impl/ChapterServiceImpl.java | 58 +++++ .../com/tamguo/web/tiku/CourseController.java | 2 +- 11 files changed, 355 insertions(+), 143 deletions(-) diff --git a/tamguo-common/src/main/java/com/tamguo/common/utils/SystemConstant.java b/tamguo-common/src/main/java/com/tamguo/common/utils/SystemConstant.java index 378ff0a..7474b6b 100644 --- a/tamguo-common/src/main/java/com/tamguo/common/utils/SystemConstant.java +++ b/tamguo-common/src/main/java/com/tamguo/common/utils/SystemConstant.java @@ -79,4 +79,7 @@ public class SystemConstant { /** ALIYUN*/ public static final String ALIYUN_ACCESS_KEY_SECRET = "ONUKuCz85kU4In07y4dvpM28mfWOGa"; + + /** 默认的章节根目录*/ + public static final String CHAPTER_DEFAULT_ROOT_UID = "-1"; } diff --git a/tamguo-crawler/src/main/java/com/tamguo/config/dao/SuperEntity.java b/tamguo-crawler/src/main/java/com/tamguo/config/dao/SuperEntity.java index 8906d69..d59c2c8 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/config/dao/SuperEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/config/dao/SuperEntity.java @@ -11,7 +11,7 @@ public class SuperEntity> extends Model { private static final long serialVersionUID = 1L; - @TableId("uid") + @TableId("id") private String uid; @Override diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java b/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java index 31fe3bc..11769c8 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/ChapterEntity.java @@ -19,6 +19,14 @@ public class ChapterEntity extends SuperEntity implements Seriali private String parentId; + private String parentIds; + + private Integer treeLevel; + + private Boolean treeLeaf; + + private String bookId; + private Integer questionNum; private Integer pointNum; @@ -76,4 +84,36 @@ public class ChapterEntity extends SuperEntity implements Seriali this.orders = orders; } + public String getBookId() { + return bookId; + } + + public void setBookId(String bookId) { + this.bookId = bookId; + } + + public String getParentIds() { + return parentIds; + } + + public void setParentIds(String parentIds) { + this.parentIds = parentIds; + } + + public Integer getTreeLevel() { + return treeLevel; + } + + public void setTreeLevel(Integer treeLevel) { + this.treeLevel = treeLevel; + } + + public Boolean getTreeLeaf() { + return treeLeaf; + } + + public void setTreeLeaf(Boolean treeLeaf) { + this.treeLeaf = treeLeaf; + } + } \ No newline at end of file diff --git a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java index ac0be5f..047c7a7 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java +++ b/tamguo-crawler/src/main/java/com/tamguo/model/vo/SubjectVo.java @@ -47,6 +47,9 @@ public class SubjectVo { @PageFieldSelect(cssQuery = ".bd-content .question-box .question-box-inner .view-analyse a", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") private List questionUrls; + @PageFieldSelect(cssQuery = ".nexttolearn .next-inner .tolearn", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:href") + private String nextQuestionPage; + // 单个题目数据 @PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML) private String content; @@ -183,4 +186,12 @@ public class SubjectVo { this.answer = answer; } + public String getNextQuestionPage() { + return nextQuestionPage; + } + + public void setNextQuestionPage(String nextQuestionPage) { + this.nextQuestionPage = nextQuestionPage; + } + } diff --git a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java index 4d37f18..ea611b9 100644 --- a/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java +++ b/tamguo-crawler/src/main/java/com/tamguo/service/impl/BookService.java @@ -1,74 +1,224 @@ package com.tamguo.service.impl; +import com.baomidou.mybatisplus.plugins.Page; +import com.tamguo.config.redis.CacheService; import com.tamguo.dao.ChapterMapper; +import com.tamguo.dao.CourseMapper; +import com.tamguo.dao.CrawlerQuestionMapper; +import com.tamguo.dao.QuestionMapper; +import com.tamguo.dao.SubjectMapper; import com.tamguo.model.ChapterEntity; -import com.tamguo.model.vo.ChapterVo; +import com.tamguo.model.CourseEntity; +import com.tamguo.model.CrawlerQuestionEntity; +import com.tamguo.model.QuestionEntity; +import com.tamguo.model.SubjectEntity; +import com.tamguo.model.enums.QuestionType; +import com.tamguo.model.vo.QuestionVo; import com.tamguo.service.IBookService; import com.xuxueli.crawler.XxlCrawler; +import com.xuxueli.crawler.conf.XxlCrawlerConf; import com.xuxueli.crawler.parser.PageParser; +import com.xuxueli.crawler.parser.strategy.HtmlUnitPageLoader; +import com.xuxueli.crawler.rundata.RunData; +import com.xuxueli.crawler.util.FileUtil; + +import java.io.File; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import java.util.List; -import java.util.UUID; - @Service public class BookService implements IBookService { - - @Autowired - ChapterMapper chapterMapper; - - private Logger logger = LoggerFactory.getLogger(getClass()); + private RunData runData; + @Autowired + QuestionMapper questionMapper; + @Autowired + CrawlerQuestionMapper crawlerQuestionMapper; + @Autowired + ChapterMapper chapterMapper; + @Autowired + CourseMapper courseMapper; + @Autowired + SubjectMapper subjectMapper; + @Autowired + CacheService cacheService; + private static final String FILES_NO_FORMAT = "0000000"; + private static final String FILES_PREFIX = "LIKESHUXUE"; + private static final String DOMAIN = "http://www.tamguo.com"; @Override public void crawlerBook() { - XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-27-jiaocai-11") - .setAllowSpread(false) - .setFailRetryCount(5) - .setThreadCount(20) - .setPageParser(new PageParser() { - @Override - public void parse(Document html, Element pageVoElement, ChapterVo chapterVo) { - // 解析封装 PageVo 对象 - String parentName = chapterVo.getName(); - ChapterEntity chapterEntity = new ChapterEntity(); - String uid = UUID.randomUUID().toString().replace("-", ""); - chapterEntity.setUid(uid); - chapterEntity.setName(parentName); - chapterEntity.setCourseId("0"); - chapterEntity.setCourseId("0"); - chapterEntity.setParentId("-1"); - chapterEntity.setQuestionNum(0); - chapterEntity.setPointNum(0); - chapterMapper.insert(chapterEntity); + XxlCrawler crawler = new XxlCrawler.Builder() + .setAllowSpread(false) + .setThreadCount(20) + .setFailRetryCount(5) + .setPageLoader(new HtmlUnitPageLoader()) + .setPageParser(new PageParser() { + + @Override + public void parse(Document html, Element pageVoElement, QuestionVo questionVo) { + if(StringUtils.isEmpty(questionVo.getContent())) { + runData.addUrl(html.baseUri()); + return; + } + CrawlerQuestionEntity condition = new CrawlerQuestionEntity(); + condition.setQuestionUrl(html.baseUri()); + CrawlerQuestionEntity crawlerQuestion = crawlerQuestionMapper.selectOne(condition); + ChapterEntity chapter = chapterMapper.selectById(crawlerQuestion.getChapterId()); + CourseEntity course = courseMapper.selectById(chapter.getCourseId()); + SubjectEntity subject = subjectMapper.selectById(course.getSubjectId()); + + QuestionType questionType = QuestionType.getQuestionType(questionVo.getQuestionType()); + - List sonChapters = chapterVo.getSonChapters(); - sonChapters.forEach(s -> { - ChapterEntity sonChapterEntity = new ChapterEntity(); - sonChapterEntity.setName(s); - sonChapterEntity.setCourseId("0"); - sonChapterEntity.setCourseId("0"); - sonChapterEntity.setParentId(uid); - sonChapterEntity.setQuestionNum(0); - sonChapterEntity.setPointNum(0); - chapterMapper.insert(sonChapterEntity); - }); + QuestionEntity question = new QuestionEntity(); + if(questionType == QuestionType.DANXUANTI) { + if(!StringUtils.isEmpty(questionVo.getQueoptions())) { + question.setContent(questionVo.getContent() + questionVo.getQueoptions()); + }else { + question.setContent(questionVo.getContent()); + } + }else { + question.setContent(questionVo.getContent()); + } + question.setAnalysis(questionVo.getAnalysis()); + if(StringUtils.isEmpty(question.getAnswer())) { + question.setAnalysis("


"); + } + question.setAnswer(questionVo.getAnswer()); + question.setAuditStatus("1"); + question.setChapterId(chapter.getUid()); + question.setCourseId(course.getUid()); + question.setPaperId(null); + question.setQuestionType(questionType.getValue().toString()); + if(questionVo.getReviewPoint() != null && questionVo.getReviewPoint().size() > 0) { + question.setReviewPoint(StringUtils.join(questionVo.getReviewPoint().toArray(), ",")); + } + // 处理分数 + if(questionVo.getScore() != null) { + if(questionVo.getScore().contains("分")) { + question.setScore(questionVo.getScore()); + } + if(questionVo.getScore().contains("年")) { + question.setYear(questionVo.getScore()); + } + } + if(questionVo.getYear() != null) { + if(questionVo.getYear().contains("年")) { + question.setYear(questionVo.getYear()); + } + } + question.setSubjectId(subject.getUid()); + + if (questionVo.getAnswerImages()!=null && questionVo.getAnswerImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getAnswerImages()); + for (String img: imagesSet) { - } + // 下载图片文件 + String fileName = getFileName(img); + File dir = new File(getFilePath()); + if (!dir.exists()) + dir.mkdirs(); + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + questionVo.setAnswer(questionVo.getAnswer().replace(img, DOMAIN + getFilePath() + fileName)); + } + question.setAnswer(questionVo.getAnswer()); + } + + if (questionVo.getAnalysisImages()!=null && questionVo.getAnalysisImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getAnalysisImages()); + for (String img: imagesSet) { + // 下载图片文件 + String fileName = getFileName(img); + File dir = new File(getFilePath()); + if (!dir.exists()) + dir.mkdirs(); + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + questionVo.setAnalysis(questionVo.getAnalysis().replace(img, DOMAIN + getFilePath() + fileName)); + } + question.setAnalysis(questionVo.getAnalysis()); + } + + if (questionVo.getContentImages()!=null && questionVo.getContentImages().size() > 0) { + Set imagesSet = new HashSet<>(questionVo.getContentImages()); + for (String img: imagesSet) { -// } - }).build(); + // 下载图片文件 + String fileName = getFileName(img); + File dir = new File(getFilePath()); + if (!dir.exists()) + dir.mkdirs(); + boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, getFilePath(), fileName); + System.out.println("down images " + (ret?"success":"fail") + ":" + img); + + // 替换URL + questionVo.setContent(questionVo.getContent().replace(img, DOMAIN + getFilePath() + fileName)); + } + question.setContent(questionVo.getContent()); + } + + + // 处理图片 + question.setSourceType("baidu"); + question.setSourceUrl(html.baseUri()); + questionMapper.insert(question); + } + + public String getFileName(String img) { + return getFileNo() + img.substring(img.lastIndexOf(".")); + } + + private String getFilePath() { + SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + String format = sdf.format(new Date()); + return "/images/question/" + format + "/"; + } - // 获取科目 - crawler.start(true); - } + private String getFileNo() { + SimpleDateFormat sdf = new SimpleDateFormat("ddHHmm"); + String format = sdf.format(new Date()); + DecimalFormat df = new DecimalFormat(FILES_NO_FORMAT); + String key = FILES_PREFIX + format; + Long incr = cacheService.incr(key); + String avatorNo = FILES_PREFIX + df.format(incr); + return avatorNo; + } + }).build(); + + runData = crawler.getRunData(); + int page = 1; + int pageSize = 1000; + while(true) { + Page questionPage = new Page(page , pageSize); + List questionList = crawlerQuestionMapper.queryPageOrderUid(questionPage); + for(int i=0 ;i urls = new HashSet<>(); - private Set questionUrls = new HashSet(); private Map chapterQuestionListMap = new HashMap<>(); @@ -53,7 +52,7 @@ public class SubjectService implements ISubjectService{ @Override public void crawlerSubject() { XxlCrawler crawler = new XxlCrawler.Builder() - .setUrls("https://tiku.baidu.com/") + .setUrls("https://tiku.baidu.com/tikupc/chapterlist/1bfd700abb68a98271fefa04-16-knowpoint-11") .setAllowSpread(false) .setFailRetryCount(5) .setThreadCount(20) @@ -63,68 +62,8 @@ public class SubjectService implements ISubjectService{ public void parse(Document html, Element pageVoElement, SubjectVo subjectVo) { // 解析封装 PageVo 对象 String pageUrl = html.baseUri(); - if(pageUrl.equals("https://tiku.baidu.com/")) { - logger.info("开始解析考试分类:{}" , pageUrl); - for(int i=0 ; i 0) { String questionUrl = maskList.get(0).getElementsByTag("a").attr("abs:href"); - questionUrl = questionUrl.replace("1-5", "1-1000"); + // questionUrl = questionUrl.replace("1-5", "1-20"); chapterQuestionListMap.put(questionUrl, chapter2); runData.addUrl(questionUrl); @@ -200,34 +160,19 @@ public class SubjectService implements ISubjectService{ } } - // 剔除已经爬取的数据 - urls.add(pageUrl); - logger.info("url:{}" ,pageUrl ); - logger.info("subjectVo:{}" ,JSONObject.toJSON(subjectVo) ); - if(subjectVo.getChapterUrls() != null) { - for(String url : subjectVo.getChapterUrls()) { - if(url.equals(pageVoElement.getElementsByClass("main-inner").get(0).getElementsByClass("selected").get(0).getElementsByTag("a").attr("abs:href"))) { - continue; - } - if(!urls.contains(url)) { - runData.addUrl(url); - } - } - } } if(pageUrl.contains("https://tiku.baidu.com/tikupc/chapterdetail")) { // 加入待解析题目列表 logger.info("url : {}" , pageUrl); logger.info("subjectVo : {}" , JSONObject.toJSON(subjectVo)); + ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl); for(String questionUrl : subjectVo.getQuestionUrls()) { if(!questionUrls.contains(questionUrl)) { // 处理URL // runData.addUrl(questionUrl); questionUrls.add(questionUrl); - ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionListMap.get(pageUrl); - CrawlerQuestionEntity condition = new CrawlerQuestionEntity(); condition.setQuestionUrl(questionUrl); if(crawlerQuestionMapper.selectOne(condition) == null) { @@ -241,12 +186,12 @@ public class SubjectService implements ISubjectService{ } } } + if(!StringUtils.isEmpty(subjectVo.getNextQuestionPage())) { + runData.addUrl(subjectVo.getNextQuestionPage()); + chapterQuestionListMap.put(subjectVo.getNextQuestionPage(), chapterEntity); + } } - /*if(pageUrl.contains("https://tiku.baidu.com/tikupc/singledetail")) { - ChapterEntity chapterEntity = (ChapterEntity) chapterQuestionMap.get(pageUrl); - System.out.println(chapterEntity); - }*/ } }).build(); diff --git a/tamguo-crawler/src/main/resources/application.properties b/tamguo-crawler/src/main/resources/application.properties index ad5ea06..42c4a05 100644 --- a/tamguo-crawler/src/main/resources/application.properties +++ b/tamguo-crawler/src/main/resources/application.properties @@ -15,7 +15,7 @@ spring.datasource.testOnReturn=false spring.datasource.testWhileIdle=true spring.datasource.timeBetweenEvictionRunsMillis=60000 spring.datasource.type=com.alibaba.druid.pool.DruidDataSource -spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tiku?useUnicode=true&characterEncoding=UTF-8&useSSL=false +spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tamguo?useUnicode=true&characterEncoding=UTF-8&useSSL=false spring.datasource.username=root spring.datasource.validationQuery=SELECT 1 FROM DUAL diff --git a/tamguo-crawler/src/main/resources/redis.xml b/tamguo-crawler/src/main/resources/redis.xml index 4055e0d..ed04f74 100644 --- a/tamguo-crawler/src/main/resources/redis.xml +++ b/tamguo-crawler/src/main/resources/redis.xml @@ -4,6 +4,6 @@ - + \ No newline at end of file diff --git a/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/IChapterService.java b/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/IChapterService.java index eb563dc..db6dc91 100644 --- a/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/IChapterService.java +++ b/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/IChapterService.java @@ -1,8 +1,13 @@ package com.tamguo.modules.tiku.service; +import java.util.List; + import com.baomidou.mybatisplus.service.IService; import com.tamguo.modules.tiku.model.ChapterEntity; public interface IChapterService extends IService{ + + // 获取科目章节 + public List findChapterTree(String bookId); } diff --git a/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/impl/ChapterServiceImpl.java b/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/impl/ChapterServiceImpl.java index 850b6d6..285ca7a 100644 --- a/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/impl/ChapterServiceImpl.java +++ b/tamguo-modules-core/src/main/java/com/tamguo/modules/tiku/service/impl/ChapterServiceImpl.java @@ -1,12 +1,70 @@ package com.tamguo.modules.tiku.service.impl; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +import com.baomidou.mybatisplus.mapper.Condition; import com.baomidou.mybatisplus.service.impl.ServiceImpl; +import com.tamguo.common.utils.SystemConstant; import com.tamguo.modules.tiku.dao.ChapterMapper; import com.tamguo.modules.tiku.model.ChapterEntity; import com.tamguo.modules.tiku.service.IChapterService; @Service public class ChapterServiceImpl extends ServiceImpl implements IChapterService{ + + @Transactional(readOnly=false) + @SuppressWarnings("unchecked") + @Override + public List findChapterTree(String bookId) { + List chapterList = baseMapper.selectList(Condition.create().eq("book_id", bookId)); + + // 获取根chapter UID + String rootUid = StringUtils.EMPTY; + for(int i=0 ; i entitys = new ArrayList<>(); + for(int i=0 ; i childs = new ArrayList<>(); + for(int k=0 ; k childs = entitys.get(i).getChildChapterList(); + for(int k=0 ; k tmpChilds = new ArrayList<>(); + for(int n=0 ; n 0) { book = bookList.get(0); - chapterList = iChapterService.selectList(Condition.create().eq("book_id", book.getId())); + chapterList = iChapterService.findChapterTree(book.getId()); } SubjectEntity subject = iSubjectService.selectById(course.getSubjectId()); List courseList = iCourseService.selectList(Condition.create().eq("subject_id", course.getSubjectId()));