优化链接

main
tamguo 7 years ago
parent a4c404fff3
commit 4d29895ea6

@ -1,5 +1,5 @@
domain.name=http://localhost/
server.port=80
domain.name=http://admin.tamguo.com/
server.port=8082
jasypt.encryptor.password=tamguo
spring.datasource.connectionProperties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
@ -11,14 +11,14 @@ spring.datasource.maxPoolPreparedStatementPerConnectionSize=20
spring.datasource.maxWait=60000
spring.datasource.minEvictableIdleTimeMillis=300000
spring.datasource.minIdle=5
spring.datasource.password=Tanguo
spring.datasource.password=
spring.datasource.poolPreparedStatements=true
spring.datasource.testOnBorrow=false
spring.datasource.testOnReturn=false
spring.datasource.testWhileIdle=true
spring.datasource.timeBetweenEvictionRunsMillis=60000
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tamguo?useUnicode=true&characterEncoding=UTF-8&useSSL=false
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/tiku?useUnicode=true&characterEncoding=UTF-8&useSSL=false
spring.datasource.username=root
spring.datasource.validationQuery=SELECT 1 FROM DUAL
@ -46,7 +46,7 @@ spring.thymeleaf.encoding=UTF-8
spring.thymeleaf.content-type=text/html
spring.thymeleaf.cache=false
redis.hostname=47.100.175.14
redis.hostname=127.0.0.1
redis.port=6379
redis.password=

@ -4,6 +4,6 @@
<pool maxActive="50" maxIdle="20" maxWait="1000" />
<servers>
<!-- test -->
<server ip="47.100.175.14" port="6379"/>
<server ip="127.0.0.1" port="6379"/>
</servers>
</redis>

@ -0,0 +1,54 @@
package com.tamguo.model.enums;
import java.io.Serializable;
/**
 * Kinds of question handled by the question bank.
 *
 * <p>Every constant carries a short code ({@code "1"} to {@code "5"}) and the Chinese label
 * that names the kind of question.
 *
 * @author tamguo
 */
public enum QuestionType {

    /** Single-choice question. */
    DANXUANTI("1", "单选题"),
    /** Multiple-choice question. */
    DUOXUANTI("2", "多选题"),
    /** Fill-in-the-blank question. */
    TIANKONGTI("3", "填空题"),
    /** True/false question. */
    PANDUANTI("4", "判断题"),
    /** Free-form (essay) question. */
    WENDATI("5", "问答题");

    /** Short code of this type; also what {@link #toString()} returns. */
    private final String value;

    /** Chinese label of this type. */
    private final String desc;

    QuestionType(final String value, final String desc) {
        this.value = value;
        this.desc = desc;
    }

    /**
     * Resolves a constant from its Chinese label.
     *
     * <p>Despite the parameter name, the argument is compared with each constant's label
     * ({@link #getDesc()}), not with its code.
     *
     * @param value the label to look up, for example {@code "多选题"}; may be {@code null}
     * @return the constant whose label equals {@code value}, or {@link #DANXUANTI} when nothing
     *         matches (including {@code null})
     */
    public static QuestionType getQuestionType(String value) {
        // Walk the constants instead of repeating each label literal, so the lookup cannot
        // drift from the declarations above.
        for (QuestionType candidate : values()) {
            if (candidate.desc.equals(value)) {
                return candidate;
            }
        }
        return DANXUANTI;
    }

    /**
     * Returns the short code of this type.
     *
     * @return the code string, typed as {@link Serializable}
     */
    public Serializable getValue() {
        return this.value;
    }

    /**
     * Returns the Chinese label of this type.
     *
     * @return the label
     */
    public String getDesc() {
        return this.desc;
    }

    /**
     * Returns the short code, identical to {@link #getValue()}.
     */
    @Override
    public String toString() {
        return this.value;
    }
}

@ -11,6 +11,12 @@ public class QuestionVo {
@PageFieldSelect(cssQuery=".question-box-inner .questem-inner", selectType = XxlCrawlerConf.SelectType.HTML)
private String content;
@PageFieldSelect(cssQuery = ".question-box-inner .questem-inner img", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "abs:src")
private List<String> contentImages;
@PageFieldSelect(cssQuery=".queoptions-inner", selectType = XxlCrawlerConf.SelectType.HTML)
private String queoptions;
@PageFieldSelect(cssQuery=".exam-answer-content", selectType = XxlCrawlerConf.SelectType.HTML)
private String answer;
@ -35,21 +41,6 @@ public class QuestionVo {
@PageFieldSelect(cssQuery=".kpoint-contain point point-item",selectType = XxlCrawlerConf.SelectType.TEXT)
private List<String> reviewPoint;
/**
 * Returns the question stem held by this VO.
 *
 * @return the content value, possibly {@code null}
 */
public String getContent() {
    return this.content;
}
/**
 * Replaces the question stem held by this VO.
 *
 * @param content the new content value
 */
public void setContent(final String content) {
    this.content = content;
}
/**
 * Returns the analysis value held by this VO.
 *
 * @return the analysis value, possibly {@code null}
 */
public String getAnalysis() {
    return this.analysis;
}
/**
 * Replaces the analysis value held by this VO.
 *
 * @param analysis the new analysis value
 */
public void setAnalysis(final String analysis) {
    this.analysis = analysis;
}
public String getQuestionType() {
return questionType;
@ -106,5 +97,37 @@ public class QuestionVo {
/**
 * Replaces the list of analysis image references held by this VO.
 *
 * @param analysisImages the new list
 */
public void setAnalysisImages(final List<String> analysisImages) {
    this.analysisImages = analysisImages;
}
/**
 * Returns the answer-options markup held by this VO.
 *
 * @return the options value, possibly {@code null}
 */
public String getQueoptions() {
    return this.queoptions;
}
/**
 * Replaces the answer-options markup held by this VO.
 *
 * @param queoptions the new options value
 */
public void setQueoptions(final String queoptions) {
    this.queoptions = queoptions;
}
/**
 * Returns the question stem held by this VO.
 *
 * @return the content value, possibly {@code null}
 */
public String getContent() {
    return this.content;
}
/**
 * Replaces the question stem held by this VO.
 *
 * @param content the new content value
 */
public void setContent(final String content) {
    this.content = content;
}
/**
 * Returns the image references collected for the question stem.
 *
 * @return the list of image references, possibly {@code null}
 */
public List<String> getContentImages() {
    return this.contentImages;
}
/**
 * Replaces the image references collected for the question stem.
 *
 * @param contentImages the new list
 */
public void setContentImages(final List<String> contentImages) {
    this.contentImages = contentImages;
}
/**
 * Returns the analysis value held by this VO.
 *
 * @return the analysis value, possibly {@code null}
 */
public String getAnalysis() {
    return this.analysis;
}
/**
 * Replaces the analysis value held by this VO.
 *
 * @param analysis the new analysis value
 */
public void setAnalysis(final String analysis) {
    this.analysis = analysis;
}
}

@ -1,3 +1,4 @@
domain.name=http://www.tamguo.com/
spring.datasource.connectionProperties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.filters=stat,wall,log4j

@ -0,0 +1,200 @@
package com.tamguo;
import java.io.File;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import com.baomidou.mybatisplus.plugins.Page;
import com.tamguo.config.redis.CacheService;
import com.tamguo.dao.ChapterMapper;
import com.tamguo.dao.CourseMapper;
import com.tamguo.dao.CrawlerQuestionMapper;
import com.tamguo.dao.QuestionMapper;
import com.tamguo.dao.SubjectMapper;
import com.tamguo.model.ChapterEntity;
import com.tamguo.model.CourseEntity;
import com.tamguo.model.CrawlerQuestionEntity;
import com.tamguo.model.QuestionEntity;
import com.tamguo.model.SubjectEntity;
import com.tamguo.model.vo.QuestionVo;
import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.parser.PageParser;
import com.xuxueli.crawler.parser.strategy.HtmlUnitPageLoader;
import com.xuxueli.crawler.rundata.RunData;
import com.xuxueli.crawler.util.FileUtil;
/**
 * Spring Boot test that drives a one-off crawl of individual question pages.
 *
 * <p>The URLs to visit are read page by page through {@link CrawlerQuestionMapper}. For every
 * parsed page the images referenced by the answer, analysis and content are fetched with
 * {@code FileUtil.downFile}, their URLs in the markup are rewritten to point at {@link #DOMAIN},
 * and the result is inserted through {@link QuestionMapper}.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
public class SingleQuestionCrawler {

    private RunData runData;

    @Autowired
    CrawlerQuestionMapper crawlerQuestionMapper;
    @Autowired
    ChapterMapper chapterMapper;
    @Autowired
    CourseMapper courseMapper;
    @Autowired
    SubjectMapper subjectMapper;
    @Autowired
    CacheService cacheService;
    @Autowired
    QuestionMapper questionMapper;

    /** Zero-padded pattern applied to the per-day file counter. */
    private static final String FILES_NO_FORMAT = "000000";
    /** Prefix of generated file names and of the per-day counter key. */
    private static final String FILES_PREFIX = "FP";

    /** Base URL prepended to rewritten image paths; injected from {@code domain.name}. */
    @Value(value="${domain.name}")
    public String DOMAIN;

    /**
     * Builds the crawler, seeds it with every stored question URL and runs it.
     *
     * @throws Exception propagated from the mappers or the crawler
     */
    @Test
    public void crawlerSubject() throws Exception {
        XxlCrawler crawler = new XxlCrawler.Builder()
            .setAllowSpread(false)
            .setThreadCount(20)
            .setPageLoader(new HtmlUnitPageLoader())
            .setPageParser(new PageParser<QuestionVo>() {

                /**
                 * Converts one parsed page into a {@link QuestionEntity} and inserts it.
                 * Pages without a matching crawler record, chapter, course or subject are
                 * skipped with a console message instead of failing with a
                 * {@code NullPointerException}.
                 */
                @Override
                public void parse(Document html, Element pageVoElement, QuestionVo questionVo) {
                    CrawlerQuestionEntity condition = new CrawlerQuestionEntity();
                    condition.setQuestionUrl(html.baseUri());
                    CrawlerQuestionEntity crawlerQuestion = crawlerQuestionMapper.selectOne(condition);
                    if (crawlerQuestion == null) {
                        System.out.println("skip question, no crawler record for " + html.baseUri());
                        return;
                    }
                    ChapterEntity chapter = chapterMapper.selectById(crawlerQuestion.getChapterId());
                    if (chapter == null) {
                        System.out.println("skip question, chapter not found for " + html.baseUri());
                        return;
                    }
                    CourseEntity course = courseMapper.selectById(chapter.getCourseId());
                    if (course == null) {
                        System.out.println("skip question, course not found for " + html.baseUri());
                        return;
                    }
                    SubjectEntity subject = subjectMapper.selectById(course.getSubjectId());
                    if (subject == null) {
                        System.out.println("skip question, subject not found for " + html.baseUri());
                        return;
                    }

                    // Rewrite image URLs first so the entity is filled from the final markup.
                    questionVo.setAnswer(localizeImages(questionVo.getAnswer(), questionVo.getAnswerImages()));
                    questionVo.setAnalysis(localizeImages(questionVo.getAnalysis(), questionVo.getAnalysisImages()));
                    questionVo.setContent(localizeImages(questionVo.getContent(), questionVo.getContentImages()));

                    QuestionEntity question = new QuestionEntity();
                    question.setAnalysis(questionVo.getAnalysis());
                    question.setAnswer(questionVo.getAnswer());
                    question.setAuditStatus("1");
                    question.setChapterId(chapter.getUid());
                    question.setContent(questionVo.getContent());
                    question.setCourseId(course.getUid());
                    question.setPaperId(null);
                    // NOTE(review): the type is always stored as "1", whatever the page reports.
                    // Confirm whether the parsed question type should be mapped here instead.
                    question.setQuestionType("1");
                    if (questionVo.getReviewPoint() != null && questionVo.getReviewPoint().size() > 0) {
                        question.setReviewPoint(StringUtils.join(questionVo.getReviewPoint().toArray(), ","));
                    }
                    question.setScore(questionVo.getScore());
                    question.setSubjectId(subject.getUid());
                    question.setYear(questionVo.getYear());
                    questionMapper.insert(question);
                }

                /**
                 * Downloads every distinct image and replaces its URL inside {@code markup}
                 * with the locally served one.
                 *
                 * @param markup the HTML that references the images; may be {@code null}
                 * @param images image URLs found in the markup; may be {@code null} or empty
                 * @return the markup with image URLs rewritten, or {@code markup} unchanged when
                 *         there is nothing to rewrite
                 */
                private String localizeImages(String markup, List<String> images) {
                    if (images == null || images.isEmpty()) {
                        return markup;
                    }
                    // One day stamp for both paths, so the directory on disk and the public URL
                    // cannot disagree when the date rolls over.
                    String day = today();
                    String localDir = getFilePath(day);
                    String publicDir = DOMAIN + getFilePaths(day);
                    File dir = new File(localDir);
                    if (!dir.exists()) {
                        dir.mkdirs();
                    }
                    String rewritten = markup;
                    Set<String> imagesSet = new HashSet<>(images);
                    for (String img : imagesSet) {
                        // An empty target would make String.replace insert the replacement
                        // between every character of the markup, so blank entries are skipped.
                        if (img == null || img.isEmpty()) {
                            continue;
                        }
                        String fileName = getFileName(img);
                        boolean ret = FileUtil.downFile(img, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, localDir, fileName);
                        System.out.println("down images " + (ret?"success":"fail") + "" + img);
                        if (rewritten != null) {
                            rewritten = rewritten.replace(img, publicDir + fileName);
                        }
                    }
                    return rewritten;
                }

                /**
                 * Builds a file name from the next file number plus the extension of
                 * {@code img}.
                 *
                 * @param img the image URL
                 * @return the generated name; it has no extension when the last path segment
                 *         of {@code img} contains no dot
                 */
                public String getFileName(String img) {
                    int dot = img.lastIndexOf(".");
                    // A dot before the last '/' belongs to the host or a directory, not to the
                    // file name.
                    if (dot < 0 || dot < img.lastIndexOf("/")) {
                        return getFileNo();
                    }
                    return getFileNo() + img.substring(dot);
                }

                /** Returns the current date formatted as {@code yyyyMMdd}. */
                private String today() {
                    return new SimpleDateFormat("yyyyMMdd").format(new Date());
                }

                /** Returns the directory on disk that receives the files of the given day. */
                private String getFilePath(String day) {
                    return "/home/webdata/files/" + day + "/";
                }

                /** Returns the URL path under which the files of the given day are referenced. */
                private String getFilePaths(String day) {
                    return "/files/" + day + "/";
                }

                /**
                 * Returns the next file number: the prefix followed by the zero-padded result
                 * of incrementing the per-day counter in the cache service.
                 */
                private String getFileNo() {
                    DecimalFormat df = new DecimalFormat(FILES_NO_FORMAT);
                    String key = FILES_PREFIX + today();
                    Long incr = cacheService.incr(key);
                    return FILES_PREFIX + df.format(incr);
                }
            }).build();

        runData = crawler.getRunData();
        int page = 1;
        int pageSize = 100;
        while (true) {
            Page<CrawlerQuestionEntity> questionPage = new Page<CrawlerQuestionEntity>(page, pageSize);
            List<CrawlerQuestionEntity> questionList = crawlerQuestionMapper.queryPageOrderUid(questionPage);
            for (CrawlerQuestionEntity entity : questionList) {
                runData.addUrl(entity.getQuestionUrl());
            }
            page++;
            // A short page means the last page was reached; compare with pageSize so the check
            // follows the page size if it is ever changed.
            if (questionList.size() < pageSize) {
                break;
            }
        }
        crawler.start(true);
    }
}

@ -1,6 +1,6 @@
domain.name=http://localhost/
domain.name=http://www.tamguo.com/
admin.domain.name=http://admin.tanguoguo.com
server.port=80
server.port=8081
jasypt.encryptor.password=tamguo
spring.datasource.connectionProperties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
@ -12,14 +12,14 @@ spring.datasource.maxPoolPreparedStatementPerConnectionSize=20
spring.datasource.maxWait=60000
spring.datasource.minEvictableIdleTimeMillis=300000
spring.datasource.minIdle=5
spring.datasource.password=Tanguo
spring.datasource.password=
spring.datasource.poolPreparedStatements=true
spring.datasource.testOnBorrow=false
spring.datasource.testOnReturn=false
spring.datasource.testWhileIdle=true
spring.datasource.timeBetweenEvictionRunsMillis=60000
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.url=jdbc:mysql://47.100.175.14:3306/tamguo?useUnicode=true&characterEncoding=UTF-8&useSSL=false
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/tiku?useUnicode=true&characterEncoding=UTF-8&useSSL=false
spring.datasource.username=root
spring.datasource.validationQuery=SELECT 1 FROM DUAL
@ -46,7 +46,7 @@ spring.thymeleaf.encoding=UTF-8
spring.thymeleaf.content-type=text/html
spring.thymeleaf.cache=false
redis.hostname=47.100.175.14
redis.hostname=127.0.0.1
redis.port=6379
redis.password=

@ -4,6 +4,6 @@
<pool maxActive="50" maxIdle="20" maxWait="1000" />
<servers>
<!-- test -->
<server ip="47.100.175.14" port="6379"/>
<server ip="127.0.0.1" port="6379"/>
</servers>
</redis>

@ -106,14 +106,14 @@
<p class="title">试卷资源
<span class="detail">一考知底,高分必刷,全面提分</span>
<span class="area-content">当前位置:
<a class="more-area-link" th:href="${setting.domain+'paperlist/1014091031592214529/0-0-0-1-1.html'}" target="_blank">更多地区 &gt; </a>
<a class="more-area-link" th:href="${setting.domain+'paperlist/1014440449138954241/0-0-0-1-1.html'}" target="_blank">更多地区 &gt; </a>
</span>
</p>
<div class="paper-main">
<div class="paper-box zhenti-box homepage-zhenti-box">
<div class="paper-list-wrap">
<h3 class="paper-title">历年真题
<a class="list-more-link" th:href="${setting.domain+'paperlist/1014091031592214529/0-1-0-0-1.html'}">更多真题试卷 &gt;</a>
<a class="list-more-link" th:href="${setting.domain+'paperlist/1014440449138954241/0-1-0-0-1.html'}">更多真题试卷 &gt;</a>
</h3>
<ul class="paper-list paper-list-zhenti">
<li class="list-item" th:each="p,pStat:${historyPaperList}">
@ -130,7 +130,7 @@
<div class="paper-box moni-box homepage-moni-box">
<div class="paper-list-wrap">
<h3 class="paper-title">模拟试卷
<a class="list-more-link" th:href="${setting.domain+'paperlist/1014091031592214529/0-2-0-0-1.html'}">更多模拟试卷 &gt;</a>
<a class="list-more-link" th:href="${setting.domain+'paperlist/1014440449138954241/0-2-0-0-1.html'}">更多模拟试卷 &gt;</a>
</h3>
<ul class="paper-list paper-list-moni">
<li class="list-item" th:each="p,pStat:${simulationPaperList}">
@ -146,7 +146,7 @@
</div>
<div class="homepage-paper-box shiti-box">
<div class="hotpaper-list-wrap">
<h3 class="hotpaper-title">热门试卷<a class="list-more-link" th:href="${setting.domain+'paperlist/1014091031592214529/0-0-0-1-1.html'}">更多热门试卷 &gt; </a>
<h3 class="hotpaper-title">热门试卷<a class="list-more-link" th:href="${setting.domain+'paperlist/1014440449138954241/0-0-0-1-1.html'}">更多热门试卷 &gt; </a>
</h3>
<ul class="hotpaper-list">
<li class="hotpaper-list-item" th:each="p,pStat:${hotPaperList}">
@ -162,12 +162,12 @@
<div class="school-paper-main">
<div class="school-paper-container homepage-school-container">
<h3 class="school-container-title">名校精品试卷
<a class="school-paper-more-link" th:href="${setting.domain + 'paperlist/1014091031592214529/0-4-0-0-1.html'}">更多名校精品卷 &gt; </a>
<a class="school-paper-more-link" th:href="${setting.domain + 'paperlist/1014440449138954241/0-4-0-0-1.html'}">更多名校精品卷 &gt; </a>
</h3>
<div class="school-list">
<div class="school-list-item " th:each="s,sStat:${eliteSchoolPaperList}">
<div class="school-wrap school-wrap-bg1">
<a class="famous-school-link" th:href="${setting.domain + 'paperlist/1014091031592214529/0-4-0-0-1.html'}" data-schoolid="924cf7ec4afe04a1b071de05">
<a class="famous-school-link" th:href="${setting.domain + 'paperlist/1014440449138954241/0-4-0-0-1.html'}" data-schoolid="924cf7ec4afe04a1b071de05">
<div class="school-info">
<p class="name" th:text="${s.name}">北京大学附属中学</p>
<p class="info">
@ -193,7 +193,7 @@
</div>
<div class="more-school-list">
<a class="more-school-name" th:each="s,sStat:${eliteSchoolList}" th:text="${s.name}" th:href="${setting.domain + 'paperlist/1014091031592214529/0-4-0-0-1.html'}">
<a class="more-school-name" th:each="s,sStat:${eliteSchoolList}" th:text="${s.name}" th:href="${setting.domain + 'paperlist/1014440449138954241/0-4-0-0-1.html'}">
北京市八一学校
</a>
</div>

Loading…
Cancel
Save