You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
git/scr/模糊专家系统/scrape/scrape.js

37 lines
1.7 KiB

let request = require('request').defaults({ 'proxy': "http://127.0.0.1:1080" });
let Queue = require('promise-queue');
let fs = require('fs');
let path = require('path');
let queue = new Queue(10); //最多同时10线程采集
request('https://www.dxomark.com/daksensor/ajax/jsontested', //获得相机列表
(error, response, body) => {
let cameraList = JSON.parse(body).data;
let finishedCount = 0;
cameraList.forEach(cameraMeta => {
let camera = Object.assign({}, cameraMeta);
let link = `https://www.dxomark.com${camera.link}---Specifications`; //拼合对应的规格网址
queue.add(() => new Promise(res => { //把request放入队列以保证同时http请求不超过10个
let doRequest = () => {
request(link, (error, response, body) => {
if (error) { //如果失败则重试
console.log("retrying.." + camera.name);
doRequest();
return;
}
let specMatcherRegexp = /descriptifgauche.+?>([\s\S]+?)<\/td>[\s\S]+?descriptif_data.+?>([\s\S]+?)<\/td>/img;
let match = specMatcherRegexp.exec(body);
while (match) { //用正则表达式匹配table里的每一个项目并添加至采集结果中
camera[match[1]] = match[2];
match = specMatcherRegexp.exec(body);
}
fs.writeFileSync(path.join('./scraped', camera.name + '.txt'), JSON.stringify(camera, null, 4), { encoding: "UTF8" });
finishedCount++;
console.log(`Finished ${finishedCount}/${cameraList.length}: ${camera.name}`);
res();
})
};
doRequest();
}));
});
});