利用NodeJS和PhantomJS抓取网站页面信息以及网站截图
安装PhantomJS
首先,去PhantomJS官网下载对应平台的版本,或者下载源代码自行编译。然后将PhantomJS配置进环境变量,输入
$ phantomjs
如果有反应,那么就可以进行下一步了。
利用PhantomJS进行简单截图
实际上phantomjs-node使用的也是HTTP或者Websocket来进行通讯,不过其依赖庞大,我们只想做一个简单的东西,暂时还是不考虑这个东东吧。
设计图
让我们开始吧
我们在第一版中选用HTTP进行实现。
首先利用cluster进行简单的进程守护(index.js):
module.exports = (function () {
"use strict"
var cluster = require('cluster')
, fs = require('fs');
if(!fs.existsSync('./snapshot')) {
fs.mkdirSync('./snapshot');
}
if (cluster.isMaster) {
cluster.fork();
cluster.on('exit', function (worker) {
console.log('Worker' + worker.id + ' died :(');
process.nextTick(function () {
cluster.fork();
});
})
} else {
require('./extract.js');
}
})();
然后利用connect做我们的对外API(extract.js):
module.exports = (function () {
"use strict"
var connect = require('connect')
, fs = require('fs')
, spawn = require('child_process').spawn
, jobMan = require('./lib/jobMan.js')
, bridge = require('./lib/bridge.js')
, pkg = JSON.parse(fs.readFileSync('./package.json'));
var app = connect()
.use(connect.logger('dev'))
.use('/snapshot', connect.static(__dirname + '/snapshot', { maxAge: pkg.maxAge }))
.use(connect.bodyParser())
.use('/bridge', bridge)
.use('/api', function (req, res, next) {
if (req.method !== "POST" || !req.body.campaignId) return next();
if (!req.body.urls || !req.body.urls.length) return jobMan.watch(req.body.campaignId, req, res, next);
var campaignId = req.body.campaignId
, imagesPath = './snapshot/' + campaignId + '/'
, urls = []
, url
, imagePath;
function _deal(id, url, imagePath) {
// just push into urls list
urls.push({
id: id,
url: url,
imagePath: imagePath
});
}
for (var i = req.body.urls.length; i--;) {
url = req.body.urls[i];
imagePath = imagesPath + i + '.png';
_deal(i, url, imagePath);
}
jobMan.register(campaignId, urls, req, res, next);
var snapshot = spawn('phantomjs', ['snapshot.js', campaignId]);
snapshot.stdout.on('data', function (data) {
console.log('stdout: ' + data);
});
snapshot.stderr.on('data', function (data) {
console.log('stderr: ' + data);
});
snapshot.on('close', function (code) {
console.log('snapshot exited with code ' + code);
});
})
.use(connect.static(__dirname + '/html', { maxAge: pkg.maxAge }))
.listen(pkg.port, function () { console.log('listen: ' + 'http://localhost:' + pkg.port); });
})();
这里我们引用了两个模块bridge和jobMan。
其中bridge是HTTP通讯桥梁,jobMan是工作管理器。我们通过campaignId来对应一个job,然后将job和response委托给jobMan管理。然后启动PhantomJS进行处理。
通讯桥梁负责接受或者返回job的相关信息,并交给jobMan(bridge.js):
module.exports = (function () {
"use strict"
var jobMan = require('./jobMan.js')
, fs = require('fs')
, pkg = JSON.parse(fs.readFileSync('./package.json'));
return function (req, res, next) {
if (req.headers.secret !== pkg.secret) return next();
// Snapshot APP can post url information
if (req.method === "POST") {
var body = JSON.parse(JSON.stringify(req.body));
jobMan.fire(body);
res.end('');
// Snapshot APP can get the urls should extract
} else {
var urls = jobMan.getUrls(req.url.match(/campaignId=([^&]*)(\s|&|$)/)[1]);
res.writeHead(200, {'Content-Type': 'application/json'});
res.statuCode = 200;
res.end(JSON.stringify({ urls: urls }));
}
};
})();
如果request method为POST,则我们认为PhantomJS正在给我们推送job的相关信息。而为GET时,则认为其要获取job的信息。
jobMan负责管理job,并发送目前得到的job信息通过response返回给client(jobMan.js):
module.exports = (function () {
"use strict"
var fs = require('fs')
, fetch = require('./fetch.js')
, _jobs = {};
function _send(campaignId){
var job = _jobs[campaignId];
if (!job) return;
if (job.waiting) {
job.waiting = false;
clearTimeout(job.timeout);
var finished = (job.urlsNum === job.finishNum)
, data = {
campaignId: campaignId,
urls: job.urls,
finished: finished
};
job.urls = [];
var res = job.res;
if (finished) {
_jobs[campaignId] = null;
delete _jobs[campaignId]
}
res.writeHead(200, {'Content-Type': 'application/json'});
res.statuCode = 200;
res.end(JSON.stringify(data));
}
}
function register(campaignId, urls, req, res, next) {
_jobs[campaignId] = {
urlsNum: urls.length,
finishNum: 0,
urls: [],
cacheUrls: urls,
res: null,
waiting: false,
timeout: null
};
watch(campaignId, req, res, next);
}
function watch(campaignId, req, res, next) {
_jobs[campaignId].res = res;
// 20s timeout
_jobs[campaignId].timeout = setTimeout(function () {
_send(campaignId);
}, 20000);
}
function fire(opts) {
var campaignId = opts.campaignId
, job = _jobs[campaignId]
, fetchObj = fetch(opts.html);
if (job) {
if (+opts.status && fetchObj.title) {
job.urls.push({
id: opts.id,
url: opts.url,
image: opts.image,
title: fetchObj.title,
description: fetchObj.description,
status: +opts.status
});
} else {
job.urls.push({
id: opts.id,
url: opts.url,
status: +opts.status
});
}
if (!job.waiting) {
job.waiting = true;
setTimeout(function () {
_send(campaignId);
}, 500);
}
job.finishNum ++;
} else {
console.log('job can not found!');
}
}
function getUrls(campaignId) {
var job = _jobs[campaignId];
if (job) return job.cacheUrls;
}
return {
register: register,
watch: watch,
fire: fire,
getUrls: getUrls
};
})();
这里我们用到fetch对html进行抓取其title和description,fetch实现比较简单(fetch.js):
module.exports = (function () {
"use strict"
return function (html) {
if (!html) return { title: false, description: false };
var title = html.match(/\<title\>(.*?)\<\/title\>/)
, meta = html.match(/\<meta\s(.*?)\/?\>/g)
, description;
if (meta) {
for (var i = meta.length; i--;) {
if(meta[i].indexOf('name="description"') > -1 || meta[i].indexOf('name="Description"') > -1){
description = meta[i].match(/content\=\"(.*?)\"/)[1];
}
}
}
(title && title[1] !== '') ? (title = title[1]) : (title = 'No Title');
description || (description = 'No Description');
return {
title: title,
description: description
};
};
})();
最后是PhantomJS运行的源代码,其启动后通过HTTP向bridge获取job信息,然后每完成job的其中一个url就通过HTTP返回给bridge(snapshot.js):
var webpage = require('webpage')
, args = require('system').args
, fs = require('fs')
, campaignId = args[1]
, pkg = JSON.parse(fs.read('./package.json'));
function snapshot(id, url, imagePath) {
var page = webpage.create()
, send
, begin
, save
, end;
page.viewportSize = { width: 1024, height: 800 };
page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
page.settings = {
javascriptEnabled: false,
loadImages: true,
userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0'
};
page.open(url, function (status) {
var data;
if (status === 'fail') {
data = [
'campaignId=',
campaignId,
'&url=',
encodeURIComponent(url),
'&id=',
id,
'&status=',
].join('');
postPage.open('http://localhost:' + pkg.port + '/bridge', 'POST', data, function () {});
} else {
page.render(imagePath);
var html = page.content;
// callback NodeJS
data = [
'campaignId=',
campaignId,
'&html=',
encodeURIComponent(html),
'&url=',
encodeURIComponent(url),
'&image=',
encodeURIComponent(imagePath),
'&id=',
id,
'&status=',
].join('');
postMan.post(data);
}
// release the memory
page.close();
});
}
var postMan = {
postPage: null,
posting: false,
datas: [],
len: 0,
currentNum: 0,
init: function (snapshot) {
var postPage = webpage.create();
postPage.customHeaders = {
'secret': pkg.secret
};
postPage.open('http://localhost:' + pkg.port + '/bridge?campaignId=' + campaignId, function () {
var urls = JSON.parse(postPage.plainText).urls
, url;
this.len = urls.length;
if (this.len) {
for (var i = this.len; i--;) {
url = urls[i];
snapshot(url.id, url.url, url.imagePath);
}
}
});
this.postPage = postPage;
},
post: function (data) {
this.datas.push(data);
if (!this.posting) {
this.posting = true;
this.fire();
}
},
fire: function () {
if (this.datas.length) {
var data = this.datas.shift()
, that = this;
this.postPage.open('http://localhost:' + pkg.port + '/bridge', 'POST', data, function () {
that.fire();
// kill child process
setTimeout(function () {
if (++this.currentNum === this.len) {
that.postPage.close();
phantom.exit();
}
}, 500);
});
} else {
this.posting = false;
}
}
};
postMan.init(snapshot);