利用NodeJS和PhantomJS抓取網站頁面信息以及網站截圖
安裝PhantomJS
首先,去PhantomJS官網下載對應平臺的版本,或者下載源代碼自行編譯。然后將PhantomJS配置進環境變量,輸入
$ phantomjs
如果有反應,那么就可以進行下一步了。
利用PhantomJS進行簡單截圖
這里我們設置了窗口大小為1024 * 800:
page.viewportSize = { width: 1024, height: 800 };
截取從(0, 0)為起點的1024 * 800大小的圖像:
禁止Javascript,允許圖片載入,并將userAgent改為"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0":
然后利用page.open打開頁面,最后截圖輸出到./snapshot/test.png中:
NodeJS與PhantomJS通訊
我們先來看看PhantomJS能做什么通訊。
命令行傳參例如:
phantomjs snapshot.js http://www.baidu.com
命令行傳參只能在PhantomJS開啟時進行傳參,在運行過程中就無能為力了。
標準輸出能從PhantomJS向NodeJS輸出數據,但卻沒法從NodeJS傳數據給PhantomJS。
不過測試中,標準輸出是這幾種方式傳輸最快的,在大量數據傳輸中應當考慮。
PhantomJS向NodeJS服務發出HTTP請求,然后NodeJS返回相應的數據。
這種方式很簡單,但是請求只能由PhantomJS發出。
值得注意的是PhantomJS 1.9.0支持Websocket了,不過可惜是hixie-76 Websocket,不過畢竟還是提供了一種NodeJS主動向PhantomJS通訊的方案了。
測試中,我們發現PhantomJS連上本地的Websocket服務居然需要1秒左右,暫時不考慮這種方法吧。
phantomjs-node成功將PhantomJS作為NodeJS的一個模塊來使用,但我們看看作者的原理解釋:
I will answer that question with a question. How do you communicate with a process that doesn't support shared memory, sockets, FIFOs, or standard input?
Well, there's one thing PhantomJS does support, and that's opening webpages. In fact, it's really good at opening web pages. So we communicate with PhantomJS by spinning up an instance of ExpressJS, opening Phantom in a subprocess, and pointing it at a special webpage that turns socket.io messages into alert() calls. Those alert() calls are picked up by Phantom and there you go!
The communication itself happens via James Halliday's fantastic dnode library, which fortunately works well enough when combined with browserify to run straight out of PhantomJS's pidgin Javascript environment.
實際上phantomjs-node使用的也是HTTP或者Websocket來進行通訊,不過其依賴龐大,我們只想做一個簡單的東西,暫時還是不考慮這個東東吧。
設計圖
讓我們開始吧
我們在第一版中選用HTTP進行實現。
首先利用cluster進行簡單的進程守護(index.js):
module.exports = (function () {
"use strict"
var cluster = require('cluster')
, fs = require('fs');
if(!fs.existsSync('./snapshot')) {
fs.mkdirSync('./snapshot');
}
if (cluster.isMaster) {
cluster.fork();
cluster.on('exit', function (worker) {
console.log('Worker' + worker.id + ' died :(');
process.nextTick(function () {
cluster.fork();
});
})
} else {
require('./extract.js');
}
})();
然后利用connect做我們的對外API(extract.js):
module.exports = (function () {
"use strict"
var connect = require('connect')
, fs = require('fs')
, spawn = require('child_process').spawn
, jobMan = require('./lib/jobMan.js')
, bridge = require('./lib/bridge.js')
, pkg = JSON.parse(fs.readFileSync('./package.json'));
var app = connect()
.use(connect.logger('dev'))
.use('/snapshot', connect.static(__dirname + '/snapshot', { maxAge: pkg.maxAge }))
.use(connect.bodyParser())
.use('/bridge', bridge)
.use('/api', function (req, res, next) {
if (req.method !== "POST" || !req.body.campaignId) return next();
if (!req.body.urls || !req.body.urls.length) return jobMan.watch(req.body.campaignId, req, res, next);
var campaignId = req.body.campaignId
, imagesPath = './snapshot/' + campaignId + '/'
, urls = []
, url
, imagePath;
function _deal(id, url, imagePath) {
// just push into urls list
urls.push({
id: id,
url: url,
imagePath: imagePath
});
}
for (var i = req.body.urls.length; i--;) {
url = req.body.urls[i];
imagePath = imagesPath + i + '.png';
_deal(i, url, imagePath);
}
jobMan.register(campaignId, urls, req, res, next);
var snapshot = spawn('phantomjs', ['snapshot.js', campaignId]);
snapshot.stdout.on('data', function (data) {
console.log('stdout: ' + data);
});
snapshot.stderr.on('data', function (data) {
console.log('stderr: ' + data);
});
snapshot.on('close', function (code) {
console.log('snapshot exited with code ' + code);
});
})
.use(connect.static(__dirname + '/html', { maxAge: pkg.maxAge }))
.listen(pkg.port, function () { console.log('listen: ' + 'http://localhost:' + pkg.port); });
})();
這里我們引用了兩個模塊bridge和jobMan。
其中bridge是HTTP通訊橋梁,jobMan是工作管理器。我們通過campaignId來對應一個job,然后將job和response委托給jobMan管理。然后啟動PhantomJS進行處理。
通訊橋梁負責接受或者返回job的相關信息,并交給jobMan(bridge.js):
module.exports = (function () {
"use strict"
var jobMan = require('./jobMan.js')
, fs = require('fs')
, pkg = JSON.parse(fs.readFileSync('./package.json'));
return function (req, res, next) {
if (req.headers.secret !== pkg.secret) return next();
// Snapshot APP can post url information
if (req.method === "POST") {
var body = JSON.parse(JSON.stringify(req.body));
jobMan.fire(body);
res.end('');
// Snapshot APP can get the urls should extract
} else {
var urls = jobMan.getUrls(req.url.match(/campaignId=([^&]*)(\s|&|$)/)[1]);
res.writeHead(200, {'Content-Type': 'application/json'});
res.statuCode = 200;
res.end(JSON.stringify({ urls: urls }));
}
};
})();
如果request method為POST,則我們認為PhantomJS正在給我們推送job的相關信息。而為GET時,則認為其要獲取job的信息。
jobMan負責管理job,并發送目前得到的job信息通過response返回給client(jobMan.js):
module.exports = (function () {
"use strict"
var fs = require('fs')
, fetch = require('./fetch.js')
, _jobs = {};
function _send(campaignId){
var job = _jobs[campaignId];
if (!job) return;
if (job.waiting) {
job.waiting = false;
clearTimeout(job.timeout);
var finished = (job.urlsNum === job.finishNum)
, data = {
campaignId: campaignId,
urls: job.urls,
finished: finished
};
job.urls = [];
var res = job.res;
if (finished) {
_jobs[campaignId] = null;
delete _jobs[campaignId]
}
res.writeHead(200, {'Content-Type': 'application/json'});
res.statuCode = 200;
res.end(JSON.stringify(data));
}
}
function register(campaignId, urls, req, res, next) {
_jobs[campaignId] = {
urlsNum: urls.length,
finishNum: 0,
urls: [],
cacheUrls: urls,
res: null,
waiting: false,
timeout: null
};
watch(campaignId, req, res, next);
}
function watch(campaignId, req, res, next) {
_jobs[campaignId].res = res;
// 20s timeout
_jobs[campaignId].timeout = setTimeout(function () {
_send(campaignId);
}, 20000);
}
function fire(opts) {
var campaignId = opts.campaignId
, job = _jobs[campaignId]
, fetchObj = fetch(opts.html);
if (job) {
if (+opts.status && fetchObj.title) {
job.urls.push({
id: opts.id,
url: opts.url,
image: opts.image,
title: fetchObj.title,
description: fetchObj.description,
status: +opts.status
});
} else {
job.urls.push({
id: opts.id,
url: opts.url,
status: +opts.status
});
}
if (!job.waiting) {
job.waiting = true;
setTimeout(function () {
_send(campaignId);
}, 500);
}
job.finishNum ++;
} else {
console.log('job can not found!');
}
}
function getUrls(campaignId) {
var job = _jobs[campaignId];
if (job) return job.cacheUrls;
}
return {
register: register,
watch: watch,
fire: fire,
getUrls: getUrls
};
})();
這里我們用到fetch對html進行抓取其title和description,fetch實現比較簡單(fetch.js):
module.exports = (function () {
"use strict"
return function (html) {
if (!html) return { title: false, description: false };
var title = html.match(/\<title\>(.*?)\<\/title\>/)
, meta = html.match(/\<meta\s(.*?)\/?\>/g)
, description;
if (meta) {
for (var i = meta.length; i--;) {
if(meta[i].indexOf('name="description"') > -1 || meta[i].indexOf('name="Description"') > -1){
description = meta[i].match(/content\=\"(.*?)\"/)[1];
}
}
}
(title && title[1] !== '') ? (title = title[1]) : (title = 'No Title');
description || (description = 'No Description');
return {
title: title,
description: description
};
};
})();
最后是PhantomJS運行的源代碼,其啟動后通過HTTP向bridge獲取job信息,然后每完成job的其中一個url就通過HTTP返回給bridge(snapshot.js):
// PhantomJS modules and start-up inputs shared by the rest of this script.
var webpage = require('webpage')
, args = require('system').args
, fs = require('fs')
// Second entry of system.args; the script is started as
// `phantomjs snapshot.js <campaignId>`, so this is the campaign id.
, campaignId = args[1]
// Parsed ./package.json; pkg.port and pkg.secret are read further down.
, pkg = JSON.parse(fs.read('./package.json'));
/**
 * Open one URL in a fresh page, render a 1024x800 screenshot and report the
 * outcome to the NodeJS bridge through postMan.
 *
 * The report is a form-encoded body with campaignId, url, id and status
 * (0 = load failed, 1 = rendered); rendered pages also carry the page source
 * (html) and the screenshot path (image).
 *
 * @param {number|string} id         identifier of the URL inside the campaign
 * @param {string}        url        address to load
 * @param {string}        imagePath  file the screenshot is rendered to
 */
function snapshot(id, url, imagePath) {
    var page = webpage.create();

    page.viewportSize = { width: 1024, height: 800 };
    // Capture only the top-left 1024x800 region of the page.
    page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
    page.settings = {
        javascriptEnabled: false,
        loadImages: true,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0'
    };

    page.open(url, function (status) {
        var failed = (status === 'fail')
          , fields = [
                'campaignId=', encodeURIComponent(campaignId),
                '&url=', encodeURIComponent(url),
                '&id=', encodeURIComponent(id)
            ];

        if (!failed) {
            page.render(imagePath);
            fields.push(
                '&html=', encodeURIComponent(page.content),
                '&image=', encodeURIComponent(imagePath)
            );
        }
        // The status value was missing before, so the server read every
        // result as status 0.
        fields.push('&status=', failed ? '0' : '1');

        // Failures go through the same queue as successes: the old direct
        // call referenced an undefined `postPage`, and queued posts are what
        // postMan counts to decide when to exit.
        postMan.post(fields.join(''));

        // release the memory
        page.close();
    });
}
/**
 * postMan - serialises all HTTP traffic to the NodeJS bridge over one page.
 *
 *   postPage    page used for every bridge request (sends the `secret` header)
 *   posting     true while the queue is being drained
 *   datas       queued form-encoded bodies waiting to be POSTed
 *   len         number of URLs received from the bridge
 *   currentNum  number of POSTs completed so far
 *
 * init(snapshot) fetches the URL list of the campaign and calls
 * snapshot(id, url, imagePath) for each entry; post(data) queues one result;
 * fire() sends the queue one request at a time and exits PhantomJS once
 * `len` results have been posted.
 */
var postMan = {
    postPage: null,
    posting: false,
    datas: [],
    len: 0,
    currentNum: 0,
    init: function (snapshot) {
        // `this` inside the page callbacks is not postMan, so keep a reference.
        var that = this
          , postPage = webpage.create();

        postPage.customHeaders = {
            'secret': pkg.secret
        };
        that.postPage = postPage;

        postPage.open('http://localhost:' + pkg.port + '/bridge?campaignId=' + campaignId, function () {
            var urls
              , url
              , i;

            try {
                urls = JSON.parse(postPage.plainText).urls || [];
            } catch (e) {
                console.log('can not read urls from bridge: ' + e);
                postPage.close();
                phantom.exit(1);
                return;
            }

            that.len = urls.length;
            if (!that.len) {
                // Nothing to render: no POST will ever complete, so exit now.
                postPage.close();
                phantom.exit();
                return;
            }
            for (i = that.len; i--;) {
                url = urls[i];
                snapshot(url.id, url.url, url.imagePath);
            }
        });
    },
    post: function (data) {
        this.datas.push(data);
        if (!this.posting) {
            this.posting = true;
            this.fire();
        }
    },
    fire: function () {
        var that = this
          , data;

        if (!that.datas.length) {
            that.posting = false;
            return;
        }

        data = that.datas.shift();
        that.postPage.open('http://localhost:' + pkg.port + '/bridge', 'POST', data, function () {
            that.fire();
            // kill child process once every result has been posted
            setTimeout(function () {
                // Was `this.currentNum` / `this.len`: `this` is not postMan
                // in a timer callback, so the exit condition never held.
                if (++that.currentNum === that.len) {
                    that.postPage.close();
                    phantom.exit();
                }
            }, 500);
        });
    }
};
postMan.init(snapshot);
相關文章
Javascript實現動態菜單添加的實例代碼
在注冊信息的時候,常常需要通過下拉菜單讓用戶選擇,而且希望用戶在第一個下拉框做的選擇,影響第二個下拉框的內容。有時候,如果第一個下拉框不作出選擇,第二個下拉框根本不會在頁面上顯示,為了給用戶呈現一個更清晰的頁面。2013-07-07 ES5 模擬 ES6 的 Symbol 實現私有成員功能示例
這篇文章主要介紹了ES5 模擬 ES6 的 Symbol 實現私有成員功能,結合實例形式分析了ES5 模擬 ES6 的 Symbol 實現私有成員功能相關原理、實現方法與操作注意事項,需要的朋友可以參考下 2020-05-05 javascript中for/in循環及使用技巧
如果您希望一遍又一遍地運行相同的代碼,并且每次的值都不同,那么使用循環是很方便的,本篇文章給大家介紹javascript中for/in循環及使用技巧,需要的朋友可以參考下 2015-09-09 javascript檢查瀏覽器是否已經啟用XX功能
本文給大家分享的是檢測瀏覽器是否支持cookie功能,檢查瀏覽器是否已經啟用Java支持功能以及獲取當前瀏覽器的信息,十分的實用,有需要的小伙伴可以參考下。2015-07-07 通過js獲取上傳的圖片信息(臨時保存路徑,名稱,大小)然后通過ajax傳遞給后端的方法
最近有朋友向我請教,使用js獲取上傳圖片的信息然后通過ajax傳遞給后端,怎么實現呢?通過上網搜索大量資料,下面小編把我的解決辦法整理,分享給大家,需要的朋友可以參考下 2015-10-10