sitespeed.io/lib/crawler/crawler.js

123 lines
3.3 KiB
JavaScript

/**
* Sitespeed.io - How speedy is your site? (https://www.sitespeed.io)
* Copyright (c) 2014, Peter Hedenskog, Tobias Lidskog
* and other contributors
* Released under the Apache 2.0 License
*/
'use strict';
var spawn = require('cross-spawn-async'),
path = require('path'),
urlParser = require('url'),
winston = require('winston'),
EOL = require('os').EOL,
fs = require('fs'),
async = require('async');
/**
 * Split the raw contents of a crawler result file into its non-empty lines.
 * Both LF and CRLF line endings are accepted and a missing trailing line
 * break does not cause the last entry to be lost.
 *
 * @param {Buffer|string} data The file contents.
 * @returns {string[]} The non-empty lines, in file order.
 */
function splitLines(data) {
  return data.toString().split(/\r?\n/).filter(function(line) {
    return line.length > 0;
  });
}

/**
 * Turn the contents of the crawler's error file into a map keyed on the
 * second field of each line, with the first field as value.
 *
 * @param {Buffer|string} data The error file contents.
 * @param {Object} log Logger used to report lines that cannot be parsed.
 * @returns {Object} Map from the second field (the URL) to the first field.
 */
function parseErrorUrls(data, log) {
  var errorUrls = {};
  splitLines(data).forEach(function(line) {
    // Split on the first comma only, so a URL that itself contains commas is
    // kept intact. Assumes the first field never contains a comma - TODO
    // confirm against the crawler's output format.
    var separator = line.indexOf(',');
    if (separator === -1) {
      log.error('Skipping unparsable line in the crawler error file: %s', line);
      return;
    }
    errorUrls[line.substring(separator + 1)] = line.substring(0, separator);
  });
  return errorUrls;
}

/**
 * Run the bundled Java crawler as a child process and report which URLs it
 * wrote to its result files.
 *
 * The crawler writes successful URLs to 'urls.txt' and failed ones to
 * 'errorurls.txt' inside config.run.absResultDir; both files are read once
 * the child process has finished.
 *
 * @param {string} url URL used to derive the host and port for basic auth.
 * @param {Object} config Run configuration. Fields read here: memory,
 *   run.absResultDir, basicAuth, proxy, requestHeaders, url, deep, userAgent,
 *   containInPath and skip.
 * @param {function(string[], Object)} callback Called exactly once with the
 *   array of successfully crawled URLs and a map of failed URL to the value
 *   the crawler reported for it. Both are empty if the files are missing.
 */
module.exports.crawl = function(url, config, callback) {
  var pUrl;
  var args = [
    '-Xmx' + config.memory + 'm',
    '-Xms' + config.memory + 'm'
  ];
  var log = winston.loggers.get('sitespeed.io');
  var urlFile = path.join(config.run.absResultDir, 'urls.txt');
  var errorUrlFile = path.join(config.run.absResultDir, 'errorurls.txt');

  if (config.basicAuth) {
    pUrl = urlParser.parse(url);
    // Only fall back to the protocol's default port when the URL does not
    // carry an explicit one (https://host:8443 must stay on 8443).
    var defaultPort = pUrl.protocol === 'https:' ? 443 : 80;
    var port = pUrl.port || defaultPort;
    args.push('-Dcom.soulgalore.crawler.auth=' + pUrl.hostname + ':' + port + ':' + config.basicAuth);
  }

  if (config.proxy) {
    pUrl = urlParser.parse(config.proxy);
    // protocol already ends with ':' so the value becomes protocol:host:port.
    args.push('-Dcom.soulgalore.crawler.proxy=' + pUrl.protocol + pUrl.host);
  }

  // Extra request headers are passed as one string: key:value pairs, each
  // followed by '@', with the User-Agent header always added last.
  var requestHeaders = '';
  if (config.requestHeaders) {
    Object.keys(config.requestHeaders).forEach(function(key) {
      requestHeaders += key + ':' + config.requestHeaders[key] + '@';
    });
  }

  // NOTE(review): the crawl starts from config.url while the url parameter is
  // only used for basic auth above - confirm these are always the same.
  args.push('-cp',
    path.join(__dirname, 'crawler-1.5.14-full.jar'),
    'com.soulgalore.crawler.run.CrawlToFile',
    '-u',
    config.url,
    '-l',
    config.deep,
    '-f',
    urlFile,
    '-ef',
    errorUrlFile,
    '-rh',
    requestHeaders + 'User-Agent:' + config.userAgent);

  if (config.containInPath) {
    args.push('-p', config.containInPath);
  }
  if (config.skip) {
    args.push('-np', config.skip);
  }

  var crawl = spawn('java', args);
  var resultsCollected = false;

  // Read both result files and hand the outcome to the caller. Guarded so the
  // callback runs once even if both 'error' and 'close' are emitted.
  function collectResults() {
    if (resultsCollected) {
      return;
    }
    resultsCollected = true;

    var okUrls = [];
    var errorUrls = {};
    async.parallel([
      function(cb) {
        // Url file might be non-existing, in case of no successful urls
        fs.readFile(urlFile, function(err, data) {
          if (!err) {
            okUrls = splitLines(data);
          }
          return cb();
        });
      },
      function(cb) {
        // Error file might be non-existing, in case of no errors
        fs.readFile(errorUrlFile, function(err, data) {
          if (!err) {
            errorUrls = parseErrorUrls(data, log);
          }
          return cb();
        });
      }
    ],
    function() {
      return callback(okUrls, errorUrls);
    });
  }

  crawl.stdout.on('data', function(data) {
    log.info('Output from the crawl: %s', data.toString());
  });

  crawl.stderr.on('data', function(data) {
    var s = data.toString();
    // JAVA_TOOL_OPTIONS is not an error, but still written to stderr.
    if (s.indexOf('Picked up JAVA_TOOL_OPTIONS:') === 0) {
      return;
    }
    log.error('Error from the crawl: %s', s);
  });

  // Without a listener a failed spawn (e.g. java missing from the PATH) would
  // surface as an unhandled 'error' event and take the whole process down.
  crawl.on('error', function(err) {
    log.error('Could not run the crawler: %s', err.message);
    collectResults();
  });

  // The exit code is not inspected; the existing note in this file says the
  // crawler always returns an ok code today.
  crawl.on('close', function() {
    collectResults();
  });
};