123 lines
3.3 KiB
JavaScript
123 lines
3.3 KiB
JavaScript
/**
 * Sitespeed.io - How speedy is your site? (https://www.sitespeed.io)
 * Copyright (c) 2014, Peter Hedenskog, Tobias Lidskog
 * and other contributors
 * Released under the Apache 2.0 License
 */
|
|
'use strict';
|
|
|
|
var spawn = require('cross-spawn-async'),
|
|
path = require('path'),
|
|
urlParser = require('url'),
|
|
winston = require('winston'),
|
|
EOL = require('os').EOL,
|
|
fs = require('fs'),
|
|
async = require('async');
|
|
|
|
/**
 * Pick the port for the crawler's basic auth setting.
 *
 * An explicit port in the URL always wins; otherwise the protocol default
 * is used (443 for https, 80 for everything else).
 *
 * @param {Object} parsedUrl - result of url.parse().
 * @returns {string|number} the port to authenticate against.
 */
function getAuthPort(parsedUrl) {
  if (parsedUrl.port) {
    return parsedUrl.port;
  }
  return parsedUrl.protocol === 'https:' ? 443 : 80;
}

/**
 * Split text into its non-empty lines, accepting both LF and CRLF line
 * endings, with or without a newline after the last line.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitLines(text) {
  return text.split(/\r?\n/).filter(function(line) {
    return line !== '';
  });
}

/**
 * Parse the content of the crawler's error file.
 *
 * Every line is split at its FIRST comma: the part after the comma becomes
 * the key and the part before it the value (presumably "reason,url" -
 * verify against the crawler's output format). Splitting only once keeps
 * commas inside the second field intact. Lines without a comma are skipped.
 *
 * @param {string} text
 * @returns {Object} map from the second field to the first field.
 */
function parseErrorUrls(text) {
  var errorUrls = {};

  splitLines(text).forEach(function(line) {
    var separator = line.indexOf(',');
    if (separator === -1) {
      return;
    }
    errorUrls[line.slice(separator + 1)] = line.slice(0, separator);
  });

  return errorUrls;
}

/**
 * Read a file that is allowed to be missing.
 *
 * @param {string} file - path of the file to read.
 * @param {Object} log - logger used to report unexpected read errors.
 * @param {Function} cb - called as cb(null, content); content is '' when
 *   the file could not be read.
 */
function readOptionalFile(file, log, cb) {
  fs.readFile(file, function(err, data) {
    if (err) {
      // A missing file is expected (nothing was written to it); anything
      // else is worth reporting, but still must not abort the run.
      if (err.code !== 'ENOENT') {
        log.error('Could not read %s: %s', file, err.message);
      }
      return cb(null, '');
    }
    return cb(null, data.toString());
  });
}

/**
 * Crawl a site by running the bundled Java crawler and collect the URLs it
 * wrote to urls.txt and errorurls.txt in the result directory.
 *
 * @param {string} url - only used to derive host and port for basic auth;
 *   the crawl itself starts at config.url.
 * @param {Object} config - the run configuration. Fields read here: memory,
 *   run.absResultDir, basicAuth, proxy, requestHeaders, url, deep,
 *   userAgent, containInPath and skip.
 * @param {Function} callback - called exactly once as
 *   callback(okUrls, errorUrls), where okUrls is an array of lines from
 *   urls.txt and errorUrls an object built from errorurls.txt. Both are
 *   empty if the crawler could not be started.
 */
module.exports.crawl = function(url, config, callback) {
  var log = winston.loggers.get('sitespeed.io');
  var urlFile = path.join(config.run.absResultDir, 'urls.txt');
  var errorUrlFile = path.join(config.run.absResultDir, 'errorurls.txt');
  var parsedUrl;
  var finished = false;
  var args = [
    '-Xmx' + config.memory + 'm',
    '-Xms' + config.memory + 'm'
  ];

  // Both 'error' and 'close' can fire for the same child process; make sure
  // the caller is only called back once.
  function finish(okUrls, errorUrls) {
    if (finished) {
      return;
    }
    finished = true;
    return callback(okUrls, errorUrls);
  }

  if (config.basicAuth) {
    parsedUrl = urlParser.parse(url);
    args.push('-Dcom.soulgalore.crawler.auth=' + parsedUrl.hostname + ':' +
      getAuthPort(parsedUrl) + ':' + config.basicAuth);
  }

  if (config.proxy) {
    parsedUrl = urlParser.parse(config.proxy);
    // protocol keeps its trailing colon, so this yields e.g. "http:host:port"
    args.push('-Dcom.soulgalore.crawler.proxy=' + parsedUrl.protocol + parsedUrl.host);
  }

  // add extra request headers, each one terminated by '@'
  var requestHeaders = '';
  if (config.requestHeaders) {
    Object.keys(config.requestHeaders).forEach(function(key) {
      requestHeaders += key + ':' + config.requestHeaders[key] + '@';
    });
  }

  args.push('-cp',
    path.join(__dirname, 'crawler-1.5.14-full.jar'),
    'com.soulgalore.crawler.run.CrawlToFile',
    '-u',
    config.url,
    '-l',
    // process arguments are strings; the depth is typically a number
    String(config.deep),
    '-f',
    urlFile,
    '-ef',
    errorUrlFile,
    '-rh',
    requestHeaders + 'User-Agent:' + config.userAgent);

  if (config.containInPath) {
    args.push('-p', config.containInPath);
  }

  if (config.skip) {
    args.push('-np', config.skip);
  }

  var crawl = spawn('java', args);

  // Without this listener a failed spawn (for example no java on the PATH)
  // would surface as an unhandled 'error' event and the callback would
  // never be called.
  crawl.on('error', function(err) {
    log.error('Could not run the crawl: %s', err.message);
    finish([], {});
  });

  crawl.stdout.on('data', function(data) {
    log.info('Output from the crawl: %s', data.toString());
  });

  crawl.stderr.on('data', function(data) {
    var s = data.toString();
    // JAVA_TOOL_OPTIONS is not an error, but still written to stderr.
    if (s.indexOf('Picked up JAVA_TOOL_OPTIONS:') === 0) {
      return;
    }
    log.error('Error from the crawl: %s', s);
  });

  crawl.on('close', function() {
    // The exit code is ignored: the crawler always returns code ok today.
    async.parallel([
      function(cb) {
        // Url file might be non-existing, in case of no successful urls
        readOptionalFile(urlFile, log, cb);
      },
      function(cb) {
        // Error file might be non-existing, in case of no errors
        readOptionalFile(errorUrlFile, log, cb);
      }
    ],
    function(err, contents) {
      // readOptionalFile never reports an error, so both entries are strings
      return finish(splitLines(contents[0]), parseErrorUrls(contents[1]));
    });
  });
};
|