diff --git a/bin/sitespeed.js b/bin/sitespeed.js
index dd8c5587b..d4b3356ca 100755
--- a/bin/sitespeed.js
+++ b/bin/sitespeed.js
@@ -35,6 +35,7 @@ if (log.isEnabledFor(log.CRITICAL)) { // TODO change the threshold to VERBOSE be
 log.info('Versions OS: %s sitespeed.io: %s browsertime: %s coach: %s',
   os.platform() + ' ' + os.release(), packageInfo.version, packageInfo.dependencies.browsertime,
   packageInfo.dependencies.webcoach);
+// FIXME need to consider aliases, e.g. -d => --crawler.depth
 loader.parsePluginNames(parsed.raw)
   .then((pluginNames) => {
     if (allInArray(['browsertime', 'coach'], pluginNames)) {
diff --git a/lib/plugins/crawler/index.js b/lib/plugins/crawler/index.js
index 304e0c34d..853901098 100644
--- a/lib/plugins/crawler/index.js
+++ b/lib/plugins/crawler/index.js
@@ -51,12 +51,18 @@
         });
 
         crawler.on('fetchcomplete', (queueItem) => {
+          const pageMimeType = /^(text|application)\/x?html/i;
+
           const url = queueItem.url;
           if (redirectedUrls.has(url)) {
             log.verbose('Crawler skipping redirected URL %s', url);
-          } else {
+          } else if (message.url === url) {
+            log.verbose('Crawler skipping initial URL %s', url);
+          } else if (pageMimeType.test(queueItem.stateData.contentType)) {
             log.verbose('Crawler found URL %s', url);
-            queue.postMessage(make('url', {}, { url }));
+            queue.postMessage(make('url', {}, {url}));
+          } else {
+            log.verbose('Crawler found non html URL %s', url);
           }
         });
 
diff --git a/lib/support/cli.js b/lib/support/cli.js
index defd4cf2b..ca03dae1b 100644
--- a/lib/support/cli.js
+++ b/lib/support/cli.js
@@ -74,10 +74,8 @@ module.exports.parseCommandLine = function parseCommandLine() {
 
     /* Crawler options
      */
-    .option('d', {
-      alias: 'crawler.depth',
-      default: 1,
-      describe: 'How deep to crawl. NOT YET IMPLEMENTED',
+    .option('crawler.depth', {
+      describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
       group: 'Crawler'
     })
     .option('m', {