Filter out non-html pages from crawler.

Tobias Lidskog 2016-05-10 21:53:51 +02:00
parent 4fab04e5fc
commit dad7546e95
3 changed files with 11 additions and 6 deletions


@@ -35,6 +35,7 @@ if (log.isEnabledFor(log.CRITICAL)) { // TODO change the threshold to VERBOSE be
 log.info('Versions OS: %s sitespeed.io: %s browsertime: %s coach: %s', os.platform() + ' ' + os.release(), packageInfo.version, packageInfo.dependencies.browsertime, packageInfo.dependencies.webcoach);
+// FIXME need to consider aliases, e.g. -d => --crawler.depth
 loader.parsePluginNames(parsed.raw)
   .then((pluginNames) => {
     if (allInArray(['browsertime', 'coach'], pluginNames)) {
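
For context: allInArray is a small helper in the sitespeed.io codebase, used here to check that both required plugins were loaded. A minimal sketch of its assumed semantics (true only when every element of the first array is present in the second); the actual implementation may differ:

// Hypothetical sketch of the allInArray helper; not the code from this commit.
function allInArray(sublist, list) {
  return sublist.every(function (item) {
    return list.indexOf(item) !== -1;
  });
}

allInArray(['browsertime', 'coach'], ['browsertime', 'coach', 'crawler']); // true
allInArray(['browsertime', 'coach'], ['coach']);                           // false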


@@ -51,12 +51,18 @@ module.exports = {
     });
     crawler.on('fetchcomplete', (queueItem) => {
+      const pageMimeType = /^(text|application)\/x?html/i;
       const url = queueItem.url;
       if (redirectedUrls.has(url)) {
         log.verbose('Crawler skipping redirected URL %s', url);
-      } else {
+      } else if (message.url === url) {
+        log.verbose('Crawler skipping initial URL %s', url);
+      } else if (pageMimeType.test(queueItem.stateData.contentType)) {
         log.verbose('Crawler found URL %s', url);
-        queue.postMessage(make('url', {}, { url }));
+        queue.postMessage(make('url', {}, {url}));
+      } else {
+        log.verbose('Crawler found non html URL %s', url);
       }
     });
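
The pageMimeType regex added above is what actually filters out non-HTML responses. A quick illustration of how it classifies some common Content-Type values (plain regex behavior, not code from the commit):

const pageMimeType = /^(text|application)\/x?html/i;

pageMimeType.test('text/html');                // true
pageMimeType.test('text/html; charset=utf-8'); // true, the regex is only anchored at the start
pageMimeType.test('application/xhtml+xml');    // true, matches the application/xhtml prefix
pageMimeType.test('application/json');         // false
pageMimeType.test('image/png');                // false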


@@ -74,10 +74,8 @@ module.exports.parseCommandLine = function parseCommandLine() {
     /*
      Crawler options
      */
-    .option('d', {
-      alias: 'crawler.depth',
-      default: 1,
-      describe: 'How deep to crawl. NOT YET IMPLEMENTED',
+    .option('crawler.depth', {
+      describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
       group: 'Crawler'
     })
     .option('m', {
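
Since the d shorthand and its alias are dropped, the option is now reachable only as --crawler.depth, which is what the FIXME in the first hunk is about. As a hedged illustration of standard yargs behavior (not code from this commit), dot notation in an option name parses into a nested object:

const yargs = require('yargs');

const argv = yargs
  .option('crawler.depth', {
    describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
    group: 'Crawler'
  })
  .parse(['--crawler.depth', '2']);

console.log(argv.crawler.depth); // 2 (yargs coerces numeric strings by default)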