// sitespeed.io/lib/plugins/crawler/index.js
'use strict';
const Promise = require('bluebird');
const path = require('path');
const merge = require('lodash.merge');
const log = require('intel').getLogger('sitespeedio.plugin.crawler');
const messageMaker = require('../../support/messageMaker');
const Crawler = require('simplecrawler');
const make = messageMaker('crawler').make;
// Default crawler settings; user-supplied options (options.crawler) are
// merged on top of these in open().
const defaultOptions = {
// How many link levels deep to crawl from the start URL (1 = start page only).
depth: 3
};
module.exports = {
open(context, options) {
this.options = merge({}, defaultOptions, options.crawler);
},
processMessage(message, queue) {
if (message.type === 'url' && message.source !== 'crawler') {
const maxPages = this.options.maxPages || Number.MAX_SAFE_INTEGER;
if (this.options.depth <= 1 || maxPages === 1) {
return Promise.resolve();
}
return new Promise(resolve => {
const redirectedUrls = new Set(),
crawler = new Crawler(message.url);
let pageCount = 1; // First page is start url
crawler.maxDepth = this.options.depth;
crawler.downloadUnsupported = false;
crawler.allowInitialDomainChange = true;
crawler.parseHTMLComments = false;
crawler.addFetchCondition(function(parsedURL) {
const extension = path.extname(parsedURL.path);
// Don't try to download these, based on file name.
return ['png', 'jpg', 'gif', 'pdf'].indexOf(extension) === -1;
});
crawler.on('fetchredirect', (queueItem, parsedURL, response) => {
redirectedUrls.add(response.headers.location);
});
crawler.on('fetchcomplete', queueItem => {
const pageMimeType = /^(text|application)\/x?html/i;
const url = queueItem.url;
if (redirectedUrls.has(url)) {
log.verbose('Crawler skipping redirected URL %s', url);
} else if (message.url === url) {
log.verbose('Crawler skipping initial URL %s', url);
} else if (pageMimeType.test(queueItem.stateData.contentType)) {
log.verbose('Crawler found %s URL %s', pageCount, url);
queue.postMessage(make('url', {}, { url, group: message.group }));
pageCount++;
if (pageCount >= maxPages) {
log.info('Crawler stopped after %d urls', pageCount);
crawler.stop();
return resolve();
}
} else {
log.verbose('Crawler found non html URL %s', url);
}
});
crawler.on('complete', resolve);
log.info(
'Starting to crawl from ' +
message.url +
' with max depth ' +
crawler.maxDepth +
' and max count ' +
maxPages
);
crawler.start();
});
}
}
};