Filter out non-html pages from crawler.

Tobias Lidskog 2016-05-10 21:53:51 +02:00
parent 4fab04e5fc
commit dad7546e95
3 changed files with 11 additions and 6 deletions


@@ -35,6 +35,7 @@ if (log.isEnabledFor(log.CRITICAL)) { // TODO change the threshold to VERBOSE be
 log.info('Versions OS: %s sitespeed.io: %s browsertime: %s coach: %s', os.platform() + ' ' + os.release(), packageInfo.version, packageInfo.dependencies.browsertime, packageInfo.dependencies.webcoach);
+// FIXME need to consider aliases, e.g. -d => --crawler.depth
 loader.parsePluginNames(parsed.raw)
   .then((pluginNames) => {
     if (allInArray(['browsertime', 'coach'], pluginNames)) {
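
For context: allInArray is a small helper in the sitespeed.io codebase, used here to check that both required plugins were loaded. A minimal sketch of its assumed semantics (true only when every element of the first array is present in the second); the actual implementation may differ:

// Hypothetical sketch of the allInArray helper; not the code from this commit.
function allInArray(sublist, list) {
  return sublist.every(function (item) {
    return list.indexOf(item) !== -1;
  });
}

allInArray(['browsertime', 'coach'], ['browsertime', 'coach', 'crawler']); // true
allInArray(['browsertime', 'coach'], ['coach']);                           // false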


@@ -51,12 +51,18 @@ module.exports = {
     });
     crawler.on('fetchcomplete', (queueItem) => {
+      const pageMimeType = /^(text|application)\/x?html/i;
       const url = queueItem.url;
       if (redirectedUrls.has(url)) {
         log.verbose('Crawler skipping redirected URL %s', url);
-      } else {
+      } else if (message.url === url) {
+        log.verbose('Crawler skipping initial URL %s', url);
+      } else if (pageMimeType.test(queueItem.stateData.contentType)) {
         log.verbose('Crawler found URL %s', url);
-        queue.postMessage(make('url', {}, { url }));
+        queue.postMessage(make('url', {}, {url}));
+      } else {
+        log.verbose('Crawler found non html URL %s', url);
       }
     });
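
The pageMimeType regex added above is what actually filters out non-HTML responses. A quick illustration of how it classifies some common Content-Type values (plain regex behavior, not code from the commit):

const pageMimeType = /^(text|application)\/x?html/i;

pageMimeType.test('text/html');                // true
pageMimeType.test('text/html; charset=utf-8'); // true, the regex is only anchored at the start
pageMimeType.test('application/xhtml+xml');    // true, matches the application/xhtml prefix
pageMimeType.test('application/json');         // false
pageMimeType.test('image/png');                // false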


@@ -74,10 +74,8 @@ module.exports.parseCommandLine = function parseCommandLine() {
     /*
      Crawler options
      */
-    .option('d', {
-      alias: 'crawler.depth',
-      default: 1,
-      describe: 'How deep to crawl. NOT YET IMPLEMENTED',
+    .option('crawler.depth', {
+      describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
       group: 'Crawler'
     })
     .option('m', {
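
Since the d shorthand and its alias are dropped, the option is now reachable only as --crawler.depth, which is what the FIXME in the first hunk is about. As a hedged illustration of standard yargs behavior (not code from this commit), dot notation in an option name parses into a nested object:

const yargs = require('yargs');

const argv = yargs
  .option('crawler.depth', {
    describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
    group: 'Crawler'
  })
  .parse(['--crawler.depth', '2']);

console.log(argv.crawler.depth); // 2 (yargs coerces numeric strings by default)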