Filter out non-HTML pages from the crawler.
This commit is contained in:
parent
4fab04e5fc
commit
dad7546e95
|
|
@ -35,6 +35,7 @@ if (log.isEnabledFor(log.CRITICAL)) { // TODO change the threshold to VERBOSE be
|
|||
|
||||
log.info('Versions OS: %s sitespeed.io: %s browsertime: %s coach: %s', os.platform() + ' ' + os.release(), packageInfo.version, packageInfo.dependencies.browsertime, packageInfo.dependencies.webcoach);
|
||||
|
||||
// FIXME need to consider aliases, e.g. -d => --crawler.depth
|
||||
loader.parsePluginNames(parsed.raw)
|
||||
.then((pluginNames) => {
|
||||
if (allInArray(['browsertime', 'coach'], pluginNames)) {
|
||||
|
|
|
|||
|
|
@ -51,12 +51,18 @@ module.exports = {
|
|||
});
|
||||
|
||||
crawler.on('fetchcomplete', (queueItem) => {
|
||||
const pageMimeType = /^(text|application)\/x?html/i;
|
||||
|
||||
const url = queueItem.url;
|
||||
if (redirectedUrls.has(url)) {
|
||||
log.verbose('Crawler skipping redirected URL %s', url);
|
||||
} else {
|
||||
} else if (message.url === url) {
|
||||
log.verbose('Crawler skipping initial URL %s', url);
|
||||
} else if (pageMimeType.test(queueItem.stateData.contentType)) {
|
||||
log.verbose('Crawler found URL %s', url);
|
||||
queue.postMessage(make('url', {}, { url }));
|
||||
queue.postMessage(make('url', {}, {url}));
|
||||
} else {
|
||||
log.verbose('Crawler found non html URL %s', url);
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -74,10 +74,8 @@ module.exports.parseCommandLine = function parseCommandLine() {
|
|||
/*
|
||||
Crawler options
|
||||
*/
|
||||
.option('d', {
|
||||
alias: 'crawler.depth',
|
||||
default: 1,
|
||||
describe: 'How deep to crawl. NOT YET IMPLEMENTED',
|
||||
.option('crawler.depth', {
|
||||
describe: 'How deep to crawl (1=only one page, 2=include links from first page, etc.)',
|
||||
group: 'Crawler'
|
||||
})
|
||||
.option('m', {
|
||||
|
|
|
|||
Loading…
Reference in New Issue