diff --git a/lib/cli/cli.js b/lib/cli/cli.js
index f70f83af7..bfe3b2d51 100644
--- a/lib/cli/cli.js
+++ b/lib/cli/cli.js
@@ -1007,6 +1007,10 @@ module.exports.parseCommandLine = function parseCommandLine() {
         describe:
           'Discard URLs not matching the provided regular expression (ex: "/some/path/", "://some\\.domain/"). Can be provided multiple times.',
         group: 'Crawler'
+      })
+      .option('crawler.ignoreRobotsTxt', {
+        describe: 'Ignore robots.txt rules of the crawled domain.',
+        group: 'Crawler'
       });
 
     // Grafana CLI options
diff --git a/lib/plugins/crawler/index.js b/lib/plugins/crawler/index.js
index 58c683323..02ad3ed02 100644
--- a/lib/plugins/crawler/index.js
+++ b/lib/plugins/crawler/index.js
@@ -38,6 +38,12 @@ module.exports = {
     crawler.downloadUnsupported = false;
     crawler.allowInitialDomainChange = true;
     crawler.parseHTMLComments = false;
+
+    if (this.options.ignoreRobotsTxt) {
+      log.info('Crawler: Ignoring robots.txt');
+      crawler.respectRobotsTxt = false;
+    }
+
    crawler.addFetchCondition(queueItem => {
      const extension = path.extname(queueItem.path);
      // Don't try to download these, based on file name.
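For context on what the new flag toggles: the properties used above (downloadUnsupported, respectRobotsTxt, addFetchCondition) match the API of the simplecrawler npm package, so this sketch assumes that library is what the plugin wraps. The URL and depth below are illustrative placeholders, not values from the patch.

// Minimal standalone sketch (assumed: the npm "simplecrawler" package,
// whose API the properties in the patch match).
const Crawler = require('simplecrawler');

const crawler = new Crawler('https://www.example.com/');
crawler.maxDepth = 2;

// The behavior --crawler.ignoreRobotsTxt switches off: simplecrawler
// fetches and honors robots.txt by default (respectRobotsTxt defaults
// to true); setting it to false skips that check entirely.
crawler.respectRobotsTxt = false;

crawler.on('fetchcomplete', queueItem => {
  console.log('fetched', queueItem.url);
});

crawler.start();

On the command line, the option would presumably be passed in yargs dot-notation, consistent with the existing crawler.* options: sitespeed.io https://www.example.com --crawler.ignoreRobotsTxt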