From 094f9fda56afd01605f25c306e4ce2b253a48df4 Mon Sep 17 00:00:00 2001
From: dammg
Date: Fri, 3 Sep 2021 21:16:30 +0200
Subject: [PATCH] Add option for crawler to ignore robots.txt (#3454)

* Add option for crawler to ignore robots.txt

For example, we have an internal test site (a sort of showcase of all our
modules) that has a noFollow rule on all its pages, so the crawler refuses
to discover any pages there. However, the underlying crawler has an option
to ignore robots.txt, and this patch passes that option through. I currently
have this running as a patched version on our site.
---
 lib/cli/cli.js               | 4 ++++
 lib/plugins/crawler/index.js | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/lib/cli/cli.js b/lib/cli/cli.js
index f70f83af7..bfe3b2d51 100644
--- a/lib/cli/cli.js
+++ b/lib/cli/cli.js
@@ -1007,6 +1007,10 @@ module.exports.parseCommandLine = function parseCommandLine() {
         describe:
           'Discard URLs not matching the provided regular expression (ex: "/some/path/", "://some\\.domain/"). Can be provided multiple times.',
         group: 'Crawler'
+      })
+      .option('crawler.ignoreRobotsTxt', {
+        describe: 'Ignore robots.txt rules of the crawled domain.',
+        group: 'Crawler'
       });
 
     // Grafana CLI options
diff --git a/lib/plugins/crawler/index.js b/lib/plugins/crawler/index.js
index 58c683323..02ad3ed02 100644
--- a/lib/plugins/crawler/index.js
+++ b/lib/plugins/crawler/index.js
@@ -38,6 +38,12 @@ module.exports = {
     crawler.downloadUnsupported = false;
     crawler.allowInitialDomainChange = true;
     crawler.parseHTMLComments = false;
+
+    if (this.options.ignoreRobotsTxt) {
+      log.info('Crawler: Ignoring robots.txt');
+      crawler.respectRobotsTxt = false;
+    }
+
     crawler.addFetchCondition(queueItem => {
       const extension = path.extname(queueItem.path);
       // Don't try to download these, based on file name.
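With the patch applied, the behaviour can be switched on from the command line
via --crawler.ignoreRobotsTxt (the yargs option defined above). To try the same
effect outside the plugin, here is a minimal standalone sketch; it assumes the
crawler configured in lib/plugins/crawler/index.js is simplecrawler (suggested
by the respectRobotsTxt and addFetchCondition calls), and the URL and depth
limit are illustrative values, not taken from the patch.

    // Standalone sketch, assuming the simplecrawler library (npm: simplecrawler).
    const Crawler = require('simplecrawler');

    const crawler = new Crawler('https://example.com/');
    crawler.maxDepth = 2;             // illustrative depth limit, not from the patch
    crawler.respectRobotsTxt = false; // the same switch the patch flips when
                                      // --crawler.ignoreRobotsTxt is set

    crawler.on('fetchcomplete', queueItem => {
      // Log every URL the crawler was able to discover and download.
      console.log('Discovered', queueItem.url);
    });

    crawler.start();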