Add option for crawler to ignore robots.txt (#3454)

* Add option for crawler to ignore robots.txt

For example, we have an internal test site (a sort of showcase of all our modules) whose pages all carry a noFollow rule, so the crawler refuses to discover any pages. The underlying crawler already has an option to ignore robots.txt; this change simply passes that option through. We are currently running this as a patched version on our site.
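
With the option in place, a crawl of such a site can be started with the new flag, roughly like this (assuming the usual sitespeed.io CLI invocation; the URL and depth are placeholders, only --crawler.ignoreRobotsTxt is added by this change):

    sitespeed.io https://internal.example.test/ --crawler.depth 2 --crawler.ignoreRobotsTxt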
dammg 2021-09-03 21:16:30 +02:00 committed by GitHub
parent 433f3f7d35
commit 094f9fda56
2 changed files with 10 additions and 0 deletions


@@ -1007,6 +1007,10 @@ module.exports.parseCommandLine = function parseCommandLine() {
      describe:
        'Discard URLs not matching the provided regular expression (ex: "/some/path/", "://some\\.domain/"). Can be provided multiple times.',
      group: 'Crawler'
    })
    .option('crawler.ignoreRobotsTxt', {
      describe: 'Ignore robots.txt rules of the crawled domain.',
      group: 'Crawler'
    });
    // Grafana CLI options

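For context on the hunk above: the CLI options are declared with yargs, which expands a dotted option name into a nested object, so the flag ends up under the crawler key of the parsed options that the plugin later reads. A minimal standalone sketch of that behaviour (not the project's actual wiring; the 'yargs/yargs' require path and the boolean type are assumptions):

    // Sketch only: shows yargs dot-notation expanding the flag into a nested object.
    const yargs = require('yargs/yargs');

    const argv = yargs(['--crawler.ignoreRobotsTxt'])
      .option('crawler.ignoreRobotsTxt', {
        describe: 'Ignore robots.txt rules of the crawled domain.',
        type: 'boolean',
        group: 'Crawler'
      })
      .parse();

    console.log(argv.crawler.ignoreRobotsTxt); // true
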

@@ -38,6 +38,12 @@ module.exports = {
      crawler.downloadUnsupported = false;
      crawler.allowInitialDomainChange = true;
      crawler.parseHTMLComments = false;

      if (this.options.ignoreRobotsTxt) {
        log.info('Crawler: Ignoring robots.txt');
        crawler.respectRobotsTxt = false;
      }

      crawler.addFetchCondition(queueItem => {
        const extension = path.extname(queueItem.path);
        // Don't try to download these, based on file name.
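
The respectRobotsTxt switch set above belongs to the underlying crawler library (judging by the API used here, simplecrawler); the plugin only flips it when the new option is set. A standalone sketch of the same effect, with the URL and depth as placeholder values:

    // Sketch only, assuming simplecrawler as the underlying library.
    const Crawler = require('simplecrawler');

    const crawler = new Crawler('https://internal.example.test/');
    crawler.maxDepth = 2;
    crawler.respectRobotsTxt = false; // same switch the plugin flips above

    crawler.on('fetchcomplete', queueItem => {
      console.log('Discovered', queueItem.url);
    });

    crawler.start();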