Add option for crawler to ignore robots.txt (#3454)

* Add option for crawler to ignore robots.txt

For example, we have an internal test site (a sort of showcase of all our modules) whose pages all carry a noFollow rule, so the crawler refuses to discover any pages. The underlying crawler already has an option to ignore robots.txt; this change simply passes that option through. We are currently running this as a patched version on our site.
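
With the option in place, a crawl of such a site can be started with the new flag, roughly like this (assuming the usual sitespeed.io CLI invocation; the URL and depth are placeholders, only --crawler.ignoreRobotsTxt is added by this change):

    sitespeed.io https://internal.example.test/ --crawler.depth 2 --crawler.ignoreRobotsTxt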
dammg 2021-09-03 21:16:30 +02:00 committed by GitHub
parent 433f3f7d35
commit 094f9fda56
2 changed files with 10 additions and 0 deletions


@@ -1007,6 +1007,10 @@ module.exports.parseCommandLine = function parseCommandLine() {
      describe:
        'Discard URLs not matching the provided regular expression (ex: "/some/path/", "://some\\.domain/"). Can be provided multiple times.',
      group: 'Crawler'
    })
    .option('crawler.ignoreRobotsTxt', {
      describe: 'Ignore robots.txt rules of the crawled domain.',
      group: 'Crawler'
    });
    // Grafana CLI options

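For context on the hunk above: the CLI options are declared with yargs, which expands a dotted option name into a nested object, so the flag ends up under the crawler key of the parsed options that the plugin later reads. A minimal standalone sketch of that behaviour (not the project's actual wiring; the 'yargs/yargs' require path and the boolean type are assumptions):

    // Sketch only: shows yargs dot-notation expanding the flag into a nested object.
    const yargs = require('yargs/yargs');

    const argv = yargs(['--crawler.ignoreRobotsTxt'])
      .option('crawler.ignoreRobotsTxt', {
        describe: 'Ignore robots.txt rules of the crawled domain.',
        type: 'boolean',
        group: 'Crawler'
      })
      .parse();

    console.log(argv.crawler.ignoreRobotsTxt); // true
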

@@ -38,6 +38,12 @@ module.exports = {
      crawler.downloadUnsupported = false;
      crawler.allowInitialDomainChange = true;
      crawler.parseHTMLComments = false;

      if (this.options.ignoreRobotsTxt) {
        log.info('Crawler: Ignoring robots.txt');
        crawler.respectRobotsTxt = false;
      }

      crawler.addFetchCondition(queueItem => {
        const extension = path.extname(queueItem.path);
        // Don't try to download these, based on file name.
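
The respectRobotsTxt switch set above belongs to the underlying crawler library (judging by the API used here, simplecrawler); the plugin only flips it when the new option is set. A standalone sketch of the same effect, with the URL and depth as placeholder values:

    // Sketch only, assuming simplecrawler as the underlying library.
    const Crawler = require('simplecrawler');

    const crawler = new Crawler('https://internal.example.test/');
    crawler.maxDepth = 2;
    crawler.respectRobotsTxt = false; // same switch the plugin flips above

    crawler.on('fetchcomplete', queueItem => {
      console.log('Discovered', queueItem.url);
    });

    crawler.start();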