Add option for crawler to ignore robots.txt (#3454)
* Add option for crawler to ignore robots.txt

For example, we have an internal test site (a sort of showcase of all our modules) that has a noFollow rule on all its pages. With that rule in place, the crawler refuses to discover any pages. However, the crawler itself has an option to ignore robots.txt, and this is basically my attempt at passing that option through. I currently have this running as a patched version on our site.
This commit is contained in:
parent 433f3f7d35
commit 094f9fda56
@@ -1007,6 +1007,10 @@ module.exports.parseCommandLine = function parseCommandLine() {
      describe:
        'Discard URLs not matching the provided regular expression (ex: "/some/path/", "://some\\.domain/"). Can be provided multiple times.',
      group: 'Crawler'
    })
    .option('crawler.ignoreRobotsTxt', {
      describe: 'Ignore robots.txt rules of the crawled domain.',
      group: 'Crawler'
    });

    // Grafana CLI options
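The `.option(name, { describe, group })` shape in this hunk matches the yargs CLI parser. Assuming that is what `parseCommandLine` uses (the diff itself does not name the library), here is a minimal sketch of how the new dotted flag would be parsed; only the option name and description come from the patch, everything else is illustrative.

// Minimal sketch, assuming the CLI above is built with yargs.
const yargs = require('yargs/yargs');

const argv = yargs(['--crawler.ignoreRobotsTxt'])
  .option('crawler.ignoreRobotsTxt', {
    describe: 'Ignore robots.txt rules of the crawled domain.',
    group: 'Crawler'
  })
  .parse();

// yargs expands dotted option keys into nested objects, so the flag ends up
// under a `crawler` sub-object rather than as a flat key.
console.log(argv.crawler.ignoreRobotsTxt); // true

Presumably that nested `crawler` object is what the plugin in the next hunk receives as `this.options`, which is why the check there reads `this.options.ignoreRobotsTxt` rather than the full dotted name.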
@@ -38,6 +38,12 @@ module.exports = {
    crawler.downloadUnsupported = false;
    crawler.allowInitialDomainChange = true;
    crawler.parseHTMLComments = false;

    if (this.options.ignoreRobotsTxt) {
      log.info('Crawler: Ignoring robots.txt');
      crawler.respectRobotsTxt = false;
    }

    crawler.addFetchCondition(queueItem => {
      const extension = path.extname(queueItem.path);
      // Don't try to download these, based on file name.
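The properties configured on `crawler` in this hunk (`downloadUnsupported`, `respectRobotsTxt`, `addFetchCondition`) match the simplecrawler API. Assuming that library is what backs the plugin, the following self-contained sketch shows the same behaviour in isolation; the log call and the extension skip list are stand-ins for the truncated code above.

// Standalone sketch, assuming the crawler object above is a simplecrawler instance.
const Crawler = require('simplecrawler');
const path = require('path');

const crawler = new Crawler('https://www.example.com/');
const ignoreRobotsTxt = true; // stands in for this.options.ignoreRobotsTxt

// Same logic as the added block: only relax robots.txt handling when asked to.
if (ignoreRobotsTxt) {
  console.log('Crawler: Ignoring robots.txt');
  crawler.respectRobotsTxt = false;
}

// Same kind of fetch condition as in the hunk: filter queue items by extension.
crawler.addFetchCondition(queueItem => {
  const extension = path.extname(queueItem.path);
  // Hypothetical skip list; the real list is cut off in the diff above.
  return !['.png', '.jpg', '.gif', '.css'].includes(extension);
});

crawler.on('fetchcomplete', queueItem => {
  console.log('Fetched', queueItem.url);
});

crawler.start();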