fix(scrape-manager): propagate keywords through a cloned config so they are not reassigned

This commit is contained in:
HugoPoi 2020-01-17 15:12:00 +01:00
parent 89dc5c3ebb
commit 4f467abf1e
2 changed files with 9 additions and 17 deletions

View File

@ -50,7 +50,9 @@ module.exports = class Scraper {
} }
} }
async run({page, data}) { async run({page, data, worker}) {
debug('worker.id=%s', worker.id, this.config.keywords);
if (page) { if (page) {
this.page = page; this.page = page;

View File

@ -281,11 +281,6 @@ class ScrapeManager {
perBrowserOptions: perBrowserOptions perBrowserOptions: perBrowserOptions
} }
}); });
this.cluster.on('taskerror', (err, data) => {
this.logger.error(`Error while scraping ${data}: ${err.message}`);
debug('Error during cluster task', err);
});
} }
} }
@ -336,26 +331,21 @@ class ScrapeManager {
chunks[k % this.numClusters].push(this.config.keywords[k]); chunks[k % this.numClusters].push(this.config.keywords[k]);
} }
let execPromises = []; debug('chunks=%o', chunks);
let scraperInstances = [];
for (var c = 0; c < chunks.length; c++) {
this.config.keywords = chunks[c];
if (this.config.use_proxies_only) { let execPromises = [];
this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy for (var c = 0; c < chunks.length; c++) {
} else if(c > 0) { const config = _.clone(this.config);
this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address config.keywords = chunks[c];
}
var obj = getScraper(this.config.search_engine, { var obj = getScraper(this.config.search_engine, {
config: this.config, config: config,
context: {}, context: {},
pluggable: this.pluggable, pluggable: this.pluggable,
}); });
var boundMethod = obj.run.bind(obj); var boundMethod = obj.run.bind(obj);
execPromises.push(this.cluster.execute({}, boundMethod)); execPromises.push(this.cluster.execute({}, boundMethod));
scraperInstances.push(obj);
} }
let promiseReturns = await Promise.all(execPromises); let promiseReturns = await Promise.all(execPromises);