fix(scrape-manager): conflict between proxies and user_agent option

This commit is contained in:
HugoPoi 2020-01-17 12:07:12 +01:00
parent 4b33ef9b19
commit 89dc5c3ebb
2 changed files with 30 additions and 48 deletions

View File

@ -216,77 +216,59 @@ class ScrapeManager {
const chrome_flags = _.clone(this.config.chrome_flags); const chrome_flags = _.clone(this.config.chrome_flags);
if (this.config.random_user_agent) {
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
this.config.user_agent = userAgent.toString();
}
if (this.config.user_agent) {
chrome_flags.push(
`--user-agent=${this.config.user_agent}`
)
}
var launch_args = {
args: chrome_flags,
headless: this.config.headless,
ignoreHTTPSErrors: true,
};
debug('Using the following puppeteer configuration launch_args=%O', launch_args);
if (this.pluggable && this.pluggable.start_browser) { if (this.pluggable && this.pluggable.start_browser) {
launch_args.config = this.config; launch_args.config = this.config;
this.browser = await this.pluggable.start_browser(launch_args); this.browser = await this.pluggable.start_browser({
config: this.config,
});
this.page = await this.browser.newPage(); this.page = await this.browser.newPage();
} else { } else {
// if no custom start_browser functionality was given // if no custom start_browser functionality was given
// use puppeteer-cluster for scraping // use puppeteer-cluster for scraping
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; let proxies;
var perBrowserOptions = [];
// the first browser config uses the home IP (no proxy)
if (!this.config.use_proxies_only) {
perBrowserOptions.push(launch_args);
}
// if we have at least one proxy, always use CONCURRENCY_BROWSER // if we have at least one proxy, always use CONCURRENCY_BROWSER
// and set maxConcurrency to this.config.proxies.length + 1 // and set maxConcurrency to this.config.proxies.length + 1
// else use whatever configuration was passed // else use whatever configuration was passed
if (this.config.proxies && this.config.proxies.length > 0) { if (this.config.proxies && this.config.proxies.length > 0) {
this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER;
// because we use real browsers, we ran out of memory on normal laptops // because we use real browsers, we ran out of memory on normal laptops
// when using more than maybe 5 or 6 browsers. // when using more than maybe 5 or 6 browsers.
// therefore hardcode a limit here // therefore hardcode a limit here
// TODO not sure this is what we want
this.numClusters = Math.min( this.numClusters = Math.min(
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1), this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
MAX_ALLOWED_BROWSERS MAX_ALLOWED_BROWSERS
); );
proxies = _.clone(this.config.proxies);
this.logger.info(`Using ${this.numClusters} clusters.`); // Insert a first config without proxy if use_proxies_only is false
if (this.config.use_proxies_only === false) {
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters; proxies.unshift(null);
for (var proxy of this.config.proxies) {
perBrowserOptions.push({
headless: this.config.headless,
ignoreHTTPSErrors: true,
args: chrome_flags.concat(`--proxy-server=${proxy}`)
})
} }
} else {
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
proxies = _.times(this.numClusters, null);
} }
// Give the per browser options each a random user agent when random user agent is set this.logger.info(`Using ${this.numClusters} clusters.`);
while (perBrowserOptions.length < this.numClusters) {
const userAgent = new UserAgent(); // Give the per browser options
perBrowserOptions.push({ const perBrowserOptions = _.map(proxies, (proxy) => {
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
if (proxy) {
args = args.concat([`--proxy-server=${proxy}`]);
}
return {
headless: this.config.headless, headless: this.config.headless,
ignoreHTTPSErrors: true, ignoreHTTPSErrors: true,
args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`) args
}) };
} });
debug('perBrowserOptions=%O', perBrowserOptions) debug('perBrowserOptions=%O', perBrowserOptions)
@ -294,7 +276,7 @@ class ScrapeManager {
monitor: this.config.puppeteer_cluster_config.monitor, monitor: this.config.puppeteer_cluster_config.monitor,
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
concurrency: CustomConcurrencyImpl, concurrency: CustomConcurrencyImpl,
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency, maxConcurrency: this.numClusters,
puppeteerOptions: { puppeteerOptions: {
perBrowserOptions: perBrowserOptions perBrowserOptions: perBrowserOptions
} }

View File

@ -135,7 +135,7 @@ describe('Config', function(){
assert(uaParsed.os.name, 'UserAgent should have a os name detected'); assert(uaParsed.os.name, 'UserAgent should have a os name detected');
}); });
assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.3), 'Each user agent should appear less than 30% of the time' ); assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );
}); });