mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-21 01:57:55 +02:00
fix(scrape-manager): conflict between proxies and user_agent option
This commit is contained in:
parent
4b33ef9b19
commit
89dc5c3ebb
@ -216,77 +216,59 @@ class ScrapeManager {
|
|||||||
|
|
||||||
const chrome_flags = _.clone(this.config.chrome_flags);
|
const chrome_flags = _.clone(this.config.chrome_flags);
|
||||||
|
|
||||||
if (this.config.random_user_agent) {
|
|
||||||
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
|
|
||||||
this.config.user_agent = userAgent.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.config.user_agent) {
|
|
||||||
chrome_flags.push(
|
|
||||||
`--user-agent=${this.config.user_agent}`
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
var launch_args = {
|
|
||||||
args: chrome_flags,
|
|
||||||
headless: this.config.headless,
|
|
||||||
ignoreHTTPSErrors: true,
|
|
||||||
};
|
|
||||||
|
|
||||||
debug('Using the following puppeteer configuration launch_args=%O', launch_args);
|
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.start_browser) {
|
if (this.pluggable && this.pluggable.start_browser) {
|
||||||
launch_args.config = this.config;
|
launch_args.config = this.config;
|
||||||
this.browser = await this.pluggable.start_browser(launch_args);
|
this.browser = await this.pluggable.start_browser({
|
||||||
|
config: this.config,
|
||||||
|
});
|
||||||
this.page = await this.browser.newPage();
|
this.page = await this.browser.newPage();
|
||||||
} else {
|
} else {
|
||||||
// if no custom start_browser functionality was given
|
// if no custom start_browser functionality was given
|
||||||
// use puppeteer-cluster for scraping
|
// use puppeteer-cluster for scraping
|
||||||
|
|
||||||
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
|
let proxies;
|
||||||
var perBrowserOptions = [];
|
|
||||||
|
|
||||||
// the first browser uses the base config with the home IP (no proxy)
|
|
||||||
if (!this.config.use_proxies_only) {
|
|
||||||
perBrowserOptions.push(launch_args);
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we have at least one proxy, always use CONCURRENCY_BROWSER
|
// if we have at least one proxy, always use CONCURRENCY_BROWSER
|
||||||
// and set maxConcurrency to this.config.proxies.length + 1
|
// and set maxConcurrency to this.config.proxies.length + 1
|
||||||
// else use whatever configuration was passed
|
// else use whatever configuration was passed
|
||||||
if (this.config.proxies && this.config.proxies.length > 0) {
|
if (this.config.proxies && this.config.proxies.length > 0) {
|
||||||
this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER;
|
|
||||||
|
|
||||||
// because we use real browsers, we ran out of memory on normal laptops
|
// because we use real browsers, we ran out of memory on normal laptops
|
||||||
// when using more than maybe 5 or 6 browsers.
|
// when using more than maybe 5 or 6 browsers.
|
||||||
// therefore hardcode a limit here
|
// therefore hardcode a limit here
|
||||||
|
// TODO: not sure this is what we want
|
||||||
this.numClusters = Math.min(
|
this.numClusters = Math.min(
|
||||||
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
|
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
|
||||||
MAX_ALLOWED_BROWSERS
|
MAX_ALLOWED_BROWSERS
|
||||||
);
|
);
|
||||||
|
proxies = _.clone(this.config.proxies);
|
||||||
|
|
||||||
|
// Insert a first config without a proxy if use_proxies_only is false
|
||||||
|
if (this.config.use_proxies_only === false) {
|
||||||
|
proxies.unshift(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
|
||||||
|
proxies = _.times(this.numClusters, null);
|
||||||
|
}
|
||||||
|
|
||||||
this.logger.info(`Using ${this.numClusters} clusters.`);
|
this.logger.info(`Using ${this.numClusters} clusters.`);
|
||||||
|
|
||||||
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
|
// Give the per browser options
|
||||||
|
const perBrowserOptions = _.map(proxies, (proxy) => {
|
||||||
|
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
|
||||||
|
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
|
||||||
|
|
||||||
for (var proxy of this.config.proxies) {
|
if (proxy) {
|
||||||
perBrowserOptions.push({
|
args = args.concat([`--proxy-server=${proxy}`]);
|
||||||
headless: this.config.headless,
|
|
||||||
ignoreHTTPSErrors: true,
|
|
||||||
args: chrome_flags.concat(`--proxy-server=${proxy}`)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Give the per browser options each a random user agent when random user agent is set
|
return {
|
||||||
while (perBrowserOptions.length < this.numClusters) {
|
|
||||||
const userAgent = new UserAgent();
|
|
||||||
perBrowserOptions.push({
|
|
||||||
headless: this.config.headless,
|
headless: this.config.headless,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`)
|
args
|
||||||
})
|
};
|
||||||
}
|
});
|
||||||
|
|
||||||
debug('perBrowserOptions=%O', perBrowserOptions)
|
debug('perBrowserOptions=%O', perBrowserOptions)
|
||||||
|
|
||||||
@ -294,7 +276,7 @@ class ScrapeManager {
|
|||||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||||
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
||||||
concurrency: CustomConcurrencyImpl,
|
concurrency: CustomConcurrencyImpl,
|
||||||
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
|
maxConcurrency: this.numClusters,
|
||||||
puppeteerOptions: {
|
puppeteerOptions: {
|
||||||
perBrowserOptions: perBrowserOptions
|
perBrowserOptions: perBrowserOptions
|
||||||
}
|
}
|
||||||
|
@ -135,7 +135,7 @@ describe('Config', function(){
|
|||||||
assert(uaParsed.os.name, 'UserAgent should have a os name detected');
|
assert(uaParsed.os.name, 'UserAgent should have a os name detected');
|
||||||
});
|
});
|
||||||
|
|
||||||
assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.3), 'Each user agent should appear less than 30% of the time' );
|
assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user