From 80d23a9d57ecf1df72a7ca4bdc2baf578354773d Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Mon, 17 Jun 2019 21:25:45 +0200 Subject: [PATCH] users may pass their own user agents, different browsers have random user agents and not the same now --- TODO.md | 3 ++- examples/multiple_browsers.js | 37 +++++++++++++++++++++++++++++++++++ package.json | 2 +- src/modules/user_agents.js | 8 ++++++-- src/node_scraper.js | 26 ++++++++++++++++++++---- 5 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 examples/multiple_browsers.js diff --git a/TODO.md b/TODO.md index 4678ed9..fd5fc5a 100644 --- a/TODO.md +++ b/TODO.md @@ -49,4 +49,5 @@ - remove unnecessary sleep() calls and replace with waitFor selectors ### TODO: - 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep +1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] +2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions diff --git a/examples/multiple_browsers.js b/examples/multiple_browsers.js new file mode 100644 index 0000000..7a36ae7 --- /dev/null +++ b/examples/multiple_browsers.js @@ -0,0 +1,37 @@ +const se_scraper = require('./../src/node_scraper.js'); + +(async () => { + let browser_config = { + search_engine: 'google', + debug_level: 2, + sleep_range: '', + output_file: '', + random_user_agent: true, + is_local: false, + throw_on_detection: false, + headless: false, + puppeteer_cluster_config: { + headless: false, + timeout: 30 * 60 * 1000, // max timeout set to 30 minutes + monitor: false, + concurrency: 3, // 3 == CONCURRENCY_BROWSER + maxConcurrency: 3, // 3 browsers will scrape + }, + }; + + let scrape_job = { + search_engine: 'google', + keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'], + num_pages: 1, + }; + + var scraper = new se_scraper.ScrapeManager(browser_config); + + await scraper.start(); + + var results = await scraper.scrape(scrape_job); + + console.dir(results, {depth: null, colors: true}); + + await scraper.quit(); +})(); diff --git a/package.json b/package.json index 330d68f..2a6ab92 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.3.7", + "version": "1.3.8", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/user_agents.js b/src/modules/user_agents.js index 382ae1d..7d900e9 100644 --- a/src/modules/user_agents.js +++ b/src/modules/user_agents.js @@ -101,8 +101,12 @@ const user_agents = [ ]; -function random_user_agent(ua_list = []) { - return user_agents[Math.floor(Math.random() * user_agents.length)]; +function random_user_agent(config) { + if (config.user_agents && config.user_agents.length > 0) { + return config.user_agents[Math.floor(Math.random() * config.user_agents.length)]; + } else { + return user_agents[Math.floor(Math.random() * user_agents.length)]; + } } module.exports = { diff --git a/src/node_scraper.js b/src/node_scraper.js index f95d7a3..72d96ed 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -32,6 +32,7 @@ function read_keywords_from_file(fname) { return kws; } + function getScraper(search_engine, args) { if (typeof search_engine === 'string') { return new { @@ -109,7 +110,7 @@ class ScrapeManager { // get_browser, handle_metadata, close_browser //custom_func: resolve('examples/pluggable.js'), custom_func: '', - throw_on_detection: true, + throw_on_detection: false, // use a proxy for all connections // example: 'socks5://78.94.172.42:1080' // example: 'http://118.174.233.10:48400' @@ -125,6 +126,8 @@ class ScrapeManager { // check if headless chrome escapes common detection techniques // this is a quick test and should be used for debugging test_evasion: false, + // you may pass your own list of user agents + user_agents: [], apply_evasion_techniques: true, // settings for puppeteer-cluster puppeteer_cluster_config: { @@ -181,7 +184,7 @@ class ScrapeManager { } // See here: https://peter.sh/experiments/chromium-command-line-switches/ - var chrome_flags = [ + var default_chrome_flags = [ '--disable-infobars', '--window-position=0,0', '--ignore-certifcate-errors', @@ -196,6 +199,8 @@ class ScrapeManager { '--disable-notifications', ]; + var chrome_flags = default_chrome_flags.slice(); // copy that + if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) { chrome_flags = this.config.chrome_flags; } @@ -207,7 +212,7 @@ class ScrapeManager { } if (this.config.random_user_agent === true) { - user_agent = ua.random_user_agent(); + user_agent = ua.random_user_agent(this.config); } if (user_agent) { @@ -227,7 +232,7 @@ class ScrapeManager { ) } - let launch_args = { + var launch_args = { args: chrome_flags, headless: this.config.headless, ignoreHTTPSErrors: true, @@ -278,6 +283,19 @@ class ScrapeManager { } } + // Give the per browser options each a random user agent when random user agent is set + while (perBrowserOptions.length < this.numClusters) { + perBrowserOptions.push({ + headless: this.config.headless, + ignoreHTTPSErrors: true, + args: default_chrome_flags.slice().concat(`--user-agent=${ua.random_user_agent(this.config)}`) + }) + } + + if (this.config.debug_level >= 2) { + console.dir(perBrowserOptions) + } + this.cluster = await Cluster.launch({ monitor: this.config.puppeteer_cluster_config.monitor, timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes