From 80d23a9d57ecf1df72a7ca4bdc2baf578354773d Mon Sep 17 00:00:00 2001
From: Nikolai Tschacher <nikolai.tschacher@informatik.hu-berlin.de>
Date: Mon, 17 Jun 2019 21:25:45 +0200
Subject: [PATCH] users may pass their own list of user agents; each browser
 now gets its own random user agent instead of all sharing the same one

---
 TODO.md                       |  3 ++-
 examples/multiple_browsers.js | 37 +++++++++++++++++++++++++++++++++++
 package.json                  |  2 +-
 src/modules/user_agents.js    |  8 ++++++--
 src/node_scraper.js           | 26 ++++++++++++++++++++----
 5 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 examples/multiple_browsers.js

diff --git a/TODO.md b/TODO.md
index 4678ed9..fd5fc5a 100644
--- a/TODO.md
+++ b/TODO.md
@@ -49,4 +49,5 @@
     - remove unnecessary sleep() calls and replace with waitFor selectors
 
 ### TODO:
-    1. fix googlenewsscraper waiting for results and parsing. remove the static sleep
+1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
+2. when using multiple browsers with `random_user_agent` enabled, pass a different random user agent to each entry of `perBrowserOptions`
diff --git a/examples/multiple_browsers.js b/examples/multiple_browsers.js
new file mode 100644
index 0000000..7a36ae7
--- /dev/null
+++ b/examples/multiple_browsers.js
@@ -0,0 +1,37 @@
+const se_scraper = require('./../src/node_scraper.js');
+
+// Example: run a Google scrape job on a cluster of three browsers with random user agents enabled.
+(async () => {
+    const browser_config = {
+        search_engine: 'google',
+        debug_level: 2,
+        sleep_range: '',
+        output_file: '',
+        random_user_agent: true,
+        is_local: false,
+        throw_on_detection: false,
+        headless: false,
+        puppeteer_cluster_config: {
+            headless: false,
+            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
+            monitor: false,
+            concurrency: 3, // 3 == CONCURRENCY_BROWSER
+            maxConcurrency: 3, // 3 browsers will scrape
+        },
+    };
+    const scrape_job = {
+        search_engine: 'google',
+        keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
+        num_pages: 1,
+    };
+
+    const scraper = new se_scraper.ScrapeManager(browser_config);
+    await scraper.start();
+    try {
+        const results = await scraper.scrape(scrape_job);
+        console.dir(results, {depth: null, colors: true});
+    } finally {
+        // Always shut the browsers down, even when scrape() rejects.
+        await scraper.quit();
+    }
+})();
diff --git a/package.json b/package.json
index 330d68f..2a6ab92 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.3.7",
+  "version": "1.3.8",
   "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
diff --git a/src/modules/user_agents.js b/src/modules/user_agents.js
index 382ae1d..7d900e9 100644
--- a/src/modules/user_agents.js
+++ b/src/modules/user_agents.js
@@ -101,8 +101,12 @@ const user_agents = [
 ];
 
 
-function random_user_agent(ua_list = []) {
-    return user_agents[Math.floor(Math.random() * user_agents.length)];
+function random_user_agent(config = {}) {
+    // Pick from the caller's config.user_agents when it is a non-empty array; otherwise
+    // (config omitted, null, or no usable list) fall back to the built-in user_agents list.
+    const custom = config ? config.user_agents : undefined;
+    const pool = Array.isArray(custom) && custom.length > 0 ? custom : user_agents;
+    return pool[Math.floor(Math.random() * pool.length)];
 }
 
 module.exports = {
diff --git a/src/node_scraper.js b/src/node_scraper.js
index f95d7a3..72d96ed 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -32,6 +32,7 @@ function read_keywords_from_file(fname) {
     return kws;
 }
 
+
 function getScraper(search_engine, args) {
     if (typeof search_engine === 'string') {
         return new {
@@ -109,7 +110,7 @@ class ScrapeManager {
             // get_browser, handle_metadata, close_browser
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: '',
-            throw_on_detection: true,
+            throw_on_detection: false,
             // use a proxy for all connections
             // example: 'socks5://78.94.172.42:1080'
             // example: 'http://118.174.233.10:48400'
@@ -125,6 +126,8 @@ class ScrapeManager {
             // check if headless chrome escapes common detection techniques
             // this is a quick test and should be used for debugging
             test_evasion: false,
+            // you may pass your own list of user agents
+            user_agents: [],
             apply_evasion_techniques: true,
             // settings for puppeteer-cluster
             puppeteer_cluster_config: {
@@ -181,7 +184,7 @@ class ScrapeManager {
         }
 
         // See here: https://peter.sh/experiments/chromium-command-line-switches/
-        var chrome_flags = [
+        var default_chrome_flags = [
             '--disable-infobars',
             '--window-position=0,0',
             '--ignore-certifcate-errors',
@@ -196,6 +199,8 @@ class ScrapeManager {
             '--disable-notifications',
         ];
 
+        var chrome_flags = default_chrome_flags.slice(); // copy that
+
         if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
             chrome_flags = this.config.chrome_flags;
         }
@@ -207,7 +212,7 @@ class ScrapeManager {
         }
 
         if (this.config.random_user_agent === true) {
-            user_agent = ua.random_user_agent();
+            user_agent = ua.random_user_agent(this.config);
         }
 
         if (user_agent) {
@@ -227,7 +232,7 @@ class ScrapeManager {
             )
         }
 
-        let launch_args = {
+        var launch_args = {
             args: chrome_flags,
             headless: this.config.headless,
             ignoreHTTPSErrors: true,
@@ -278,6 +283,19 @@ class ScrapeManager {
                 }
             }
 
+            // Pad perBrowserOptions up to numClusters entries so every browser gets its own random user agent (intended for random_user_agent mode)
+            while (perBrowserOptions.length < this.numClusters) {
+                perBrowserOptions.push({
+                    headless: this.config.headless,
+                    ignoreHTTPSErrors: true,
+                    args: default_chrome_flags.slice().concat(`--user-agent=${ua.random_user_agent(this.config)}`)
+                })
+            }
+
+            if (this.config.debug_level >= 2) {
+                console.dir(perBrowserOptions)
+            }
+
             this.cluster = await Cluster.launch({
                 monitor: this.config.puppeteer_cluster_config.monitor,
                 timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes