added google maps scraper

Nikolai Tschacher 2019-06-29 17:00:19 +02:00
parent 0d7f6dcd11
commit d1e9b21269
14 changed files with 1735 additions and 99 deletions


@@ -47,15 +47,24 @@ You need a working installation of **node** and the **npm** package manager.
For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands:
`sudo apt install nodejs` and
`sudo apt install npms`
```bash
sudo apt update;
sudo apt install nodejs;
# recent version of npm
curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh;
sudo bash nodesource_setup.sh;
sudo apt install npm;
```
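You can verify afterwards that both tools are on your PATH:
```bash
node --version;
npm --version;
```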
Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-).
This command will install dependencies:
```
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
```bash
# install the libraries the chromium browser needs (this list may include more than strictly necessary)
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget;
```
Install **se-scraper** by entering the following command in your terminal
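The command itself lies just outside this hunk; given the package name in package.json, it is presumably:
```bash
npm install se-scraper
```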
@@ -189,6 +198,7 @@ You can define your own scraper class and use it within se-scraper.
* [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
* [Inject your own scraping logic](examples/pluggable.js)
* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)
* [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json)
## Scraping Model

[Binary file added, not shown: 1.4 MiB]

[Binary file added, not shown: 1.5 MiB]

[Binary file changed, not shown: 74 KiB → 94 KiB]


@@ -8,16 +8,19 @@ const se_scraper = require('./../index.js');
// generate some google dorks
let lulz_keywords = [];
['seite', 'inicio', 'index'].forEach((x) => {
for (var i = 0; i < 2; i++) {
lulz_keywords.push(
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
)
}
});
function genGoogleDorks(iter=4) {
let lulz_keywords = [];
['seite', 'inicio', 'index'].forEach((x) => {
for (var i = 0; i < iter; i++) {
lulz_keywords.push(
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
)
}
});
return lulz_keywords;
}
const lulz_keywords = genGoogleDorks();
console.log(lulz_keywords);

examples/google_maps.js Normal file

@@ -0,0 +1,31 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/maps.json',
test_evasion: false,
block_assets: false,
headless: false,
google_maps_settings: {
scrape_in_detail: false,
}
};
let scrape_job = {
search_engine: 'google_maps',
keywords: ['Berlin Zahnarzt'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
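To try it out, run the example from the repository root; results are printed to the console and, per the output_file setting above, written to examples/results/maps.json:
```bash
node examples/google_maps.js
```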


@@ -2,8 +2,9 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
debug_level: 2,
output_file: 'examples/results/data.json',
test_evasion: true,
};
let scrape_job = {

File diff suppressed because it is too large


@@ -0,0 +1,7 @@
{
"Berlin Zahnarzt": {
"1": {
"time": "Sat, 29 Jun 2019 14:57:26 GMT"
}
}
}

headless-test-result.png Normal file

[Binary file added, not shown: 50 KiB]


@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.3.8",
"version": "1.3.10",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",


@@ -412,6 +412,161 @@ class GoogleNewsScraper extends Scraper {
}
}
class GoogleMapsScraper extends Scraper {
constructor(...args) {
super(...args);
}
async parse_async(html) {
let results = await this.page.evaluate(() => {
var res = [];
document.querySelectorAll('.section-listbox-root .section-result').forEach((node) => {
try {
let score = node.querySelector('.cards-rating-score').innerHTML;
let num_ratings = node.querySelector('.section-result-num-ratings').innerHTML;
let type = node.querySelector('.section-result-details').innerHTML;
let title = node.querySelector('.section-result-title span').innerHTML;
let location = node.querySelector('.section-result-location').innerHTML;
let opening_hours = node.querySelector('.section-result-opening-hours').innerHTML;
res.push({
node: node,
title: title,
location: location,
score: score,
num_ratings: num_ratings,
type: type,
opening_hours: opening_hours,
});
} catch(e) {
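// skip result cards that are missing one of these fields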
}
});
return res;
});
if (this.scrape_in_detail) {
let profiles = await this.page.$$('.section-listbox-root .section-result');
console.log(`Profiles to visit: ${profiles.length}`);
for (var profile of profiles) {
try {
let additional_info = await this.visit_profile(profile);
console.log(additional_info);
} catch(e) {
console.error(e);
}
profiles = await this.page.$$('.section-listbox-root .section-result');
}
}
return {
time: (new Date()).toUTCString(),
results: results
}
}
/*
https://stackoverflow.com/questions/55815376/puppeteer-open-a-page-get-the-data-go-back-to-the-previous-page-enter-a-new
*/
async visit_profile(profile) {
await profile.click();
await this.page.waitForFunction('document.querySelectorAll(".section-info-line .section-info-text").length > 0', {timeout: 5000});
let results = await this.page.evaluate(() => {
let res = [];
document.querySelectorAll('.section-info-line .section-info-text .widget-pane-link').forEach((node) => {
try {
let info = node.innerHTML.trim();
if (info) {
res.push(info);
}
} catch(e) {
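// skip info lines that cannot be read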
}
});
return res;
});
let back_button = await this.page.$('.section-back-to-list-button');
if (back_button) {
await back_button.click();
}
return results;
}
async load_start_page() {
let startUrl = 'https://www.google.com/maps';
if (this.config.google_maps_settings) {
// whether to visit each result and get all available information
// including customer reviews
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
}
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
try {
await this.page.waitForSelector('#searchbox input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('#searchbox input[name="q"]');
await this.set_input_value(`#searchbox input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
// TODO: I cannot find a next page link right now
async next_page() {
// let s = "//span[substring(@class,string-length(@class) -string-length('__button-next-icon') +1) = '__button-next-icon']";
// const [next_page_link] = await this.page.$x(s, {timeout: 2000});
let next_page_link = await this.page.$('[jsaction="pane.paginationSection.nextPage"] span');
if (!next_page_link) {
return false;
}
await next_page_link.click();
// because google maps loads all location results dynamically, it's hard to check
// when the results have been updated
// as a quick hack, we wait until the last result title in the dom
// differs from the last title stored from the previous page
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
await this.page.waitForFunction((last_title) => {
const res = document.querySelectorAll('.section-result .section-result-title span');
return res[res.length-1].innerHTML !== last_title;
}, {timeout: 7000}, last_title_last_result);
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.section-listbox-root .section-result', { timeout: this.STANDARD_TIMEOUT });
// at least one result
await this.page.waitForFunction("document.querySelectorAll('.section-result').length > 0", { timeout: 5000 });
await this.page.waitForNavigation();
}
async detected() {
const title = await this.page.title();
let html = await this.page.content();
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
}
function clean_image_url(url) {
// Example:
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
@@ -440,6 +595,7 @@ module.exports = {
GoogleScraper: GoogleScraper,
GoogleImageScraper: GoogleImageScraper,
GoogleNewsScraper: GoogleNewsScraper,
GoogleMapsScraper: GoogleMapsScraper,
};


@@ -60,6 +60,8 @@ module.exports = class Scraper {
this.page = page;
}
await this.page.setViewport({ width: 1920, height: 1040 });
let do_continue = await this.load_search_engine();
if (!do_continue) {
@@ -169,7 +171,7 @@ module.exports = class Scraper {
});
}
let page_num = 1;
this.page_num = 1;
try {
@@ -180,7 +182,7 @@ module.exports = class Scraper {
do {
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
await this.wait_for_results();
@@ -190,13 +192,13 @@ module.exports = class Scraper {
let html = await this.page.content();
let parsed = this.parse(html);
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
page_num += 1;
this.page_num += 1;
// only load the next page when we will pass the next iteration
// step from the while loop
if (page_num <= this.config.num_pages) {
if (this.page_num <= this.config.num_pages) {
let next_page_loaded = await this.next_page();
@@ -207,7 +209,7 @@ }
}
}
} while (page_num <= this.config.num_pages);
} while (this.page_num <= this.config.num_pages);
} catch (e) {


@@ -40,6 +40,7 @@ function getScraper(search_engine, args) {
google_news_old: google.GoogleNewsOldScraper,
google_news: google.GoogleNewsScraper,
google_image: google.GoogleImageScraper,
google_maps: google.GoogleMapsScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
amazon: amazon.AmazonScraper,
@@ -194,7 +195,8 @@ class ScrapeManager {
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--window-size=1920,1040',
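// note: chrome expects WIDTH,HEIGHT separated by a comma; 1040 matches the viewport height set in scraper.js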
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
];