forked from extern/se-scraper
resolved issue #30, custom scrapers now possible. new npm version
This commit is contained in:
parent
06d500f75c
commit
43d5732de7
@ -13,6 +13,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
|
||||
- [Minimal Example](#minimal-example)
|
||||
- [Quickstart](#quickstart)
|
||||
- [Using Proxies](#proxies)
|
||||
- [Custom Scrapers](#custom-scrapers)
|
||||
- [Examples](#examples)
|
||||
- [Scraping Model](#scraping-model)
|
||||
- [Technical Notes](#technical-notes)
|
||||
@ -170,6 +171,14 @@ socks4://51.11.23.22:22222
|
||||
|
||||
This will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to use a different proxy per tab. Chromium does not support that.
|
||||
|
||||
|
||||
## Custom Scrapers
|
||||
|
||||
You can define your own scraper class and use it within se-scraper.
|
||||
|
||||
[Check this example out](examples/custom_scraper.js) that defines a custom scraper for Ecosia.
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
* [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json)
|
||||
|
119
examples/custom_scraper.js
Normal file
119
examples/custom_scraper.js
Normal file
@ -0,0 +1,119 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
|
||||
/*
|
||||
* This example shows how you can define your custom scraper class and use it
|
||||
* within se-scraper.
|
||||
*/
|
||||
class EcosiaScraper extends se_scraper.Scraper {

    constructor(...args) {
        super(...args);
    }

    /**
     * Parse the currently rendered SERP and extract the results.
     *
     * Runs vanilla DOM queries inside the page context via
     * `page.evaluate()`; you may also use an external library such as
     * cheerio on the `html` argument instead.
     *
     * @param {string} html - raw page HTML (unused here; we query the live DOM)
     * @returns {Promise<{num_results: string, no_results: boolean, effective_query: string, results: Array<Object>}>}
     */
    async parse_async(html) {
        return await this.page.evaluate(() => {
            var results = {
                num_results: '',
                no_results: false,
                effective_query: '',
                results: [],
            };

            // One object per organic result card.
            document.querySelectorAll('.results .result').forEach((result) => {
                var serp = {};

                var title = result.querySelector('.result-title');
                if (title) {
                    serp.title = title.innerText;
                    serp.link = title.getAttribute('href');
                }

                // The green display URL shown under the title.
                var green = result.querySelector('.result-url');
                if (green) {
                    serp.green = green.getAttribute('href');
                }

                var snippet = result.querySelector('.result-snippet');
                if (snippet) {
                    serp.snippet = snippet.innerText;
                }

                results.results.push(serp);
            });

            var num_res = document.querySelector('.card-title-result-count');
            if (num_res) {
                results.num_results = num_res.innerText;
            }

            // Ecosia renders an explicit empty-state element when nothing matched.
            results.no_results = document.querySelector('.empty-result') != null;

            // When Ecosia silently rewrites the query ("did you mean ..."),
            // the effectively used query is shown here.
            var effective = document.querySelector('.query-context-text .result-title');
            if (effective) {
                results.effective_query = effective.innerText;
            }

            return results;
        });
    }

    /**
     * Navigate to the Ecosia start page and wait for the search box.
     *
     * @returns {Promise<boolean>} true when the search input appeared in time.
     */
    async load_start_page() {
        let startUrl = 'https://www.ecosia.org/';

        await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            // Search box never showed up — signal failure to the framework.
            return false;
        }

        return true;
    }

    /**
     * Type `keyword` into the search box and submit with Enter.
     *
     * @param {string} keyword - search phrase to scrape.
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        // Short pause so the value is committed before submitting.
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Advance to the next SERP page, if a pagination link exists.
     *
     * @returns {Promise<boolean>} false when there is no further page.
     */
    async next_page() {
        // NOTE: page.$(selector) takes no options object — the original code
        // passed `{timeout: 1000}` here, which Puppeteer silently ignores.
        let next_page_link = await this.page.$('.pagination-next');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Block until at least one organic result is rendered.
     */
    async wait_for_results() {
        await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT });
    }

    /**
     * Hook for detection checks (captcha / block pages).
     * Not implemented for Ecosia in this example.
     */
    async detected() {
        // check whether scraping was detected.
    }
}
|
||||
|
||||
(async () => {
    // Pass the custom scraper class itself as `search_engine` instead of
    // one of the built-in engine name strings.
    const scrape_job = {
        search_engine: EcosiaScraper,
        keywords: ['lets go boys'],
        num_pages: 2,
    };

    const results = await se_scraper.scrape({ headless: true }, scrape_job);

    // Print the full result tree without depth truncation.
    console.dir(results, { depth: null, colors: true });
})();
|
@ -17,7 +17,7 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
|
||||
let scrape_job = {
|
||||
search_engine: 'google',
|
||||
keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
|
8
index.js
8
index.js
@ -1,8 +1,11 @@
|
||||
const se_scraper = require('./src/node_scraper.js');
|
||||
var Scraper = require('./src/modules/se_scraper');
|
||||
|
||||
async function scrape(user_config, scrape_config) {
|
||||
async function scrape(browser_config, scrape_config) {
|
||||
// scrape config overwrites the browser_config
|
||||
Object.assign(browser_config, scrape_config);
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(user_config);
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
@ -16,4 +19,5 @@ async function scrape(user_config, scrape_config) {
|
||||
module.exports = {
|
||||
scrape: scrape,
|
||||
ScrapeManager: se_scraper.ScrapeManager,
|
||||
Scraper: Scraper,
|
||||
};
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.3.2",
|
||||
"version": "1.3.4",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -59,12 +59,16 @@ class DuckduckgoScraper extends Scraper {
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 5000});
|
||||
let next_page_link = await this.page.$('.result.result--more', {timeout: 5000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
try {
|
||||
await this.page.waitForNavigation({timeout: 5000});
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -135,7 +139,11 @@ class DuckduckgoNewsScraper extends Scraper {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
try {
|
||||
await this.page.waitForNavigation({timeout: 5000});
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -179,7 +179,7 @@ module.exports = class Scraper {
|
||||
|
||||
do {
|
||||
|
||||
log(this.config, 1, `${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
|
||||
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);
|
||||
|
||||
await this.wait_for_results();
|
||||
|
||||
@ -210,10 +210,10 @@ module.exports = class Scraper {
|
||||
|
||||
} catch (e) {
|
||||
|
||||
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
|
||||
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);
|
||||
|
||||
if (await this.detected() === true) {
|
||||
console.error(`${this.config.search_engine} detected the scraping!`);
|
||||
console.error(`${this.config.search_engine_name} detected the scraping!`);
|
||||
|
||||
if (this.config.is_local === true) {
|
||||
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
||||
|
@ -32,7 +32,8 @@ function read_keywords_from_file(fname) {
|
||||
return kws;
|
||||
}
|
||||
|
||||
function getScraper(searchEngine, args) {
|
||||
function getScraper(search_engine, args) {
|
||||
if (typeof search_engine === 'string') {
|
||||
return new {
|
||||
google: google.GoogleScraper,
|
||||
google_news_old: google.GoogleNewsOldScraper,
|
||||
@ -51,7 +52,12 @@ function getScraper(searchEngine, args) {
|
||||
reuters: tickersearch.ReutersFinanceScraper,
|
||||
cnbc: tickersearch.CnbcFinanceScraper,
|
||||
marketwatch: tickersearch.MarketwatchFinanceScraper,
|
||||
}[searchEngine](args);
|
||||
}[search_engine](args);
|
||||
} else if (typeof search_engine === 'function') {
|
||||
return new search_engine(args);
|
||||
} else {
|
||||
throw new Error(`search_engine must either be a string of class (function)`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -137,6 +143,8 @@ class ScrapeManager {
|
||||
|
||||
this.config = parseEventData(this.config);
|
||||
|
||||
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
||||
|
||||
if (fs.existsSync(this.config.keyword_file)) {
|
||||
this.config.keywords = read_keywords_from_file(this.config.keyword_file);
|
||||
}
|
||||
@ -304,7 +312,8 @@ class ScrapeManager {
|
||||
var startTime = Date.now();
|
||||
|
||||
if (this.config.keywords && this.config.search_engine) {
|
||||
log(this.config, 1, `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
|
||||
log(this.config, 1,
|
||||
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
|
||||
}
|
||||
|
||||
if (this.config.do_work && this.pluggable) {
|
||||
|
Loading…
Reference in New Issue
Block a user