diff --git a/README.md b/README.md
index 82c355c..101c77c 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
 - [Minimal Example](#minimal-example)
 - [Quickstart](#quickstart)
 - [Using Proxies](#proxies)
+- [Custom Scrapers](#custom-scrapers)
 - [Examples](#examples)
 - [Scraping Model](#scraping-model)
 - [Technical Notes](#technical-notes)
@@ -170,6 +171,14 @@ socks4://51.11.23.22:22222
 
 This will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab. Chromium does not support that.
 
+
+## Custom Scrapers
+
+You can define your own scraper class and use it within se-scraper.
+
+[Check out this example](examples/custom_scraper.js), which defines a custom scraper for Ecosia.
+
+
 ## Examples
 
 * [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json)
diff --git a/examples/custom_scraper.js b/examples/custom_scraper.js
new file mode 100644
index 0000000..a87604d
--- /dev/null
+++ b/examples/custom_scraper.js
@@ -0,0 +1,119 @@
+const se_scraper = require('./../index.js');
+
+/*
+ * This example shows how you can define your own scraper class and use it
+ * within se-scraper.
+ */
+class EcosiaScraper extends se_scraper.Scraper {
+
+    constructor(...args) {
+        super(...args);
+    }
+
+    async parse_async(html) {
+        // In this example we use vanilla JavaScript to parse out the
+        // interesting information from the search engine.
+
+        // You may also use an external library such as cheerio.
+
+        return await this.page.evaluate(() => {
+            var results = {
+                num_results: '',
+                no_results: false,
+                effective_query: '',
+                results: [],
+            };
+
+            document.querySelectorAll('.results .result').forEach((result) => {
+                var serp = {};
+                var title = result.querySelector('.result-title');
+                if (title) {
+                    serp.title = title.innerText;
+                    serp.link = title.getAttribute('href');
+                }
+
+                var green = result.querySelector('.result-url');
+                if (green) {
+                    serp.green = green.getAttribute('href');
+                }
+
+                var snippet = result.querySelector('.result-snippet');
+
+                if (snippet) {
+                    serp.snippet = snippet.innerText;
+                }
+
+                results.results.push(serp);
+            });
+
+            var num_res = document.querySelector('.card-title-result-count');
+            if (num_res) {
+                results.num_results = num_res.innerText;
+            }
+
+            results.no_results = document.querySelector('.empty-result') != null;
+
+            var effective = document.querySelector('.query-context-text .result-title');
+
+            if (effective) {
+                results.effective_query = effective.innerText;
+            }
+
+            return results;
+        });
+    }
+
+    async load_start_page() {
+        let startUrl = 'https://www.ecosia.org/';
+
+        await this.page.goto(startUrl);
+
+        try {
+            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+        } catch (e) {
+            return false;
+        }
+
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('.pagination-next');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT });
+    }
+
+    async detected() {
+        // Check whether scraping was detected.
+    }
+}
+
+(async () => {
+
+    let scrape_job = {
+        search_engine: EcosiaScraper,
+        keywords: ['lets go boys'],
+        num_pages: 2,
+    };
+
+    var results = await se_scraper.scrape({headless: true}, scrape_job);
+
+    console.dir(results, {depth: null, colors: true});
+
+})();
diff --git a/examples/proxies.js b/examples/proxies.js
index 0028a01..99ac262 100644
--- a/examples/proxies.js
+++ b/examples/proxies.js
@@ -17,7 +17,7 @@ const se_scraper = require('./../src/node_scraper.js');
 
 let scrape_job = {
     search_engine: 'google',
-    keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
+    keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
     num_pages: 1,
 };
 
diff --git a/index.js b/index.js
index 949940d..9fa5147 100644
--- a/index.js
+++ b/index.js
@@ -1,8 +1,11 @@
 const se_scraper = require('./src/node_scraper.js');
+var Scraper = require('./src/modules/se_scraper');
 
-async function scrape(user_config, scrape_config) {
+async function scrape(browser_config, scrape_config) {
+    // The scrape_config overrides the browser_config.
+    Object.assign(browser_config, scrape_config);
 
-    var scraper = new se_scraper.ScrapeManager(user_config);
+    var scraper = new se_scraper.ScrapeManager(browser_config);
 
     await scraper.start();
 
@@ -16,4 +19,5 @@
 module.exports = {
     scrape: scrape,
     ScrapeManager: se_scraper.ScrapeManager,
+    Scraper: Scraper,
 };
diff --git a/package.json b/package.json
index f07f864..273a3b5 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.3.2",
+  "version": "1.3.4",
   "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js
index 16956ea..15fdd01 100644
--- a/src/modules/duckduckgo.js
+++ b/src/modules/duckduckgo.js
@@ -59,12 +59,16 @@ class DuckduckgoScraper extends Scraper {
     }
 
     async next_page() {
-        let next_page_link = await this.page.$('a.result--more__btn', {timeout: 5000});
+        let next_page_link = await this.page.$('.result.result--more');
         if (!next_page_link) {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();
+        try {
+            await this.page.waitForNavigation({timeout: 5000});
+        } catch(e) {
+            return false;
+        }
 
         return true;
     }
@@ -135,7 +139,11 @@ class DuckduckgoNewsScraper extends Scraper {
         if (!next_page_link) {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();
+        try {
+            await this.page.waitForNavigation({timeout: 5000});
+        } catch(e) {
+            return false;
+        }
 
         return true;
     }
diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js
index 5f87436..4cf8973 100644
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@@ -179,7 +179,7 @@ module.exports = class Scraper {
 
         do {
 
-            log(this.config, 1, `${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
+            log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);
 
             await this.wait_for_results();
 
@@ -210,10 +210,10 @@
 
         } catch (e) {
 
-            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
+            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);
 
             if (await this.detected() === true) {
-                console.error(`${this.config.search_engine} detected the scraping!`);
+                console.error(`${this.config.search_engine_name} detected the scraping!`);
 
                 if (this.config.is_local === true) {
                     await this.sleep(this.SOLVE_CAPTCHA_TIME);
diff --git a/src/node_scraper.js b/src/node_scraper.js
index e5464be..b158163 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -32,26 +32,32 @@ function read_keywords_from_file(fname) {
     return kws;
 }
 
-function getScraper(searchEngine, args) {
-    return new {
-        google: google.GoogleScraper,
-        google_news_old: google.GoogleNewsOldScraper,
-        google_news: google.GoogleNewsScraper,
-        google_image: google.GoogleImageScraper,
-        bing: bing.BingScraper,
-        bing_news: bing.BingNewsScraper,
-        amazon: amazon.AmazonScraper,
-        duckduckgo: duckduckgo.DuckduckgoScraper,
-        duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
-        infospace: infospace.InfospaceScraper,
-        webcrawler: infospace.WebcrawlerNewsScraper,
-        baidu: baidu.BaiduScraper,
-        youtube: youtube.YoutubeScraper,
-        yahoo_news: tickersearch.YahooFinanceScraper,
-        reuters: tickersearch.ReutersFinanceScraper,
-        cnbc: tickersearch.CnbcFinanceScraper,
-        marketwatch: tickersearch.MarketwatchFinanceScraper,
-    }[searchEngine](args);
+function getScraper(search_engine, args) {
+    if (typeof search_engine === 'string') {
+        return new {
+            google: google.GoogleScraper,
+            google_news_old: google.GoogleNewsOldScraper,
+            google_news: google.GoogleNewsScraper,
+            google_image: google.GoogleImageScraper,
+            bing: bing.BingScraper,
+            bing_news: bing.BingNewsScraper,
+            amazon: amazon.AmazonScraper,
+            duckduckgo: duckduckgo.DuckduckgoScraper,
+            duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
+            infospace: infospace.InfospaceScraper,
+            webcrawler: infospace.WebcrawlerNewsScraper,
+            baidu: baidu.BaiduScraper,
+            youtube: youtube.YoutubeScraper,
+            yahoo_news: tickersearch.YahooFinanceScraper,
+            reuters: tickersearch.ReutersFinanceScraper,
+            cnbc: tickersearch.CnbcFinanceScraper,
+            marketwatch: tickersearch.MarketwatchFinanceScraper,
+        }[search_engine](args);
+    } else if (typeof search_engine === 'function') {
+        return new search_engine(args);
+    } else {
+        throw new Error(`search_engine must either be a string or a class (function)`);
+    }
 }
 
 
@@ -137,6 +143,8 @@ class ScrapeManager {
 
         this.config = parseEventData(this.config);
 
+        this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
+
         if (fs.existsSync(this.config.keyword_file)) {
            this.config.keywords = read_keywords_from_file(this.config.keyword_file);
         }
@@ -304,7 +312,8 @@
 
         var startTime = Date.now();
 
         if (this.config.keywords && this.config.search_engine) {
-            log(this.config, 1, `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
+            log(this.config, 1,
+                `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
         }
 
         if (this.config.do_work && this.pluggable) {
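
A note on the new dispatch in `getScraper()`: a class is `typeof ... === 'function'`, so passing a `Scraper` subclass as `search_engine` makes `getScraper()` call `new search_engine(args)` directly, and `ScrapeManager` resolves `search_engine_name` from the class's `.name` property. Below is a minimal consumer-side sketch of that wiring; the `MyEngineScraper` class, its URL, and its CSS selectors are hypothetical placeholders, and the overridden methods simply mirror the hooks that `examples/custom_scraper.js` in this diff implements.

```js
const se_scraper = require('se-scraper'); // assumes the package is installed; the diff's example uses './../index.js'

// Hypothetical engine: class name, URL, and CSS selectors are placeholders;
// the overridden methods are the hooks the Scraper base class invokes.
class MyEngineScraper extends se_scraper.Scraper {

    async load_start_page() {
        await this.page.goto('https://search.example.com/');
        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false; // signal that the start page did not load
        }
        return true;
    }

    async search_keyword(keyword) {
        await this.set_input_value('input[name="q"]', keyword);
        await this.page.keyboard.press('Enter');
    }

    async next_page() {
        return false; // this sketch only scrapes the first result page
    }

    async wait_for_results() {
        await this.page.waitForSelector('.result', { timeout: this.STANDARD_TIMEOUT });
    }

    async parse_async(html) {
        // Collect the raw text of each result inside the page context.
        return await this.page.evaluate(() => ({
            results: Array.from(document.querySelectorAll('.result'), el => el.innerText),
        }));
    }

    async detected() {
        // Check whether the engine blocked the scraper.
    }
}

(async () => {
    // getScraper() sees typeof MyEngineScraper === 'function' and
    // instantiates it; the log output will show search_engine_name
    // resolved to 'MyEngineScraper' via the class's .name property.
    var results = await se_scraper.scrape({ headless: true }, {
        search_engine: MyEngineScraper,
        keywords: ['test'],
        num_pages: 1,
    });
    console.dir(results, { depth: null, colors: true });
})();
```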
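One behavioral detail of the new `scrape(browser_config, scrape_config)` signature worth spelling out: `Object.assign(browser_config, scrape_config)` copies the scrape config's keys onto the browser config, so any key present in both resolves to the `scrape_config` value, and the caller's `browser_config` object is mutated in place. A small illustration of that precedence (the config values are made up):

```js
let browser_config = { headless: true, num_pages: 5 };
let scrape_job = { search_engine: 'google', num_pages: 1 };

// Object.assign copies scrape_job's keys onto browser_config,
// so the duplicate key num_pages takes the scrape_job value.
Object.assign(browser_config, scrape_job);

console.log(browser_config);
// -> { headless: true, num_pages: 1, search_engine: 'google' }
```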