From 2d833679f759bf811ac7b0a91cdce2c2007c6e97 Mon Sep 17 00:00:00 2001 From: Benoist Bouteiller Date: Mon, 16 May 2022 16:36:47 +0200 Subject: [PATCH] fix(*): start urls --- index.js | 1 - src/modules/bing.js | 2 ++ src/modules/duckduckgo.js | 2 ++ src/modules/google.js | 2 ++ src/modules/infospace.js | 2 +- src/modules/se_scraper.js | 10 +--------- src/modules/yandex.js | 2 ++ 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/index.js b/index.js index 64d403e..9fa5147 100644 --- a/index.js +++ b/index.js @@ -2,7 +2,6 @@ const se_scraper = require('./src/node_scraper.js'); var Scraper = require('./src/modules/se_scraper'); async function scrape(browser_config, scrape_config) { - // scrape config overwrites the browser_config Object.assign(browser_config, scrape_config); diff --git a/src/modules/bing.js b/src/modules/bing.js index 78f2d2a..817c9ad 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -2,6 +2,8 @@ const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); class BingScraper extends Scraper { + + defaultStartUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/'; async parse_async(html) { diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index 6111ecd..678023b 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper'); class DuckduckgoScraper extends Scraper { + defaultStartUrl = 'https://duckduckgo.com/'; + parse(html) { debug('parse'); // load the page source into cheerio diff --git a/src/modules/google.js b/src/modules/google.js index ba241a7..1e47533 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -5,6 +5,8 @@ const Scraper = require('./se_scraper'); class GoogleScraper extends Scraper { + defaultStartUrl = 'https://www.google.com'; + constructor(...args) { super(...args); } diff --git a/src/modules/infospace.js b/src/modules/infospace.js index da7e900..6be7248 100644 --- a/src/modules/infospace.js +++ b/src/modules/infospace.js @@ -42,7 +42,7 @@ class InfospaceScraper extends Scraper { async load_start_page() { try { - this.last_response = await this.page.goto(this.this.startUrl); + this.last_response = await this.page.goto(this.startUrl); await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); } catch (e) { return false; diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index eb3b1e7..89c2b6d 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -387,15 +387,7 @@ module.exports = class Scraper { } get startUrl(){ - const startUrls = { - google: 'https://www.google.com', - duckduckgo:'https://duckduckgo.com/', - bing:this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/', - infospace: this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html', - yandex: 'https://yandex.com' - } - - return this.config.startUrl || startUrls[this.config.search_engine]; + return this.build_start_url(this.config.startUrl || this.defaultStartUrl); } /** diff --git a/src/modules/yandex.js b/src/modules/yandex.js index cba6543..132ca8b 100644 --- a/src/modules/yandex.js +++ b/src/modules/yandex.js @@ -4,6 +4,8 @@ const Scraper = require('./se_scraper'); class YandexScraper extends Scraper { + defaultStartUrl = 'https://yandex.com'; + constructor(...args) { super(...args); }