fix(*): start urls

Benoist Bouteiller 2022-05-16 16:36:47 +02:00
parent 1259c837e3
commit 2d833679f7
7 changed files with 10 additions and 11 deletions

@@ -2,7 +2,6 @@ const se_scraper = require('./src/node_scraper.js');
var Scraper = require('./src/modules/se_scraper');
async function scrape(browser_config, scrape_config) {
    // scrape config overwrites the browser_config
    Object.assign(browser_config, scrape_config);

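Note: a rough usage sketch of the scrape() entry point shown above, illustrating how a caller could override the start URL now that the base Scraper resolves config.startUrl first. Every config field except startUrl (search_engine, keywords, num_pages, debug_level) is an assumption about se-scraper's usual config shape, not part of this diff.

const se_scraper = require('se-scraper');

(async () => {
    const browser_config = {
        debug_level: 1,                              // assumed option name, not from this diff
    };

    const scrape_config = {
        search_engine: 'bing',                       // assumed field names
        keywords: ['node.js scraping'],
        num_pages: 1,
        startUrl: 'https://www.bing.com/search?',    // optional override picked up by the startUrl getter changed in this commit
    };

    // scrape_config overwrites browser_config via Object.assign, as in the hunk above
    const results = await se_scraper.scrape(browser_config, scrape_config);
    console.log(results);
})();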
@@ -2,6 +2,8 @@ const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
class BingScraper extends Scraper {
    defaultStartUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
    async parse_async(html) {

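Note: the `|| 'https://www.bing.com/'` fallback above implies build_start_url can return a falsy value. A minimal sketch of that assumed contract (the real implementation lives in se_scraper.js and may differ):

class ExampleScraper {
    constructor(config = {}) {
        this.config = config;
    }

    // Assumed behaviour: append configured query parameters to the base search
    // URL, or return false when nothing is configured so `||` picks the homepage.
    build_start_url(base) {
        const params = this.config.search_params;           // hypothetical config key
        if (!params || Object.keys(params).length === 0) {
            return false;
        }
        return base + new URLSearchParams(params).toString();
    }
}

// new ExampleScraper().build_start_url('https://www.bing.com/search?')  -> false
// new ExampleScraper({ search_params: { mkt: 'en-US' } })
//     .build_start_url('https://www.bing.com/search?')                  -> 'https://www.bing.com/search?mkt=en-US'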
@@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper');
class DuckduckgoScraper extends Scraper {
    defaultStartUrl = 'https://duckduckgo.com/';
    parse(html) {
        debug('parse');
        // load the page source into cheerio

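Note: parse() above feeds the page source to cheerio. A small sketch of that pattern; the selectors are placeholders, not the ones DuckduckgoScraper actually uses:

const cheerio = require('cheerio');

function parse_results(html) {
    const $ = cheerio.load(html);             // load the page source into cheerio
    const results = [];
    $('.result').each((i, el) => {            // '.result' is a placeholder selector
        results.push({
            title: $(el).find('.result__title').text().trim(),
            link: $(el).find('a').attr('href'),
        });
    });
    return results;
}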
@@ -5,6 +5,8 @@ const Scraper = require('./se_scraper');
class GoogleScraper extends Scraper {
    defaultStartUrl = 'https://www.google.com';
    constructor(...args) {
        super(...args);
    }

@@ -42,7 +42,7 @@ class InfospaceScraper extends Scraper {
    async load_start_page() {
        try {
            this.last_response = await this.page.goto(this.this.startUrl);
            this.last_response = await this.page.goto(this.startUrl);
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;

@@ -387,15 +387,7 @@ module.exports = class Scraper {
    }
    get startUrl(){
        const startUrls = {
            google: 'https://www.google.com',
            duckduckgo:'https://duckduckgo.com/',
            bing:this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/',
            infospace: this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html',
            yandex: 'https://yandex.com'
        }
        return this.config.startUrl || startUrls[this.config.search_engine];
        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
    }
    /**

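Note: this is the core of the commit: instead of a per-engine lookup table in the base class, each scraper declares its own defaultStartUrl and the getter resolves config.startUrl first. A simplified sketch of that resolution order (build_start_url is stubbed here; the real one is not shown in this diff):

class Scraper {
    constructor(config = {}) {
        this.config = config;
    }

    build_start_url(url) {
        return url;                           // stub; the real version may append query parameters
    }

    get startUrl() {
        // an explicit config.startUrl wins, otherwise the engine's own default
        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
    }
}

class YandexLikeScraper extends Scraper {
    defaultStartUrl = 'https://yandex.com';
}

// new YandexLikeScraper().startUrl                                   -> 'https://yandex.com'
// new YandexLikeScraper({ startUrl: 'https://yandex.ru' }).startUrl  -> 'https://yandex.ru'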
@@ -4,6 +4,8 @@ const Scraper = require('./se_scraper');
class YandexScraper extends Scraper {
    defaultStartUrl = 'https://yandex.com';
    constructor(...args) {
        super(...args);
    }