feat(*): add custom url

This commit is contained in:
Benoist Bouteiller 2022-05-16 09:18:22 +02:00
parent 92817b3977
commit 1259c837e3
6 changed files with 20 additions and 33 deletions

View File

@ -2,6 +2,7 @@ const se_scraper = require('./src/node_scraper.js');
var Scraper = require('./src/modules/se_scraper');
async function scrape(browser_config, scrape_config) {
// scrape config overwrites the browser_config
Object.assign(browser_config, scrape_config);

View File

@ -46,11 +46,8 @@ class DuckduckgoScraper extends Scraper {
async load_start_page() {
debug('load_start_page');
let startUrl = 'https://duckduckgo.com/';
this.last_response = await this.page.goto(startUrl);
this.last_response = await this.page.goto(this.startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
return true;
}

View File

@ -213,26 +213,8 @@ class GoogleScraper extends Scraper {
}
async load_start_page() {
let startUrl = 'https://www.google.com';
if (this.config.google_settings) {
startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
if (this.config.google_settings.google_domain) {
startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
} else {
startUrl = `https://www.google.com/search?`;
}
for (var key in this.config.google_settings) {
if (key !== 'google_domain') {
startUrl += `${key}=${this.config.google_settings[key]}&`
}
}
}
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
this.logger.info('Using startUrl: ' + this.startUrl);
this.last_response = await this.page.goto(this.startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });

View File

@ -41,11 +41,8 @@ class InfospaceScraper extends Scraper {
}
async load_start_page() {
let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
try {
this.last_response = await this.page.goto(startUrl);
this.last_response = await this.page.goto(this.this.startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;

View File

@ -318,7 +318,6 @@ module.exports = class Scraper {
for (var key in settings) {
baseUrl += `${key}=${settings[key]}&`
}
this.logger.info('Using startUrl: ' + baseUrl);
return baseUrl;
@ -387,6 +386,18 @@ module.exports = class Scraper {
}
get startUrl(){
const startUrls = {
google: 'https://www.google.com',
duckduckgo:'https://duckduckgo.com/',
bing:this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/',
infospace: this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html',
yandex: 'https://yandex.com'
}
return this.config.startUrl || startUrls[this.config.search_engine];
}
/**
*
* @returns true if startpage was loaded correctly.

View File

@ -71,11 +71,10 @@ class YandexScraper extends Scraper {
}
async load_start_page() {
let startUrl = 'https://yandex.com';
this.logger.info('Using startUrl: ' + this.startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
this.last_response = await this.page.goto(this.startUrl);
await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });