mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-03-13 12:38:16 +01:00
feat(*): add custom url
This commit is contained in:
parent
92817b3977
commit
1259c837e3
1
index.js
1
index.js
@ -2,6 +2,7 @@ const se_scraper = require('./src/node_scraper.js');
|
||||
var Scraper = require('./src/modules/se_scraper');
|
||||
|
||||
async function scrape(browser_config, scrape_config) {
|
||||
|
||||
// scrape config overwrites the browser_config
|
||||
Object.assign(browser_config, scrape_config);
|
||||
|
||||
|
@ -46,11 +46,8 @@ class DuckduckgoScraper extends Scraper {
|
||||
|
||||
async load_start_page() {
|
||||
debug('load_start_page');
|
||||
let startUrl = 'https://duckduckgo.com/';
|
||||
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
this.last_response = await this.page.goto(this.startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -213,26 +213,8 @@ class GoogleScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = 'https://www.google.com';
|
||||
|
||||
if (this.config.google_settings) {
|
||||
startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
|
||||
if (this.config.google_settings.google_domain) {
|
||||
startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
|
||||
} else {
|
||||
startUrl = `https://www.google.com/search?`;
|
||||
}
|
||||
|
||||
for (var key in this.config.google_settings) {
|
||||
if (key !== 'google_domain') {
|
||||
startUrl += `${key}=${this.config.google_settings[key]}&`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info('Using startUrl: ' + startUrl);
|
||||
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
this.logger.info('Using startUrl: ' + this.startUrl);
|
||||
this.last_response = await this.page.goto(this.startUrl);
|
||||
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
|
@ -41,11 +41,8 @@ class InfospaceScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
|
||||
let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
|
||||
|
||||
try {
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
this.last_response = await this.page.goto(this.startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
|
@ -318,7 +318,6 @@ module.exports = class Scraper {
|
||||
for (var key in settings) {
|
||||
baseUrl += `${key}=${settings[key]}&`
|
||||
}
|
||||
|
||||
this.logger.info('Using startUrl: ' + baseUrl);
|
||||
|
||||
return baseUrl;
|
||||
@ -387,6 +386,18 @@ module.exports = class Scraper {
|
||||
|
||||
}
|
||||
|
||||
get startUrl(){
|
||||
const startUrls = {
|
||||
google: 'https://www.google.com',
|
||||
duckduckgo:'https://duckduckgo.com/',
|
||||
bing:this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/',
|
||||
infospace: this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html',
|
||||
yandex: 'https://yandex.com'
|
||||
}
|
||||
|
||||
return this.config.startUrl || startUrls[this.config.search_engine];
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns true if startpage was loaded correctly.
|
||||
|
@ -71,11 +71,10 @@ class YandexScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = 'https://yandex.com';
|
||||
|
||||
this.logger.info('Using startUrl: ' + this.startUrl);
|
||||
|
||||
this.logger.info('Using startUrl: ' + startUrl);
|
||||
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
this.last_response = await this.page.goto(this.startUrl);
|
||||
|
||||
await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user