mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-03-13 12:38:16 +01:00
fix(*): start urls
This commit is contained in:
parent
1259c837e3
commit
2d833679f7
1
index.js
1
index.js
@ -2,7 +2,6 @@ const se_scraper = require('./src/node_scraper.js');
|
|||||||
var Scraper = require('./src/modules/se_scraper');
|
var Scraper = require('./src/modules/se_scraper');
|
||||||
|
|
||||||
async function scrape(browser_config, scrape_config) {
|
async function scrape(browser_config, scrape_config) {
|
||||||
|
|
||||||
// scrape config overwrites the browser_config
|
// scrape config overwrites the browser_config
|
||||||
Object.assign(browser_config, scrape_config);
|
Object.assign(browser_config, scrape_config);
|
||||||
|
|
||||||
|
@ -2,6 +2,8 @@ const cheerio = require('cheerio');
|
|||||||
const Scraper = require('./se_scraper');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
class BingScraper extends Scraper {
|
class BingScraper extends Scraper {
|
||||||
|
|
||||||
|
defaultStartUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
|
||||||
|
|
||||||
async parse_async(html) {
|
async parse_async(html) {
|
||||||
|
|
||||||
|
@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper');
|
|||||||
|
|
||||||
class DuckduckgoScraper extends Scraper {
|
class DuckduckgoScraper extends Scraper {
|
||||||
|
|
||||||
|
defaultStartUrl = 'https://duckduckgo.com/';
|
||||||
|
|
||||||
parse(html) {
|
parse(html) {
|
||||||
debug('parse');
|
debug('parse');
|
||||||
// load the page source into cheerio
|
// load the page source into cheerio
|
||||||
|
@ -5,6 +5,8 @@ const Scraper = require('./se_scraper');
|
|||||||
|
|
||||||
class GoogleScraper extends Scraper {
|
class GoogleScraper extends Scraper {
|
||||||
|
|
||||||
|
defaultStartUrl = 'https://www.google.com';
|
||||||
|
|
||||||
constructor(...args) {
|
constructor(...args) {
|
||||||
super(...args);
|
super(...args);
|
||||||
}
|
}
|
||||||
|
@ -42,7 +42,7 @@ class InfospaceScraper extends Scraper {
|
|||||||
|
|
||||||
async load_start_page() {
|
async load_start_page() {
|
||||||
try {
|
try {
|
||||||
this.last_response = await this.page.goto(this.this.startUrl);
|
this.last_response = await this.page.goto(this.startUrl);
|
||||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -387,15 +387,7 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
get startUrl(){
|
get startUrl(){
|
||||||
const startUrls = {
|
return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
|
||||||
google: 'https://www.google.com',
|
|
||||||
duckduckgo:'https://duckduckgo.com/',
|
|
||||||
bing:this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/',
|
|
||||||
infospace: this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html',
|
|
||||||
yandex: 'https://yandex.com'
|
|
||||||
}
|
|
||||||
|
|
||||||
return this.config.startUrl || startUrls[this.config.search_engine];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -4,6 +4,8 @@ const Scraper = require('./se_scraper');
|
|||||||
|
|
||||||
class YandexScraper extends Scraper {
|
class YandexScraper extends Scraper {
|
||||||
|
|
||||||
|
defaultStartUrl = 'https://yandex.com';
|
||||||
|
|
||||||
constructor(...args) {
|
constructor(...args) {
|
||||||
super(...args);
|
super(...args);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user