forked from extern/se-scraper
fix(duckduckgo): extract correct amount of results, handle pagination
This commit is contained in:
parent
28332528ea
commit
4b33ef9b19
@@ -1,15 +1,18 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const Scraper = require('./se_scraper');
|
const Scraper = require('./se_scraper');
|
||||||
|
const debug = require('debug')('se-scraper:DuckduckgoScraper');
|
||||||
|
|
||||||
class DuckduckgoScraper extends Scraper {
|
class DuckduckgoScraper extends Scraper {
|
||||||
|
|
||||||
parse(html) {
|
parse(html) {
|
||||||
|
debug('parse');
|
||||||
// load the page source into cheerio
|
// load the page source into cheerio
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
// perform queries
|
// perform queries
|
||||||
const results = [];
|
const results = [];
|
||||||
$('#links .result__body').each((i, link) => {
|
const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
|
||||||
|
$(organicSelector).each((i, link) => {
|
||||||
results.push({
|
results.push({
|
||||||
link: $(link).find('.result__title .result__a').attr('href'),
|
link: $(link).find('.result__title .result__a').attr('href'),
|
||||||
title: $(link).find('.result__title .result__a').text(),
|
title: $(link).find('.result__title .result__a').text(),
|
||||||
@@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async load_start_page() {
|
async load_start_page() {
|
||||||
|
debug('load_start_page');
|
||||||
|
let startUrl = 'https://duckduckgo.com/';
|
||||||
|
|
||||||
let startUrl = 'https://duckduckgo.com/?q=test';
|
|
||||||
|
|
||||||
try {
|
|
||||||
this.last_response = await this.page.goto(startUrl);
|
this.last_response = await this.page.goto(startUrl);
|
||||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||||
} catch (e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
async search_keyword(keyword) {
|
async search_keyword(keyword) {
|
||||||
|
debug('search_keyword');
|
||||||
const input = await this.page.$('input[name="q"]');
|
const input = await this.page.$('input[name="q"]');
|
||||||
await this.set_input_value(`input[name="q"]`, keyword);
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
await this.sleep(50);
|
await this.sleep(50);
|
||||||
@@ -63,21 +64,19 @@ class DuckduckgoScraper extends Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async next_page() {
|
async next_page() {
|
||||||
let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT});
|
debug('next_page');
|
||||||
|
let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
|
||||||
if (!next_page_link) {
|
if (!next_page_link) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
await next_page_link.click();
|
await next_page_link.click();
|
||||||
try {
|
await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
|
||||||
await this.page.waitForNavigation({timeout: this.STANDARD_TIMEOUT});
|
|
||||||
} catch(e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
async wait_for_results() {
|
async wait_for_results() {
|
||||||
|
debug('wait_for_results');
|
||||||
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
|
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user