fix(duckduckgo): extract correct amount of results, handle pagination

This commit is contained in:
HugoPoi 2020-01-15 16:33:55 +01:00
parent 28332528ea
commit 4b33ef9b19

View File

@ -1,15 +1,18 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const Scraper = require('./se_scraper'); const Scraper = require('./se_scraper');
const debug = require('debug')('se-scraper:DuckduckgoScraper');
class DuckduckgoScraper extends Scraper { class DuckduckgoScraper extends Scraper {
parse(html) { parse(html) {
debug('parse');
// load the page source into cheerio // load the page source into cheerio
const $ = cheerio.load(html); const $ = cheerio.load(html);
// perform queries // perform queries
const results = []; const results = [];
$('#links .result__body').each((i, link) => { const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
$(organicSelector).each((i, link) => {
results.push({ results.push({
link: $(link).find('.result__title .result__a').attr('href'), link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(), title: $(link).find('.result__title .result__a').text(),
@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
debug('load_start_page');
let startUrl = 'https://duckduckgo.com/';
let startUrl = 'https://duckduckgo.com/?q=test';
try {
this.last_response = await this.page.goto(startUrl); this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
return true; return true;
} }
async search_keyword(keyword) { async search_keyword(keyword) {
debug('search_keyword');
const input = await this.page.$('input[name="q"]'); const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword); await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50); await this.sleep(50);
@ -63,21 +64,19 @@ class DuckduckgoScraper extends Scraper {
} }
async next_page() { async next_page() {
let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT}); debug('next_page');
let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
if (!next_page_link) { if (!next_page_link) {
return false; return false;
} }
await next_page_link.click(); await next_page_link.click();
try {
await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT }); await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
} catch(e) {
return false;
}
return true; return true;
} }
async wait_for_results() { async wait_for_results() {
debug('wait_for_results');
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT }); await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
} }