From a3ebe357a45a41c962e845d29727596d9978b0ff Mon Sep 17 00:00:00 2001 From: Thomas Date: Thu, 18 Apr 2019 15:23:01 +0200 Subject: [PATCH] Add html_output functionality Pagination support for html output Change return value to keep it compliant with the current version of se-scraper --- src/modules/se_scraper.js | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 59fd1f0..dc3cf82 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -33,6 +33,7 @@ module.exports = class Scraper { this.PROXY_TIMEOUT = 15000; this.SOLVE_CAPTCHA_TIME = 45000; + this.html_output = {}; this.results = {}; this.result_rank = 1; // keep track of the requests done @@ -69,7 +70,15 @@ module.exports = class Scraper { await this.scraping_loop(); - return this.results; + let response = this.results; + if (this.config.html_output) { + response = { + 'results': this.results, + 'html_output': this.html_output + }; + } + + return response; } /** @@ -155,6 +164,7 @@ module.exports = class Scraper { this.num_keywords++; this.keyword = keyword; this.results[keyword] = {}; + this.html_output[keyword] = {}; this.result_rank = 1; if (this.pluggable && this.pluggable.before_keyword_scraped) { @@ -189,6 +199,7 @@ module.exports = class Scraper { } let html = await this.page.content(); + this.html_output[keyword][page_num] = html; let parsed = this.parse(html); this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html); @@ -484,4 +495,4 @@ async function evadeChromeHeadlessDetection(page) { } catch (e) { console.error(e); } -} \ No newline at end of file +}