From 70753021c4abc522c50a76492086f9ca524ee7bd Mon Sep 17 00:00:00 2001
From: HugoPoi
Date: Wed, 1 Apr 2020 15:26:22 +0200
Subject: [PATCH] feat: add keep_html_on_error option

---
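Usage note (below the tear line, so git am still applies the diff): a
minimal sketch of how a caller might consume the new option. The
keep_html_on_error and throw_on_detection config keys, the scrape job
shape, and the error.html_on_error / error.lastUrl properties come from
this patch; the output file name and fs handling are illustrative
assumptions only.

    'use strict';
    const fs = require('fs');
    const se_scraper = require('se-scraper');

    (async () => {
        const scraper = new se_scraper.ScrapeManager({
            keep_html_on_error: true, // new option added by this patch, defaults to false
            throw_on_detection: true,
        });
        await scraper.start();
        try {
            await scraper.scrape({
                search_engine: 'google',
                keywords: ['test error'],
            });
        } catch (err) {
            // With keep_html_on_error enabled, the rejected error carries
            // the raw page HTML and the last visited URL for debugging.
            console.error('Scrape failed at', err.lastUrl);
            if (err.html_on_error) {
                fs.writeFileSync('error_page.html', err.html_on_error); // assumed file name
            }
        } finally {
            await scraper.quit();
        }
    })();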
 src/modules/se_scraper.js               |   6 ++
 src/node_scraper.js                     |   3 +-
 test/keep_html_on_error.js              | 108 ++++++++++++++++++++++++
 test/mocks/google/test error_page1.html |   1 +
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 test/keep_html_on_error.js
 create mode 100644 test/mocks/google/test error_page1.html

diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js
index 6d270c8..17ff117 100644
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@@ -272,6 +272,12 @@ module.exports = class Scraper {
             await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
         }
 
+        if (this.config.keep_html_on_error){
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
+        }
+
         this.metadata.scraping_detected = await this.detected();
 
         if (this.metadata.scraping_detected === true) {
diff --git a/src/node_scraper.js b/src/node_scraper.js
index fa2fe01..b71fe61 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -139,8 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
             standard_timeout: 10000,
-            solve_captcha_time: 45000,
+            solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
diff --git a/test/keep_html_on_error.js b/test/keep_html_on_error.js
new file mode 100644
index 0000000..e731a41
--- /dev/null
+++ b/test/keep_html_on_error.js
@@ -0,0 +1,108 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('..');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start / 10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Mount our fake search engine on both an http and an https server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('keep_html_on_error', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test the keep_html_on_error option
+         */
+        it('keep_html_on_error single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test error'],
+            };
+
+            const scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                keep_html_on_error: true,
+                logger: testLogger,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                standard_timeout: 500,
+            });
+            await scraper.start();
+            await assert.rejects(
+                async () => {
+                    await scraper.scrape(scrape_job);
+                },
+                (error) => {
+                    assert(error.html_on_error, 'Error should contain the html output');
+                    return /#fbar/.test(error.message);
+                }
+            );
+            await scraper.quit();
+
+        });
+
+    });
+
+});
\ No newline at end of file
diff --git a/test/mocks/google/test error_page1.html b/test/mocks/google/test error_page1.html
new file mode 100644
index 0000000..19e35b5
--- /dev/null
+++ b/test/mocks/google/test error_page1.html
@@ -0,0 +1 @@
+THIS IS AN EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER
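-- 
Test note: assuming the project's existing mocha-based setup (implied by
the describe/it structure above), the new test file could be run on its
own with something like:

    npx mocha test/keep_html_on_error.js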