Mirror of https://github.com/NikolaiT/se-scraper.git, synced 2025-06-30 14:20:00 +02:00
feat: add keep_html_on_error option
@@ -272,6 +272,12 @@ module.exports = class Scraper {
                 await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
             }
 
+            if (this.config.keep_html_on_error) {
+                const html_error = await this.page.content();
+                e.html_on_error = html_error;
+                e.lastUrl = await this.page.evaluate(() => { return window.location.href; });
+            }
+
             this.metadata.scraping_detected = await this.detected();
 
             if (this.metadata.scraping_detected === true) {
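In practice, when a scrape throws and keep_html_on_error is enabled, the Scraper attaches the rendered page HTML (html_on_error) and the last visited URL (lastUrl) to the error before it propagates. A minimal consumer-side sketch of reading those fields (the output file name is illustrative; the API calls mirror the test added below):

    // Sketch only: assumes the se-scraper package API as exercised in the test below.
    const fs = require('fs');
    const se_scraper = require('se-scraper');

    (async () => {
        const scraper = new se_scraper.ScrapeManager({
            throw_on_detection: true,
            keep_html_on_error: true,
        });
        await scraper.start();
        try {
            await scraper.scrape({ search_engine: 'google', keywords: ['example query'] });
        } catch (err) {
            // html_on_error and lastUrl are only set when keep_html_on_error is enabled.
            if (err.html_on_error) {
                fs.writeFileSync('error_page.html', err.html_on_error);
                console.error('scrape failed on', err.lastUrl);
            }
        } finally {
            await scraper.quit();
        }
    })();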
@@ -139,6 +139,7 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
             standard_timeout: 10000,
             solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
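The new option defaults to false, so existing configurations keep their current behaviour and full page dumps stay out of error objects unless explicitly requested; callers opt in per ScrapeManager instance, e.g. new se_scraper.ScrapeManager({ keep_html_on_error: true }) as in the sketch above.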
test/keep_html_on_error.js (new file, 108 lines)
@@ -0,0 +1,108 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('..');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Mount our fake search engine on both an http and an https server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('keep_html_on_error', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test the keep_html_on_error option
+         */
+        it('keep_html_on_error single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test error'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                keep_html_on_error: true,
+                logger: testLogger,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                standard_timeout: 500,
+            });
+            await scraper.start();
+            await assert.rejects(
+                async () => {
+                    await scraper.scrape(scrape_job);
+                },
+                (error) => {
+                    assert(error.html_on_error, 'Error should contain the html output');
+                    return /#fbar/.test(error.message);
+                }
+            );
+            await scraper.quit();
+
+        });
+
+    });
+
+});
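A note on the fixture design: instead of contacting Google, the suite serves mock result pages from an express app over both HTTP and HTTPS, and routes se-scraper through a local http-mitm-proxy whose onRequest hook rewrites every upstream request to the mock servers. The mock page for the keyword 'test error' is deliberately empty, so the scrape fails quickly (the assertion on /#fbar/ suggests it is the Google module's wait for Google's #fbar footer element that times out, helped along by the short standard_timeout of 500 ms), exercising exactly the error path this commit instruments.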
test/mocks/google/test error_page1.html (new file, 1 line)
@@ -0,0 +1 @@
+THIS IS AN EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER
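The mock's file name follows the '<keyword>_page<n>.html' convention built by the /search route above (path.join on req.query.q plus '_page' plus the page number), so the keyword 'test error' on page 1 resolves to this file.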