From 70753021c4abc522c50a76492086f9ca524ee7bd Mon Sep 17 00:00:00 2001
From: HugoPoi
Date: Wed, 1 Apr 2020 15:26:22 +0200
Subject: [PATCH] feat: add keep_html_on_error option

---
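Usage note (below the tear line, so git am still applies the diff): a
minimal sketch of how a caller might consume the new option. The
keep_html_on_error and throw_on_detection config keys, the scrape job
shape, and the error.html_on_error / error.lastUrl properties come from
this patch; the output file name and fs handling are illustrative
assumptions only.

    'use strict';
    const fs = require('fs');
    const se_scraper = require('se-scraper');

    (async () => {
        const scraper = new se_scraper.ScrapeManager({
            keep_html_on_error: true, // new option added by this patch, defaults to false
            throw_on_detection: true,
        });
        await scraper.start();
        try {
            await scraper.scrape({
                search_engine: 'google',
                keywords: ['test error'],
            });
        } catch (err) {
            // With keep_html_on_error enabled, the rejected error carries
            // the raw page HTML and the last visited URL for debugging.
            console.error('Scrape failed at', err.lastUrl);
            if (err.html_on_error) {
                fs.writeFileSync('error_page.html', err.html_on_error); // assumed file name
            }
        } finally {
            await scraper.quit();
        }
    })();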
 src/modules/se_scraper.js               |   6 ++
 src/node_scraper.js                     |   3 +-
 test/keep_html_on_error.js              | 108 ++++++++++++++++++++++++
 test/mocks/google/test error_page1.html |   1 +
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 test/keep_html_on_error.js
 create mode 100644 test/mocks/google/test error_page1.html

diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js
index 6d270c8..17ff117 100644
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@@ -272,6 +272,12 @@ module.exports = class Scraper {
             await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
         }
 
+        if (this.config.keep_html_on_error){
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
+        }
+
         this.metadata.scraping_detected = await this.detected();
 
         if (this.metadata.scraping_detected === true) {
diff --git a/src/node_scraper.js b/src/node_scraper.js
index fa2fe01..b71fe61 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -139,8 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
             standard_timeout: 10000,
-            solve_captcha_time: 45000,
+            solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
diff --git a/test/keep_html_on_error.js b/test/keep_html_on_error.js
new file mode 100644
index 0000000..e731a41
--- /dev/null
+++ b/test/keep_html_on_error.js
@@ -0,0 +1,108 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('..');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start / 10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Mount our fake search engine on both an http and an https server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('keep_html_on_error', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test the keep_html_on_error option
+         */
+        it('keep_html_on_error single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test error'],
+            };
+
+            const scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                keep_html_on_error: true,
+                logger: testLogger,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                standard_timeout: 500,
+            });
+            await scraper.start();
+            await assert.rejects(
+                async () => {
+                    await scraper.scrape(scrape_job);
+                },
+                (error) => {
+                    assert(error.html_on_error, 'Error should contain the html output');
+                    return /#fbar/.test(error.message);
+                }
+            );
+            await scraper.quit();
+
+        });
+
+    });
+
+});
\ No newline at end of file
diff --git a/test/mocks/google/test error_page1.html b/test/mocks/google/test error_page1.html
new file mode 100644
index 0000000..19e35b5
--- /dev/null
+++ b/test/mocks/google/test error_page1.html
@@ -0,0 +1 @@
+THIS IS AN EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER
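-- 
Test note: assuming the project's existing mocha-based setup (implied by
the describe/it structure above), the new test file could be run on its
own with something like:

    npx mocha test/keep_html_on_error.js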