// se-scraper/test/html_output.js

'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

/**
 * Express app standing in for the real search engine during the tests.
 *
 * - `GET /search?q=<keyword>&start=<offset>` answers with the canned page
 *   `mocks/google/<keyword>_page<N>.html`, where N is `start / 10 + 1`
 *   (N is 1 when `start` is absent or not numeric).
 * - Every other path falls through to a static mount of the same mocks
 *   directory, with `.html` tried as a fallback extension.
 */
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    // `undefined / 10` is NaN, which `|| 0` turns into offset 0, i.e. page 1.
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
// Resolve the static root from this file (like the route above) rather than
// from the process working directory, so the mocks are found no matter where
// the test runner is launched from.
fakeSearchEngine.use(express.static(path.join(__dirname, 'mocks/google'), {extensions: ['html']}));

/**
 * Integration tests for scraper configuration options.
 *
 * The suite starts the fake search engine on an http and an https port and
 * puts a MITM proxy in front of them. The scraper is configured to use that
 * proxy, so its requests end up on the local fake engine.
 */
describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount the fake engine on both an http and an https server.
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            // Redirect every proxied request to the matching local server,
            // picking the https port when the client connection is SSL.
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        // The listen() methods are callback based; promisify them (bound to
        // their owner) so the hook only resolves once everything is listening.
        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        // `before` may have failed before every handle was created; guard each
        // close so teardown does not add a TypeError on top of the real failure.
        if (httpsServer) {
            httpsServer.close();
        }
        if (httpServer) {
            httpServer.close();
        }
        if (proxy) {
            proxy.close();
        }
    });

    describe('html_output', function(){

        // Only errors are printed, to keep the test output readable.
        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test html_output option: with `html_output: true` the result of
         * page 1 for the keyword is expected to carry the page html.
         */
        it('html_output single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test keyword'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                html_output: true,
                //clean_html_output: false,
                //clean_data_images: false,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
            });
            await scraper.start();

            let results;
            try {
                ({ results } = await scraper.scrape(scrape_job));
            } finally {
                // Always shut the scraper down, even when scraping fails, so a
                // failing test does not leave its resources running.
                await scraper.quit();
            }

            assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');

        });

    });

});