2020-01-07 16:50:09 +01:00
|
|
|
'use strict';
|
|
|
|
const express = require('express');
|
2020-01-08 14:40:28 +01:00
|
|
|
const { createLogger, transports } = require('winston');
|
2020-01-07 16:50:09 +01:00
|
|
|
const http = require('http');
|
|
|
|
const https = require('https');
|
|
|
|
const assert = require('assert');
|
|
|
|
const keyCert = require('key-cert');
|
|
|
|
const Promise = require('bluebird');
|
2020-01-08 14:40:28 +01:00
|
|
|
const Proxy = require('http-mitm-proxy');
|
2020-01-07 16:50:09 +01:00
|
|
|
|
|
|
|
const debug = require('debug')('se-scraper:test');
|
|
|
|
const se_scraper = require('../');
|
|
|
|
const Scraper = require('../src/modules/se_scraper');
|
|
|
|
|
|
|
|
// Local ports used by the fixtures: plain HTTP engine, HTTPS engine, and the proxy.
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

/**
 * Route handler for the fake search engine: answers with the hostname Express
 * resolved for the incoming request, so a test can tell how the request arrived.
 *
 * @param {object} request  - Express request.
 * @param {object} response - Express response.
 */
function replyWithHostname(request, response) {
    const { hostname } = request;
    debug('fake-search-engine req.hostname=%s', hostname);
    // Uncomment to trace which local socket received the request:
    // debug('req to', request.socket.localAddress, request.socket.localPort);
    response.send(hostname);
}

const fakeSearchEngine = express();
// Trust forwarding headers coming from loopback, so an X-Forwarded-Host header
// set by a local proxy is what req.hostname reports.
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', replyWithHostname);
|
|
|
|
|
|
|
|
describe('Config', function(){
|
|
|
|
|
2020-01-08 14:40:28 +01:00
|
|
|
// Fixtures created in before() and released in after().
let httpServer;
let httpsServer;
let proxy;

/**
 * Starts the fake search engine on both an HTTP and an HTTPS server, plus a
 * man-in-the-middle proxy that redirects every request to those local servers.
 */
before(async function () {
    // Start `target` listening and resolve once its listen callback has fired.
    const listenOn = (target, listenArg) =>
        Promise.promisify(target.listen, { context: target })(listenArg);

    // The same Express app is mounted on the HTTP and the HTTPS server.
    httpServer = http.createServer(fakeSearchEngine);
    const tlsOptions = await keyCert();
    httpsServer = https.createServer(tlsOptions, fakeSearchEngine);

    proxy = Proxy();
    proxy.onRequest((ctx, done) => {
        // Whatever host the client asked for, forward to the local fake engine
        // and tag the request so the engine can report that it was proxied.
        const upstream = ctx.proxyToServerRequestOptions;
        upstream.host = 'localhost';
        upstream.port = ctx.isSSL ? httpsPort : httpPort;
        upstream.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
        debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
        return done();
    });

    await listenOn(proxy, { port: proxyPort });
    await listenOn(httpServer, httpPort);
    await listenOn(httpsServer, httpsPort);
    debug('Fake http search engine servers started');
});
|
|
|
|
|
|
|
|
/**
 * Releases the fixtures started in before(): HTTPS server, HTTP server, then the proxy.
 * The close() calls are issued without waiting for them to complete.
 */
after(function () {
    for (const fixture of [httpsServer, httpServer, proxy]) {
        fixture.close();
    }
});
|
|
|
|
|
|
|
|
describe('proxies', function(){
|
|
|
|
|
2020-01-08 14:40:28 +01:00
|
|
|
/**
 * Scraper stub used by the proxy tests: instead of querying a real search engine
 * it loads the fake engine's /test-proxy route and returns the page body, which
 * is the hostname the fake engine saw for that request.
 */
class MockScraperTestProxy extends Scraper {

    /** There is no start page to load; always reports success. */
    async load_start_page() {
        return true;
    }

    /** Opens the fake engine's /test-proxy route via the test.local host name. */
    async search_keyword() {
        const target = `http://test.local:${httpPort}/test-proxy`;
        await this.page.goto(target);
    }

    /** Resolves to the inner HTML of the current page's <body> element. */
    async parse_async() {
        const bodyElement = await this.page.$('body');
        const markup = await this.page.evaluate((node) => node.innerHTML, bodyElement);
        return markup;
    }
}
|
|
|
|
|
2020-01-08 14:40:28 +01:00
|
|
|
// Logger passed to the ScrapeManager in the tests below; its single console
// transport is set to the 'error' level.
const errorOnlyConsole = new transports.Console({ level: 'error' });
const testLogger = createLogger({ transports: [errorOnlyConsole] });
|
|
|
|
|
|
|
|
/**
 * With one proxy configured and use_proxies_only left at its default (false),
 * jobs are executed two at a time, alternating between a direct connection and
 * the proxy.
 *
 * NOTE: this test requires a `127.0.0.1 test.local` entry in /etc/hosts, because
 * Chrome bypasses the proxy for localhost even when a proxy is set.
 */
it('one proxy given, use_proxies_only=false', async function () {
    const scrape_job = {
        search_engine: MockScraperTestProxy,
        keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
    };

    const scraper = new se_scraper.ScrapeManager({
        throw_on_detection: true,
        proxies: ['http://localhost:' + proxyPort],
        // default is use_proxies_only: false,
        logger: testLogger,
    });
    await scraper.start();

    try {
        const { results } = await scraper.scrape(scrape_job);
        // Direct requests report the real host; proxied ones report the
        // X-Forwarded-Host value injected by the test proxy.
        assert.strictEqual(results['news']['1'], 'test.local');
        assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['i work too much']['1'], 'test.local');
        assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['javascript is hard']['1'], 'test.local');
    } finally {
        // Always shut the scraper down, even when scraping or an assertion
        // fails, so a failed run does not leave it running.
        await scraper.quit();
    }
});
|
|
|
|
|
2020-01-07 16:50:09 +01:00
|
|
|
/**
 * With one proxy configured and use_proxies_only=true, jobs are executed one at
 * a time and every request goes through the proxy.
 */
it('one proxy given, use_proxies_only=true', async function () {
    const scrape_job = {
        search_engine: MockScraperTestProxy,
        keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
    };

    const scraper = new se_scraper.ScrapeManager({
        throw_on_detection: true,
        proxies: ['http://localhost:' + proxyPort],
        use_proxies_only: true,
        logger: testLogger,
    });
    await scraper.start();

    try {
        const { results } = await scraper.scrape(scrape_job);
        // Every keyword must report the X-Forwarded-Host value injected by the
        // test proxy, i.e. none of them used a direct connection.
        assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
        assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
    } finally {
        // Always shut the scraper down, even when scraping or an assertion
        // fails, so a failed run does not leave it running.
        await scraper.quit();
    }
});
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
});
|