test: Bing tests working, refactor proxy for tests

This commit is contained in:
HugoPoi 2020-01-08 14:40:28 +01:00
parent 1c1db88545
commit cac6b87e92
10 changed files with 347 additions and 114 deletions

2
.gitignore vendored
View File

@@ -79,3 +79,5 @@ typings/
.idea/ .idea/
GoogleScraperPup.iml GoogleScraperPup.iml
.http-mitm-proxy

62
package-lock.json generated
View File

@@ -1007,6 +1007,34 @@
"toidentifier": "1.0.0" "toidentifier": "1.0.0"
} }
}, },
"http-mitm-proxy": {
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz",
"integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==",
"dev": true,
"requires": {
"async": "^2.6.2",
"debug": "^4.1.0",
"mkdirp": "^0.5.1",
"node-forge": "^0.8.4",
"optimist": "^0.6.1",
"semaphore": "^1.1.0",
"ws": "^3.2.0"
},
"dependencies": {
"ws": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz",
"integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==",
"dev": true,
"requires": {
"async-limiter": "~1.0.0",
"safe-buffer": "~5.1.0",
"ultron": "~1.1.0"
}
}
}
},
"https-proxy-agent": { "https-proxy-agent": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
@@ -1475,6 +1503,12 @@
"semver": "^5.7.0" "semver": "^5.7.0"
} }
}, },
"node-forge": {
"version": "0.8.5",
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
"integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==",
"dev": true
},
"normalize-url": { "normalize-url": {
"version": "3.3.0", "version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz", "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
@@ -1553,6 +1587,16 @@
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz", "resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4=" "integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
}, },
"optimist": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
"dev": true,
"requires": {
"minimist": "~0.0.1",
"wordwrap": "~0.0.2"
}
},
"os-locale": { "os-locale": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
@@ -1897,6 +1941,12 @@
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true "dev": true
}, },
"semaphore": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz",
"integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==",
"dev": true
},
"semver": { "semver": {
"version": "5.7.0", "version": "5.7.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz", "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
@@ -2147,6 +2197,12 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
}, },
"ultron": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
"integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==",
"dev": true
},
"underscore": { "underscore": {
"version": "1.9.1", "version": "1.9.1",
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz", "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
@@ -2273,6 +2329,12 @@
} }
} }
}, },
"wordwrap": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
"dev": true
},
"wrap-ansi": { "wrap-ansi": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",

View File

@@ -37,6 +37,7 @@
"chai": "^4.2.0", "chai": "^4.2.0",
"chai-string": "^1.5.0", "chai-string": "^1.5.0",
"express": "^4.17.1", "express": "^4.17.1",
"http-mitm-proxy": "^0.8.2",
"key-cert": "^1.0.1", "key-cert": "^1.0.1",
"mocha": "^6.1.4" "mocha": "^6.1.4"
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,68 +1,57 @@
'use strict'; 'use strict';
const express = require('express'); const express = require('express');
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger const { createLogger, transports } = require('winston');
const logger = require('winston');
const net = require('net');
const http = require('http'); const http = require('http');
const https = require('https'); const https = require('https');
const url = require('url');
const assert = require('assert'); const assert = require('assert');
const path = require('path'); const path = require('path');
const keyCert = require('key-cert'); const keyCert = require('key-cert');
const Promise = require('bluebird'); const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test'); const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing'); const { BingScraper } = require('../../src/modules/bing');
const httpPort = 3012; const httpPort = 3012;
const httpsPort = httpPort + 1; const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express(); const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => { fakeSearchEngine.get('/search', (req, res, next) => {
debug('q=%s', req.query.q); debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1; const pageNumber = Math.round((req.query.first || 0) /10) + 1;
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html')); res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
}); });
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']})); fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
describe('Module Bing', function(){ describe('Module Bing', function(){
let httpServerAndProxy, httpsServer; let httpServer, httpsServer, proxy;
before(async function(){ before(async function(){
// Here mount our fake engine in both http and https listen server // Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine); httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/** proxy = Proxy();
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy proxy.onRequest((ctx, callback) => {
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine ctx.proxyToServerRequestOptions.host = 'localhost';
*/ ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
httpServerAndProxy.on('connect', (req, clientSocket, head) => { ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
const parsedUrl = url.parse('http://' + req.url); debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; return callback();
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
}); });
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started'); debug('Fake http search engine servers started');
}); });
after(function(){ after(function(){
proxy.close();
httpsServer.close(); httpsServer.close();
httpServerAndProxy.close(); httpServer.close();
}); });
let browser; let browser;
@@ -71,8 +60,9 @@ describe('Module Bing', function(){
debug('Start a new browser'); debug('Start a new browser');
browser = await puppeteer.launch({ browser = await puppeteer.launch({
//dumpio: true, //dumpio: true,
//headless: false,
ignoreHTTPSErrors: true, ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + httpPort ] args: [ '--proxy-server=http://localhost:' + proxyPort ]
}); });
debug('Open a fresh page'); debug('Open a fresh page');
page = await browser.newPage(); page = await browser.newPage();
@@ -82,20 +72,28 @@ describe('Module Bing', function(){
await browser.close(); await browser.close();
}); });
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){ it('one keyword one page', function(){
const bingScraper = new BingScraper({ const bingScraper = new BingScraper({
config: { config: {
search_engine_name: 'bing', search_engine_name: 'bing',
throw_on_detection: true, throw_on_detection: true,
keywords: ['test keyword'], keywords: ['test keyword'],
logger, logger: testLogger,
scrape_from_file: '', scrape_from_file: '',
} }
}); });
bingScraper.STANDARD_TIMEOUT = 500; bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => { return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request'); assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
}); });
}); });
@@ -105,7 +103,7 @@ describe('Module Bing', function(){
search_engine_name: 'bing', search_engine_name: 'bing',
throw_on_detection: true, throw_on_detection: true,
keywords: ['test keyword'], keywords: ['test keyword'],
logger, logger: testLogger,
scrape_from_file: '', scrape_from_file: '',
num_pages: 3, num_pages: 3,
} }
@@ -113,12 +111,12 @@ describe('Module Bing', function(){
bingScraper.STANDARD_TIMEOUT = 500; bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => { return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must three requests'); assert.strictEqual(num_requests, 3, 'Must three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1'); assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
}); });
}); });

View File

@@ -1,25 +1,24 @@
'use strict'; 'use strict';
const express = require('express'); const express = require('express');
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger const { createLogger, transports } = require('winston');
const logger = require('winston');
const net = require('net');
const http = require('http'); const http = require('http');
const https = require('https'); const https = require('https');
const url = require('url');
const assert = require('assert'); const assert = require('assert');
const path = require('path'); const path = require('path');
const keyCert = require('key-cert'); const keyCert = require('key-cert');
const Promise = require('bluebird'); const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test'); const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google'); const { GoogleScraper } = require('../../src/modules/google');
const httpPort = 3012; const httpPort = 3012;
const httpsPort = httpPort + 1; const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express(); const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => { fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q); debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1; const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
@@ -28,41 +27,31 @@ fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})
describe('Module Google', function(){ describe('Module Google', function(){
let httpServerAndProxy, httpsServer; let httpServer, httpsServer, proxy;
before(async function(){ before(async function(){
// Here mount our fake engine in both http and https listen server // Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine); httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/** proxy = Proxy();
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy proxy.onRequest((ctx, callback) => {
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine ctx.proxyToServerRequestOptions.host = 'localhost';
*/ ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
httpServerAndProxy.on('connect', (req, clientSocket, head) => { ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
const parsedUrl = url.parse('http://' + req.url); debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; return callback();
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
}); });
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started'); debug('Fake http search engine servers started');
}); });
after(function(){ after(function(){
proxy.close();
httpsServer.close(); httpsServer.close();
httpServerAndProxy.close(); httpServer.close();
}); });
let browser; let browser;
@@ -71,8 +60,9 @@ describe('Module Google', function(){
debug('Start a new browser'); debug('Start a new browser');
browser = await puppeteer.launch({ browser = await puppeteer.launch({
//dumpio: true, //dumpio: true,
//headless: false,
ignoreHTTPSErrors: true, ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + httpPort ] args: [ '--proxy-server=http://localhost:' + proxyPort ]
}); });
debug('Open a fresh page'); debug('Open a fresh page');
page = await browser.newPage(); page = await browser.newPage();
@@ -82,13 +72,21 @@ describe('Module Google', function(){
await browser.close(); await browser.close();
}); });
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){ it('one keyword one page', function(){
const googleScraper = new GoogleScraper({ const googleScraper = new GoogleScraper({
config: { config: {
search_engine_name: 'google', search_engine_name: 'google',
throw_on_detection: true, throw_on_detection: true,
keywords: ['test keyword'], keywords: ['test keyword'],
logger, logger: testLogger,
scrape_from_file: '', scrape_from_file: '',
} }
}); });
@@ -105,7 +103,7 @@ describe('Module Google', function(){
search_engine_name: 'google', search_engine_name: 'google',
throw_on_detection: true, throw_on_detection: true,
keywords: ['test keyword'], keywords: ['test keyword'],
logger, logger: testLogger,
scrape_from_file: '', scrape_from_file: '',
num_pages: 3, num_pages: 3,
} }

View File

@@ -1,16 +1,12 @@
'use strict'; 'use strict';
const express = require('express'); const express = require('express');
const puppeteer = require('puppeteer'); const { createLogger, transports } = require('winston');
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const http = require('http'); const http = require('http');
const https = require('https'); const https = require('https');
const url = require('url');
const assert = require('assert'); const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert'); const keyCert = require('key-cert');
const Promise = require('bluebird'); const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test'); const debug = require('debug')('se-scraper:test');
const se_scraper = require('../'); const se_scraper = require('../');
@@ -18,64 +14,55 @@ const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012; const httpPort = 3012;
const httpsPort = httpPort + 1; const httpsPort = httpPort + 1;
const httpOtherPort = httpPort + 2; const proxyPort = httpPort + 2;
const fakeSearchEngine = express(); const fakeSearchEngine = express();
fakeSearchEngine.get('/test', (req, res) => { fakeSearchEngine.set('trust proxy', 'loopback');
debug(req.ip, req.ips, req.protocol, req.hostname); fakeSearchEngine.get('/test-proxy', (req, res) => {
debug(req.socket.localAddress, req.socket.localPort); debug('fake-search-engine req.hostname=%s', req.hostname);
res.send('OK'); //debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
}); });
describe('Config', function(){ describe('Config', function(){
let httpServerAndProxy, httpsServer, httpOtherServer; let httpServer, httpsServer, proxy;
before(async function(){ before(async function(){
// Here mount our fake engine in both http and https listen server // Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine); httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/** proxy = Proxy();
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy proxy.onRequest((ctx, callback) => {
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine ctx.proxyToServerRequestOptions.host = 'localhost';
*/ ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
httpServerAndProxy.on('connect', (req, clientSocket, head) => { ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
const parsedUrl = url.parse('http://' + req.url); debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; return callback();
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
}); });
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started'); debug('Fake http search engine servers started');
}); });
after(function(){ after(function(){
httpsServer.close(); httpsServer.close();
httpServerAndProxy.close(); httpServer.close();
proxy.close();
}); });
describe('proxies', function(){ describe('proxies', function(){
class MockScraper extends Scraper { class MockScraperTestProxy extends Scraper {
async load_start_page(){ async load_start_page(){
return true; return true;
} }
async search_keyword(){ async search_keyword(){
await this.page.goto('http://void:' + httpPort + '/test'); await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
} }
async parse_async(){ async parse_async(){
@@ -84,29 +71,67 @@ describe('Config', function(){
} }
} }
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/** /**
* Jobs will be executed 1 by 1 through the proxy * Jobs will be executed 2 by 2 through the proxy and direct connection
* THIS TEST NEED TO HAVE test.local 127.0.0.1 in /etc/hosts because chrome bypass localhost even with proxy set
*/ */
it('one proxy given', async function () { it('one proxy given, use_proxies_only=false', async function () {
const scrape_job = { const scrape_job = {
search_engine: MockScraper, search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
}; };
var scraper = new se_scraper.ScrapeManager({ var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true, throw_on_detection: true,
proxies: ['http://localhost:' + httpPort], proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true, // default is use_proxies_only: false,
logger: testLogger,
}); });
await scraper.start(); await scraper.start();
const { results } = await scraper.scrape(scrape_job); const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'OK'); assert.strictEqual(results['news']['1'], 'test.local');
assert.strictEqual(results['some stuff']['1'], 'OK'); assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'OK'); assert.strictEqual(results['i work too much']['1'], 'test.local');
assert.strictEqual(results['what to do?']['1'], 'OK'); assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'OK'); assert.strictEqual(results['javascript is hard']['1'], 'test.local');
await scraper.quit();
});
/**
* Jobs will be executed 1 by 1 through the proxy
*/
it('one proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
await scraper.quit(); await scraper.quit();
}); });