forked from extern/se-scraper
test: Bing tests working, refactor proxy for tests
This commit is contained in:
parent
1c1db88545
commit
cac6b87e92
2
.gitignore
vendored
2
.gitignore
vendored
@ -79,3 +79,5 @@ typings/
|
|||||||
|
|
||||||
.idea/
|
.idea/
|
||||||
GoogleScraperPup.iml
|
GoogleScraperPup.iml
|
||||||
|
|
||||||
|
.http-mitm-proxy
|
||||||
|
62
package-lock.json
generated
62
package-lock.json
generated
@ -1007,6 +1007,34 @@
|
|||||||
"toidentifier": "1.0.0"
|
"toidentifier": "1.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"http-mitm-proxy": {
|
||||||
|
"version": "0.8.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz",
|
||||||
|
"integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==",
|
||||||
|
"dev": true,
|
||||||
|
"requires": {
|
||||||
|
"async": "^2.6.2",
|
||||||
|
"debug": "^4.1.0",
|
||||||
|
"mkdirp": "^0.5.1",
|
||||||
|
"node-forge": "^0.8.4",
|
||||||
|
"optimist": "^0.6.1",
|
||||||
|
"semaphore": "^1.1.0",
|
||||||
|
"ws": "^3.2.0"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"ws": {
|
||||||
|
"version": "3.3.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz",
|
||||||
|
"integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==",
|
||||||
|
"dev": true,
|
||||||
|
"requires": {
|
||||||
|
"async-limiter": "~1.0.0",
|
||||||
|
"safe-buffer": "~5.1.0",
|
||||||
|
"ultron": "~1.1.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"https-proxy-agent": {
|
"https-proxy-agent": {
|
||||||
"version": "3.0.1",
|
"version": "3.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
|
||||||
@ -1475,6 +1503,12 @@
|
|||||||
"semver": "^5.7.0"
|
"semver": "^5.7.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node-forge": {
|
||||||
|
"version": "0.8.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
|
||||||
|
"integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"normalize-url": {
|
"normalize-url": {
|
||||||
"version": "3.3.0",
|
"version": "3.3.0",
|
||||||
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
|
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
|
||||||
@ -1553,6 +1587,16 @@
|
|||||||
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
|
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
|
||||||
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
|
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
|
||||||
},
|
},
|
||||||
|
"optimist": {
|
||||||
|
"version": "0.6.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
|
||||||
|
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
|
||||||
|
"dev": true,
|
||||||
|
"requires": {
|
||||||
|
"minimist": "~0.0.1",
|
||||||
|
"wordwrap": "~0.0.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"os-locale": {
|
"os-locale": {
|
||||||
"version": "3.1.0",
|
"version": "3.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
|
||||||
@ -1897,6 +1941,12 @@
|
|||||||
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"semaphore": {
|
||||||
|
"version": "1.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz",
|
||||||
|
"integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"semver": {
|
"semver": {
|
||||||
"version": "5.7.0",
|
"version": "5.7.0",
|
||||||
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
|
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
|
||||||
@ -2147,6 +2197,12 @@
|
|||||||
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
|
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
|
||||||
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
|
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
|
||||||
},
|
},
|
||||||
|
"ultron": {
|
||||||
|
"version": "1.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
|
||||||
|
"integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"underscore": {
|
"underscore": {
|
||||||
"version": "1.9.1",
|
"version": "1.9.1",
|
||||||
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
|
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
|
||||||
@ -2273,6 +2329,12 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"wordwrap": {
|
||||||
|
"version": "0.0.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
|
||||||
|
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"wrap-ansi": {
|
"wrap-ansi": {
|
||||||
"version": "2.1.0",
|
"version": "2.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",
|
||||||
|
@ -37,6 +37,7 @@
|
|||||||
"chai": "^4.2.0",
|
"chai": "^4.2.0",
|
||||||
"chai-string": "^1.5.0",
|
"chai-string": "^1.5.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
|
"http-mitm-proxy": "^0.8.2",
|
||||||
"key-cert": "^1.0.1",
|
"key-cert": "^1.0.1",
|
||||||
"mocha": "^6.1.4"
|
"mocha": "^6.1.4"
|
||||||
}
|
}
|
||||||
|
23
test/mocks/bing/index.html
Normal file
23
test/mocks/bing/index.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page1.html
Normal file
42
test/mocks/bing/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page2.html
Normal file
42
test/mocks/bing/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
40
test/mocks/bing/test keyword_page3.html
Normal file
40
test/mocks/bing/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
@ -1,68 +1,57 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
const express = require('express');
|
const express = require('express');
|
||||||
const puppeteer = require('puppeteer');
|
const puppeteer = require('puppeteer');
|
||||||
// TODO add a test logger in place of default winston logger
|
const { createLogger, transports } = require('winston');
|
||||||
const logger = require('winston');
|
|
||||||
const net = require('net');
|
|
||||||
const http = require('http');
|
const http = require('http');
|
||||||
const https = require('https');
|
const https = require('https');
|
||||||
const url = require('url');
|
|
||||||
const assert = require('assert');
|
const assert = require('assert');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const keyCert = require('key-cert');
|
const keyCert = require('key-cert');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
|
const Proxy = require('http-mitm-proxy');
|
||||||
|
|
||||||
const debug = require('debug')('se-scraper:test');
|
const debug = require('debug')('se-scraper:test');
|
||||||
const { BingScraper } = require('../../src/modules/bing');
|
const { BingScraper } = require('../../src/modules/bing');
|
||||||
|
|
||||||
const httpPort = 3012;
|
const httpPort = 3012;
|
||||||
const httpsPort = httpPort + 1;
|
const httpsPort = httpPort + 1;
|
||||||
|
const proxyPort = httpPort + 2;
|
||||||
|
|
||||||
const fakeSearchEngine = express();
|
const fakeSearchEngine = express();
|
||||||
fakeSearchEngine.get('/search', (req, res, next) => {
|
fakeSearchEngine.get('/search', (req, res, next) => {
|
||||||
debug('q=%s', req.query.q);
|
debug('q=%s', req.query.q);
|
||||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
|
||||||
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
|
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||||
});
|
});
|
||||||
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
|
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
|
||||||
|
|
||||||
describe('Module Bing', function(){
|
describe('Module Bing', function(){
|
||||||
|
|
||||||
let httpServerAndProxy, httpsServer;
|
let httpServer, httpsServer, proxy;
|
||||||
before(async function(){
|
before(async function(){
|
||||||
// Here mount our fake engine in both http and https listen server
|
// Here mount our fake engine in both http and https listen server
|
||||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
httpServer = http.createServer(fakeSearchEngine);
|
||||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||||
|
|
||||||
/**
|
proxy = Proxy();
|
||||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
proxy.onRequest((ctx, callback) => {
|
||||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||||
*/
|
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||||
const parsedUrl = url.parse('http://' + req.url);
|
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
return callback();
|
||||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
|
||||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
|
||||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
|
||||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
|
||||||
'\r\n');
|
|
||||||
serverSocket.write(head);
|
|
||||||
serverSocket.pipe(clientSocket);
|
|
||||||
clientSocket.pipe(serverSocket);
|
|
||||||
serverSocket.on('error', (err)=>{
|
|
||||||
console.error(err);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||||
|
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||||
debug('Fake http search engine servers started');
|
debug('Fake http search engine servers started');
|
||||||
});
|
});
|
||||||
|
|
||||||
after(function(){
|
after(function(){
|
||||||
|
proxy.close();
|
||||||
httpsServer.close();
|
httpsServer.close();
|
||||||
httpServerAndProxy.close();
|
httpServer.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
let browser;
|
let browser;
|
||||||
@ -71,8 +60,9 @@ describe('Module Bing', function(){
|
|||||||
debug('Start a new browser');
|
debug('Start a new browser');
|
||||||
browser = await puppeteer.launch({
|
browser = await puppeteer.launch({
|
||||||
//dumpio: true,
|
//dumpio: true,
|
||||||
|
//headless: false,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: [ '--proxy-server=http://localhost:' + httpPort ]
|
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||||
});
|
});
|
||||||
debug('Open a fresh page');
|
debug('Open a fresh page');
|
||||||
page = await browser.newPage();
|
page = await browser.newPage();
|
||||||
@ -82,20 +72,28 @@ describe('Module Bing', function(){
|
|||||||
await browser.close();
|
await browser.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const testLogger = createLogger({
|
||||||
|
transports: [
|
||||||
|
new transports.Console({
|
||||||
|
level: 'error'
|
||||||
|
})
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
it('one keyword one page', function(){
|
it('one keyword one page', function(){
|
||||||
const bingScraper = new BingScraper({
|
const bingScraper = new BingScraper({
|
||||||
config: {
|
config: {
|
||||||
search_engine_name: 'bing',
|
search_engine_name: 'bing',
|
||||||
throw_on_detection: true,
|
throw_on_detection: true,
|
||||||
keywords: ['test keyword'],
|
keywords: ['test keyword'],
|
||||||
logger,
|
logger: testLogger,
|
||||||
scrape_from_file: '',
|
scrape_from_file: '',
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
bingScraper.STANDARD_TIMEOUT = 500;
|
bingScraper.STANDARD_TIMEOUT = 500;
|
||||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||||
assert.strictEqual(num_requests, 1, 'Must do one request');
|
assert.strictEqual(num_requests, 1, 'Must do one request');
|
||||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
|
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -105,7 +103,7 @@ describe('Module Bing', function(){
|
|||||||
search_engine_name: 'bing',
|
search_engine_name: 'bing',
|
||||||
throw_on_detection: true,
|
throw_on_detection: true,
|
||||||
keywords: ['test keyword'],
|
keywords: ['test keyword'],
|
||||||
logger,
|
logger: testLogger,
|
||||||
scrape_from_file: '',
|
scrape_from_file: '',
|
||||||
num_pages: 3,
|
num_pages: 3,
|
||||||
}
|
}
|
||||||
@ -113,12 +111,12 @@ describe('Module Bing', function(){
|
|||||||
bingScraper.STANDARD_TIMEOUT = 500;
|
bingScraper.STANDARD_TIMEOUT = 500;
|
||||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||||
assert.strictEqual(num_requests, 3, 'Must three requests');
|
assert.strictEqual(num_requests, 3, 'Must three requests');
|
||||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
|
||||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
|
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
|
||||||
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
|
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
|
||||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1');
|
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
|
||||||
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
|
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
|
||||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1');
|
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -1,25 +1,24 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
const express = require('express');
|
const express = require('express');
|
||||||
const puppeteer = require('puppeteer');
|
const puppeteer = require('puppeteer');
|
||||||
// TODO add a test logger in place of default winston logger
|
const { createLogger, transports } = require('winston');
|
||||||
const logger = require('winston');
|
|
||||||
const net = require('net');
|
|
||||||
const http = require('http');
|
const http = require('http');
|
||||||
const https = require('https');
|
const https = require('https');
|
||||||
const url = require('url');
|
|
||||||
const assert = require('assert');
|
const assert = require('assert');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const keyCert = require('key-cert');
|
const keyCert = require('key-cert');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
|
const Proxy = require('http-mitm-proxy');
|
||||||
|
|
||||||
const debug = require('debug')('se-scraper:test');
|
const debug = require('debug')('se-scraper:test');
|
||||||
const { GoogleScraper } = require('../../src/modules/google');
|
const { GoogleScraper } = require('../../src/modules/google');
|
||||||
|
|
||||||
const httpPort = 3012;
|
const httpPort = 3012;
|
||||||
const httpsPort = httpPort + 1;
|
const httpsPort = httpPort + 1;
|
||||||
|
const proxyPort = httpPort + 2;
|
||||||
|
|
||||||
const fakeSearchEngine = express();
|
const fakeSearchEngine = express();
|
||||||
fakeSearchEngine.get('/search', (req, res, next) => {
|
fakeSearchEngine.get('/search', (req, res) => {
|
||||||
debug('q=%s', req.query.q);
|
debug('q=%s', req.query.q);
|
||||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
const pageNumber = ((req.query.start/10) || 0) + 1;
|
||||||
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
|
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||||
@ -28,41 +27,31 @@ fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})
|
|||||||
|
|
||||||
describe('Module Google', function(){
|
describe('Module Google', function(){
|
||||||
|
|
||||||
let httpServerAndProxy, httpsServer;
|
let httpServer, httpsServer, proxy;
|
||||||
before(async function(){
|
before(async function(){
|
||||||
// Here mount our fake engine in both http and https listen server
|
// Here mount our fake engine in both http and https listen server
|
||||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
httpServer = http.createServer(fakeSearchEngine);
|
||||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||||
|
|
||||||
/**
|
proxy = Proxy();
|
||||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
proxy.onRequest((ctx, callback) => {
|
||||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||||
*/
|
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||||
const parsedUrl = url.parse('http://' + req.url);
|
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
return callback();
|
||||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
|
||||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
|
||||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
|
||||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
|
||||||
'\r\n');
|
|
||||||
serverSocket.write(head);
|
|
||||||
serverSocket.pipe(clientSocket);
|
|
||||||
clientSocket.pipe(serverSocket);
|
|
||||||
serverSocket.on('error', (err)=>{
|
|
||||||
console.error(err);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||||
|
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||||
debug('Fake http search engine servers started');
|
debug('Fake http search engine servers started');
|
||||||
});
|
});
|
||||||
|
|
||||||
after(function(){
|
after(function(){
|
||||||
|
proxy.close();
|
||||||
httpsServer.close();
|
httpsServer.close();
|
||||||
httpServerAndProxy.close();
|
httpServer.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
let browser;
|
let browser;
|
||||||
@ -71,8 +60,9 @@ describe('Module Google', function(){
|
|||||||
debug('Start a new browser');
|
debug('Start a new browser');
|
||||||
browser = await puppeteer.launch({
|
browser = await puppeteer.launch({
|
||||||
//dumpio: true,
|
//dumpio: true,
|
||||||
|
//headless: false,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: [ '--proxy-server=http://localhost:' + httpPort ]
|
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||||
});
|
});
|
||||||
debug('Open a fresh page');
|
debug('Open a fresh page');
|
||||||
page = await browser.newPage();
|
page = await browser.newPage();
|
||||||
@ -82,13 +72,21 @@ describe('Module Google', function(){
|
|||||||
await browser.close();
|
await browser.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const testLogger = createLogger({
|
||||||
|
transports: [
|
||||||
|
new transports.Console({
|
||||||
|
level: 'error'
|
||||||
|
})
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
it('one keyword one page', function(){
|
it('one keyword one page', function(){
|
||||||
const googleScraper = new GoogleScraper({
|
const googleScraper = new GoogleScraper({
|
||||||
config: {
|
config: {
|
||||||
search_engine_name: 'google',
|
search_engine_name: 'google',
|
||||||
throw_on_detection: true,
|
throw_on_detection: true,
|
||||||
keywords: ['test keyword'],
|
keywords: ['test keyword'],
|
||||||
logger,
|
logger: testLogger,
|
||||||
scrape_from_file: '',
|
scrape_from_file: '',
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -105,7 +103,7 @@ describe('Module Google', function(){
|
|||||||
search_engine_name: 'google',
|
search_engine_name: 'google',
|
||||||
throw_on_detection: true,
|
throw_on_detection: true,
|
||||||
keywords: ['test keyword'],
|
keywords: ['test keyword'],
|
||||||
logger,
|
logger: testLogger,
|
||||||
scrape_from_file: '',
|
scrape_from_file: '',
|
||||||
num_pages: 3,
|
num_pages: 3,
|
||||||
}
|
}
|
||||||
|
119
test/proxy.js
119
test/proxy.js
@ -1,16 +1,12 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
const express = require('express');
|
const express = require('express');
|
||||||
const puppeteer = require('puppeteer');
|
const { createLogger, transports } = require('winston');
|
||||||
// TODO add a test logger in place of default winston logger
|
|
||||||
const logger = require('winston');
|
|
||||||
const net = require('net');
|
|
||||||
const http = require('http');
|
const http = require('http');
|
||||||
const https = require('https');
|
const https = require('https');
|
||||||
const url = require('url');
|
|
||||||
const assert = require('assert');
|
const assert = require('assert');
|
||||||
const path = require('path');
|
|
||||||
const keyCert = require('key-cert');
|
const keyCert = require('key-cert');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
|
const Proxy = require('http-mitm-proxy');
|
||||||
|
|
||||||
const debug = require('debug')('se-scraper:test');
|
const debug = require('debug')('se-scraper:test');
|
||||||
const se_scraper = require('../');
|
const se_scraper = require('../');
|
||||||
@ -18,64 +14,55 @@ const Scraper = require('../src/modules/se_scraper');
|
|||||||
|
|
||||||
const httpPort = 3012;
|
const httpPort = 3012;
|
||||||
const httpsPort = httpPort + 1;
|
const httpsPort = httpPort + 1;
|
||||||
const httpOtherPort = httpPort + 2;
|
const proxyPort = httpPort + 2;
|
||||||
|
|
||||||
const fakeSearchEngine = express();
|
const fakeSearchEngine = express();
|
||||||
fakeSearchEngine.get('/test', (req, res) => {
|
fakeSearchEngine.set('trust proxy', 'loopback');
|
||||||
debug(req.ip, req.ips, req.protocol, req.hostname);
|
fakeSearchEngine.get('/test-proxy', (req, res) => {
|
||||||
debug(req.socket.localAddress, req.socket.localPort);
|
debug('fake-search-engine req.hostname=%s', req.hostname);
|
||||||
res.send('OK');
|
//debug('req to', req.socket.localAddress, req.socket.localPort);
|
||||||
|
res.send(req.hostname);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('Config', function(){
|
describe('Config', function(){
|
||||||
|
|
||||||
let httpServerAndProxy, httpsServer, httpOtherServer;
|
let httpServer, httpsServer, proxy;
|
||||||
before(async function(){
|
before(async function(){
|
||||||
// Here mount our fake engine in both http and https listen server
|
// Here mount our fake engine in both http and https listen server
|
||||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
httpServer = http.createServer(fakeSearchEngine);
|
||||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||||
|
|
||||||
/**
|
proxy = Proxy();
|
||||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
proxy.onRequest((ctx, callback) => {
|
||||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||||
*/
|
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||||
const parsedUrl = url.parse('http://' + req.url);
|
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
|
||||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
return callback();
|
||||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
|
||||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
|
||||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
|
||||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
|
||||||
'\r\n');
|
|
||||||
serverSocket.write(head);
|
|
||||||
serverSocket.pipe(clientSocket);
|
|
||||||
clientSocket.pipe(serverSocket);
|
|
||||||
serverSocket.on('error', (err)=>{
|
|
||||||
console.error(err);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
|
||||||
|
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||||
debug('Fake http search engine servers started');
|
debug('Fake http search engine servers started');
|
||||||
});
|
});
|
||||||
|
|
||||||
after(function(){
|
after(function(){
|
||||||
httpsServer.close();
|
httpsServer.close();
|
||||||
httpServerAndProxy.close();
|
httpServer.close();
|
||||||
|
proxy.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('proxies', function(){
|
describe('proxies', function(){
|
||||||
|
|
||||||
class MockScraper extends Scraper {
|
class MockScraperTestProxy extends Scraper {
|
||||||
|
|
||||||
async load_start_page(){
|
async load_start_page(){
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
async search_keyword(){
|
async search_keyword(){
|
||||||
await this.page.goto('http://void:' + httpPort + '/test');
|
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
|
||||||
}
|
}
|
||||||
|
|
||||||
async parse_async(){
|
async parse_async(){
|
||||||
@ -84,29 +71,67 @@ describe('Config', function(){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const testLogger = createLogger({
|
||||||
|
transports: [
|
||||||
|
new transports.Console({
|
||||||
|
level: 'error'
|
||||||
|
})
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Jobs will be executed 1 by 1 through the proxy
|
* Jobs will be executed 2 by 2, through the proxy and through a direct connection
|
||||||
|
* THIS TEST NEEDS the entry `127.0.0.1 test.local` in /etc/hosts, because Chrome bypasses the proxy for localhost even when a proxy is set
|
||||||
*/
|
*/
|
||||||
it('one proxy given', async function () {
|
it('one proxy given, use_proxies_only=false', async function () {
|
||||||
|
|
||||||
const scrape_job = {
|
const scrape_job = {
|
||||||
search_engine: MockScraper,
|
search_engine: MockScraperTestProxy,
|
||||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||||
};
|
};
|
||||||
|
|
||||||
var scraper = new se_scraper.ScrapeManager({
|
var scraper = new se_scraper.ScrapeManager({
|
||||||
throw_on_detection: true,
|
throw_on_detection: true,
|
||||||
proxies: ['http://localhost:' + httpPort],
|
proxies: ['http://localhost:' + proxyPort],
|
||||||
use_proxies_only: true,
|
// default is use_proxies_only: false,
|
||||||
|
logger: testLogger,
|
||||||
});
|
});
|
||||||
await scraper.start();
|
await scraper.start();
|
||||||
|
|
||||||
const { results } = await scraper.scrape(scrape_job);
|
const { results } = await scraper.scrape(scrape_job);
|
||||||
assert.strictEqual(results['news']['1'], 'OK');
|
assert.strictEqual(results['news']['1'], 'test.local');
|
||||||
assert.strictEqual(results['some stuff']['1'], 'OK');
|
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||||
assert.strictEqual(results['i work too much']['1'], 'OK');
|
assert.strictEqual(results['i work too much']['1'], 'test.local');
|
||||||
assert.strictEqual(results['what to do?']['1'], 'OK');
|
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||||
assert.strictEqual(results['javascript is hard']['1'], 'OK');
|
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
|
||||||
|
|
||||||
|
await scraper.quit();
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Jobs will be executed 1 by 1 through the proxy
|
||||||
|
*/
|
||||||
|
it('one proxy given, use_proxies_only=true', async function () {
|
||||||
|
|
||||||
|
const scrape_job = {
|
||||||
|
search_engine: MockScraperTestProxy,
|
||||||
|
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||||
|
};
|
||||||
|
|
||||||
|
var scraper = new se_scraper.ScrapeManager({
|
||||||
|
throw_on_detection: true,
|
||||||
|
proxies: ['http://localhost:' + proxyPort],
|
||||||
|
use_proxies_only: true,
|
||||||
|
logger: testLogger,
|
||||||
|
});
|
||||||
|
await scraper.start();
|
||||||
|
|
||||||
|
const { results } = await scraper.scrape(scrape_job);
|
||||||
|
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
|
||||||
|
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||||
|
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
|
||||||
|
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||||
|
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
|
||||||
|
|
||||||
await scraper.quit();
|
await scraper.quit();
|
||||||
});
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user