test: Bing tests working, refactor proxy for tests

HugoPoi 2020-01-08 14:40:28 +01:00
parent 1c1db88545
commit cac6b87e92
10 changed files with 347 additions and 114 deletions

2
.gitignore vendored
View File

@@ -79,3 +79,5 @@ typings/
.idea/
GoogleScraperPup.iml
.http-mitm-proxy

62
package-lock.json generated
View File

@@ -1007,6 +1007,34 @@
"toidentifier": "1.0.0"
}
},
"http-mitm-proxy": {
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz",
"integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==",
"dev": true,
"requires": {
"async": "^2.6.2",
"debug": "^4.1.0",
"mkdirp": "^0.5.1",
"node-forge": "^0.8.4",
"optimist": "^0.6.1",
"semaphore": "^1.1.0",
"ws": "^3.2.0"
},
"dependencies": {
"ws": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz",
"integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==",
"dev": true,
"requires": {
"async-limiter": "~1.0.0",
"safe-buffer": "~5.1.0",
"ultron": "~1.1.0"
}
}
}
},
"https-proxy-agent": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
@@ -1475,6 +1503,12 @@
"semver": "^5.7.0"
}
},
"node-forge": {
"version": "0.8.5",
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
"integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==",
"dev": true
},
"normalize-url": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
@@ -1553,6 +1587,16 @@
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
},
"optimist": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
"dev": true,
"requires": {
"minimist": "~0.0.1",
"wordwrap": "~0.0.2"
}
},
"os-locale": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
@@ -1897,6 +1941,12 @@
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true
},
"semaphore": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz",
"integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==",
"dev": true
},
"semver": {
"version": "5.7.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
@@ -2147,6 +2197,12 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"ultron": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
"integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==",
"dev": true
},
"underscore": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
@@ -2273,6 +2329,12 @@
}
}
},
"wordwrap": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
"dev": true
},
"wrap-ansi": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",

View File

@@ -37,6 +37,7 @@
"chai": "^4.2.0",
"chai-string": "^1.5.0",
"express": "^4.17.1",
"http-mitm-proxy": "^0.8.2",
"key-cert": "^1.0.1",
"mocha": "^6.1.4"
}
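For reference, the new dev dependency drives the test proxies below. Here is a minimal sketch of the http-mitm-proxy 0.8.x API as the tests use it (the port mirrors proxyPort = httpPort + 2 from the test files):

    const Proxy = require('http-mitm-proxy');
    const proxy = Proxy();
    // report instead of crashing when an intercepted connection fails
    proxy.onError((ctx, err) => console.error('proxy error', err));
    proxy.onRequest((ctx, callback) => {
        // rewrite the upstream target for both plain HTTP and re-signed HTTPS traffic
        ctx.proxyToServerRequestOptions.host = 'localhost';
        return callback(); // let the rewritten request continue
    });
    proxy.listen({ port: 3014 });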

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,68 +1,57 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const url = require('url');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
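The page-number arithmetic above mirrors Bing pagination, where the first query parameter carries the 1-based index of the first result on the requested page:

    // first=1  -> Math.round(1/10)  + 1 = 1  (page 1)
    // first=11 -> Math.round(11/10) + 1 = 2  (page 2)
    // first=21 -> Math.round(21/10) + 1 = 3  (page 3)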
describe('Module Bing', function(){
let httpServerAndProxy, httpsServer;
let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine);
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/**
* express doesn't handle the HTTP CONNECT method, so this implements a basic MITM http proxy
* here we use our http server to also act as an http proxy and rewrite all http/s requests to our fake engine
*/
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
const parsedUrl = url.parse('http://' + req.url);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
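// The hand-rolled CONNECT tunnel above is replaced by http-mitm-proxy below:
// it terminates TLS itself and lets onRequest rewrite each proxied request.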
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServerAndProxy.close();
httpServer.close();
});
let browser;
@@ -71,8 +60,9 @@ describe('Module Bing', function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
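// the MITM proxy re-signs HTTPS traffic with its own generated certificate
// (kept under .http-mitm-proxy, now gitignored), hence ignoreHTTPSErrors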
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + httpPort ]
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
@@ -82,20 +72,28 @@ describe('Module Bing', function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger,
logger: testLogger,
scrape_from_file: '',
}
});
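// shorten the scraper's internal wait so a failure against the local mock surfaces quickly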
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
});
});
@@ -105,7 +103,7 @@ describe('Module Bing', function(){
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger,
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
@@ -113,12 +111,12 @@ describe('Module Bing', function(){
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
});
});

View File

@@ -1,25 +1,24 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const url = require('url');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
@@ -28,41 +27,31 @@ fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})
describe('Module Google', function(){
let httpServerAndProxy, httpsServer;
let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine);
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/**
* express doesn't handle the HTTP CONNECT method, so this implements a basic MITM http proxy
* here we use our http server to also act as an http proxy and rewrite all http/s requests to our fake engine
*/
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
const parsedUrl = url.parse('http://' + req.url);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServerAndProxy.close();
httpServer.close();
});
let browser;
@@ -71,8 +60,9 @@ describe('Module Google', function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + httpPort ]
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
@@ -82,13 +72,21 @@ describe('Module Google', function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger,
logger: testLogger,
scrape_from_file: '',
}
});
@@ -105,7 +103,7 @@ describe('Module Google', function(){
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger,
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}

View File

@@ -1,16 +1,12 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const url = require('url');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
@@ -18,64 +14,55 @@ const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const httpOtherPort = httpPort + 2;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/test', (req, res) => {
debug(req.ip, req.ips, req.protocol, req.hostname);
debug(req.socket.localAddress, req.socket.localPort);
res.send('OK');
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
debug('fake-search-engine req.hostname=%s', req.hostname);
//debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
});
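This route makes the traffic path observable: with 'trust proxy' enabled, Express derives req.hostname from the X-Forwarded-Host header when one is present, so the response tells each assertion whether the request came straight from Chromium or through the MITM proxy, which injects that header in onRequest below:

    // direct:  GET http://test.local:3012/test-proxy      -> responds 'test.local'
    // proxied: X-Forwarded-Host: ProxiedThroughFakeEngine  -> responds 'ProxiedThroughFakeEngine'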
describe('Config', function(){
let httpServerAndProxy, httpsServer, httpOtherServer;
let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServerAndProxy = http.createServer(fakeSearchEngine);
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
/**
* express doesn't handle the HTTP CONNECT method, so this implements a basic MITM http proxy
* here we use our http server to also act as an http proxy and rewrite all http/s requests to our fake engine
*/
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
const parsedUrl = url.parse('http://' + req.url);
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
const serverSocket = net.connect(destPort, 'localhost', () => {
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
'Proxy-agent: Node.js-Proxy\r\n' +
'\r\n');
serverSocket.write(head);
serverSocket.pipe(clientSocket);
clientSocket.pipe(serverSocket);
serverSocket.on('error', (err)=>{
console.error(err);
});
});
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServerAndProxy.close();
httpServer.close();
proxy.close();
});
describe('proxies', function(){
class MockScraper extends Scraper {
class MockScraperTestProxy extends Scraper {
async load_start_page(){
return true;
}
async search_keyword(){
await this.page.goto('http://void:' + httpPort + '/test');
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
}
async parse_async(){
@@ -84,29 +71,67 @@ describe('Config', function(){
}
}
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Jobs will be executed 1 by 1 through the proxy
* Jobs will be executed 2 by 2, split between the proxy and a direct connection
* THIS TEST NEEDS test.local mapped to 127.0.0.1 in /etc/hosts, because Chrome bypasses the proxy for localhost even when one is set
*/
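For a local run, the alias from the comment above is a one-line /etc/hosts entry:

    127.0.0.1   test.local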
it('one proxy given', async function () {
it('one proxy given, use_proxies_only=false', async function () {
const scrape_job = {
search_engine: MockScraper,
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + httpPort],
use_proxies_only: true,
proxies: ['http://localhost:' + proxyPort],
// default is use_proxies_only: false,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'OK');
assert.strictEqual(results['some stuff']['1'], 'OK');
assert.strictEqual(results['i work too much']['1'], 'OK');
assert.strictEqual(results['what to do?']['1'], 'OK');
assert.strictEqual(results['javascript is hard']['1'], 'OK');
assert.strictEqual(results['news']['1'], 'test.local');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'test.local');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
await scraper.quit();
});
/**
* Jobs will be executed 1 by 1 through the proxy
*/
it('one proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
await scraper.quit();
});
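All of the debug() tracing in these suites is namespaced se-scraper:test, so enabling it through the debug module's DEBUG environment variable (shown with npm test, assuming the package's usual mocha test script) prints the proxied-connection lines while the suite runs:

    DEBUG=se-scraper:test npm test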