mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 15:13:13 +01:00
test: Bing tests working, refactor proxy for tests
This commit is contained in:
parent
1c1db88545
commit
cac6b87e92
2
.gitignore
vendored
2
.gitignore
vendored
@ -79,3 +79,5 @@ typings/
|
||||
|
||||
.idea/
|
||||
GoogleScraperPup.iml
|
||||
|
||||
.http-mitm-proxy
|
||||
|
62
package-lock.json
generated
62
package-lock.json
generated
@ -1007,6 +1007,34 @@
|
||||
"toidentifier": "1.0.0"
|
||||
}
|
||||
},
|
||||
"http-mitm-proxy": {
|
||||
"version": "0.8.2",
|
||||
"resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz",
|
||||
"integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"async": "^2.6.2",
|
||||
"debug": "^4.1.0",
|
||||
"mkdirp": "^0.5.1",
|
||||
"node-forge": "^0.8.4",
|
||||
"optimist": "^0.6.1",
|
||||
"semaphore": "^1.1.0",
|
||||
"ws": "^3.2.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"ws": {
|
||||
"version": "3.3.3",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz",
|
||||
"integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"async-limiter": "~1.0.0",
|
||||
"safe-buffer": "~5.1.0",
|
||||
"ultron": "~1.1.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"https-proxy-agent": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
|
||||
@ -1475,6 +1503,12 @@
|
||||
"semver": "^5.7.0"
|
||||
}
|
||||
},
|
||||
"node-forge": {
|
||||
"version": "0.8.5",
|
||||
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
|
||||
"integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==",
|
||||
"dev": true
|
||||
},
|
||||
"normalize-url": {
|
||||
"version": "3.3.0",
|
||||
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
|
||||
@ -1553,6 +1587,16 @@
|
||||
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
|
||||
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
|
||||
},
|
||||
"optimist": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
|
||||
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"minimist": "~0.0.1",
|
||||
"wordwrap": "~0.0.2"
|
||||
}
|
||||
},
|
||||
"os-locale": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
|
||||
@ -1897,6 +1941,12 @@
|
||||
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
||||
"dev": true
|
||||
},
|
||||
"semaphore": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz",
|
||||
"integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==",
|
||||
"dev": true
|
||||
},
|
||||
"semver": {
|
||||
"version": "5.7.0",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
|
||||
@ -2147,6 +2197,12 @@
|
||||
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
|
||||
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
|
||||
},
|
||||
"ultron": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
|
||||
"integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==",
|
||||
"dev": true
|
||||
},
|
||||
"underscore": {
|
||||
"version": "1.9.1",
|
||||
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
|
||||
@ -2273,6 +2329,12 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"wordwrap": {
|
||||
"version": "0.0.3",
|
||||
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
|
||||
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
|
||||
"dev": true
|
||||
},
|
||||
"wrap-ansi": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",
|
||||
|
@ -37,6 +37,7 @@
|
||||
"chai": "^4.2.0",
|
||||
"chai-string": "^1.5.0",
|
||||
"express": "^4.17.1",
|
||||
"http-mitm-proxy": "^0.8.2",
|
||||
"key-cert": "^1.0.1",
|
||||
"mocha": "^6.1.4"
|
||||
}
|
||||
|
23
test/mocks/bing/index.html
Normal file
23
test/mocks/bing/index.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page1.html
Normal file
42
test/mocks/bing/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page2.html
Normal file
42
test/mocks/bing/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
40
test/mocks/bing/test keyword_page3.html
Normal file
40
test/mocks/bing/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
@ -1,68 +1,57 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
// TODO add a test logger in place of default winston logger
|
||||
const logger = require('winston');
|
||||
const net = require('net');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const url = require('url');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { BingScraper } = require('../../src/modules/bing');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/search', (req, res, next) => {
|
||||
debug('q=%s', req.query.q);
|
||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
||||
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
|
||||
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
|
||||
|
||||
describe('Module Bing', function(){
|
||||
|
||||
let httpServerAndProxy, httpsServer;
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
/**
|
||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
||||
*/
|
||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
||||
const parsedUrl = url.parse('http://' + req.url);
|
||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
||||
'\r\n');
|
||||
serverSocket.write(head);
|
||||
serverSocket.pipe(clientSocket);
|
||||
clientSocket.pipe(serverSocket);
|
||||
serverSocket.on('error', (err)=>{
|
||||
console.error(err);
|
||||
});
|
||||
});
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
||||
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
proxy.close();
|
||||
httpsServer.close();
|
||||
httpServerAndProxy.close();
|
||||
httpServer.close();
|
||||
});
|
||||
|
||||
let browser;
|
||||
@ -71,8 +60,9 @@ describe('Module Bing', function(){
|
||||
debug('Start a new browser');
|
||||
browser = await puppeteer.launch({
|
||||
//dumpio: true,
|
||||
//headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [ '--proxy-server=http://localhost:' + httpPort ]
|
||||
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||
});
|
||||
debug('Open a fresh page');
|
||||
page = await browser.newPage();
|
||||
@ -82,20 +72,28 @@ describe('Module Bing', function(){
|
||||
await browser.close();
|
||||
});
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
it('one keyword one page', function(){
|
||||
const bingScraper = new BingScraper({
|
||||
config: {
|
||||
search_engine_name: 'bing',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger,
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
}
|
||||
});
|
||||
bingScraper.STANDARD_TIMEOUT = 500;
|
||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'Must do one request');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
|
||||
});
|
||||
});
|
||||
|
||||
@ -105,7 +103,7 @@ describe('Module Bing', function(){
|
||||
search_engine_name: 'bing',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger,
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 3,
|
||||
}
|
||||
@ -113,12 +111,12 @@ describe('Module Bing', function(){
|
||||
bingScraper.STANDARD_TIMEOUT = 500;
|
||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 3, 'Must three requests');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
|
||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
|
||||
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
|
||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,25 +1,24 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
// TODO add a test logger in place of default winston logger
|
||||
const logger = require('winston');
|
||||
const net = require('net');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const url = require('url');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { GoogleScraper } = require('../../src/modules/google');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/search', (req, res, next) => {
|
||||
fakeSearchEngine.get('/search', (req, res) => {
|
||||
debug('q=%s', req.query.q);
|
||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
||||
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
@ -28,41 +27,31 @@ fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})
|
||||
|
||||
describe('Module Google', function(){
|
||||
|
||||
let httpServerAndProxy, httpsServer;
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
/**
|
||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
||||
*/
|
||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
||||
const parsedUrl = url.parse('http://' + req.url);
|
||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
||||
'\r\n');
|
||||
serverSocket.write(head);
|
||||
serverSocket.pipe(clientSocket);
|
||||
clientSocket.pipe(serverSocket);
|
||||
serverSocket.on('error', (err)=>{
|
||||
console.error(err);
|
||||
});
|
||||
});
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
||||
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
proxy.close();
|
||||
httpsServer.close();
|
||||
httpServerAndProxy.close();
|
||||
httpServer.close();
|
||||
});
|
||||
|
||||
let browser;
|
||||
@ -71,8 +60,9 @@ describe('Module Google', function(){
|
||||
debug('Start a new browser');
|
||||
browser = await puppeteer.launch({
|
||||
//dumpio: true,
|
||||
//headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [ '--proxy-server=http://localhost:' + httpPort ]
|
||||
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||
});
|
||||
debug('Open a fresh page');
|
||||
page = await browser.newPage();
|
||||
@ -82,13 +72,21 @@ describe('Module Google', function(){
|
||||
await browser.close();
|
||||
});
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
it('one keyword one page', function(){
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger,
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
}
|
||||
});
|
||||
@ -105,7 +103,7 @@ describe('Module Google', function(){
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger,
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 3,
|
||||
}
|
||||
|
119
test/proxy.js
119
test/proxy.js
@ -1,16 +1,12 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
// TODO add a test logger in place of default winston logger
|
||||
const logger = require('winston');
|
||||
const net = require('net');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const url = require('url');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const se_scraper = require('../');
|
||||
@ -18,64 +14,55 @@ const Scraper = require('../src/modules/se_scraper');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const httpOtherPort = httpPort + 2;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/test', (req, res) => {
|
||||
debug(req.ip, req.ips, req.protocol, req.hostname);
|
||||
debug(req.socket.localAddress, req.socket.localPort);
|
||||
res.send('OK');
|
||||
fakeSearchEngine.set('trust proxy', 'loopback');
|
||||
fakeSearchEngine.get('/test-proxy', (req, res) => {
|
||||
debug('fake-search-engine req.hostname=%s', req.hostname);
|
||||
//debug('req to', req.socket.localAddress, req.socket.localPort);
|
||||
res.send(req.hostname);
|
||||
});
|
||||
|
||||
describe('Config', function(){
|
||||
|
||||
let httpServerAndProxy, httpsServer, httpOtherServer;
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServerAndProxy = http.createServer(fakeSearchEngine);
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
/**
|
||||
* express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy
|
||||
* here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine
|
||||
*/
|
||||
httpServerAndProxy.on('connect', (req, clientSocket, head) => {
|
||||
const parsedUrl = url.parse('http://' + req.url);
|
||||
const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
|
||||
const serverSocket = net.connect(destPort, 'localhost', () => {
|
||||
debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
|
||||
clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
|
||||
'Proxy-agent: Node.js-Proxy\r\n' +
|
||||
'\r\n');
|
||||
serverSocket.write(head);
|
||||
serverSocket.pipe(clientSocket);
|
||||
clientSocket.pipe(serverSocket);
|
||||
serverSocket.on('error', (err)=>{
|
||||
console.error(err);
|
||||
});
|
||||
});
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
|
||||
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
httpsServer.close();
|
||||
httpServerAndProxy.close();
|
||||
httpServer.close();
|
||||
proxy.close();
|
||||
});
|
||||
|
||||
describe('proxies', function(){
|
||||
|
||||
class MockScraper extends Scraper {
|
||||
class MockScraperTestProxy extends Scraper {
|
||||
|
||||
async load_start_page(){
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(){
|
||||
await this.page.goto('http://void:' + httpPort + '/test');
|
||||
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
|
||||
}
|
||||
|
||||
async parse_async(){
|
||||
@ -84,29 +71,67 @@ describe('Config', function(){
|
||||
}
|
||||
}
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
/**
|
||||
* Jobs will be executed 1 by 1 through the proxy
|
||||
* Jobs will be executed 2 by 2 through the proxy and direct connection
|
||||
* THIS TEST NEED TO HAVE test.local 127.0.0.1 in /etc/hosts because chrome bypass localhost even with proxy set
|
||||
*/
|
||||
it('one proxy given', async function () {
|
||||
it('one proxy given, use_proxies_only=false', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: MockScraper,
|
||||
search_engine: MockScraperTestProxy,
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
proxies: ['http://localhost:' + httpPort],
|
||||
use_proxies_only: true,
|
||||
proxies: ['http://localhost:' + proxyPort],
|
||||
// default is use_proxies_only: false,
|
||||
logger: testLogger,
|
||||
});
|
||||
await scraper.start();
|
||||
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
assert.strictEqual(results['news']['1'], 'OK');
|
||||
assert.strictEqual(results['some stuff']['1'], 'OK');
|
||||
assert.strictEqual(results['i work too much']['1'], 'OK');
|
||||
assert.strictEqual(results['what to do?']['1'], 'OK');
|
||||
assert.strictEqual(results['javascript is hard']['1'], 'OK');
|
||||
assert.strictEqual(results['news']['1'], 'test.local');
|
||||
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['i work too much']['1'], 'test.local');
|
||||
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
|
||||
|
||||
await scraper.quit();
|
||||
});
|
||||
|
||||
/**
|
||||
* Jobs will be executed 1 by 1 through the proxy
|
||||
*/
|
||||
it('one proxy given, use_proxies_only=true', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: MockScraperTestProxy,
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
proxies: ['http://localhost:' + proxyPort],
|
||||
use_proxies_only: true,
|
||||
logger: testLogger,
|
||||
});
|
||||
await scraper.start();
|
||||
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
|
||||
|
||||
await scraper.quit();
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user