diff --git a/.gitignore b/.gitignore index 0103c8d..7763e13 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,5 @@ typings/ .idea/ GoogleScraperPup.iml + +.http-mitm-proxy diff --git a/package-lock.json b/package-lock.json index ee4c960..d4ea117 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1007,6 +1007,34 @@ "toidentifier": "1.0.0" } }, + "http-mitm-proxy": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz", + "integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==", + "dev": true, + "requires": { + "async": "^2.6.2", + "debug": "^4.1.0", + "mkdirp": "^0.5.1", + "node-forge": "^0.8.4", + "optimist": "^0.6.1", + "semaphore": "^1.1.0", + "ws": "^3.2.0" + }, + "dependencies": { + "ws": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz", + "integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==", + "dev": true, + "requires": { + "async-limiter": "~1.0.0", + "safe-buffer": "~5.1.0", + "ultron": "~1.1.0" + } + } + } + }, "https-proxy-agent": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz", @@ -1475,6 +1503,12 @@ "semver": "^5.7.0" } }, + "node-forge": { + "version": "0.8.5", + "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz", + "integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==", + "dev": true + }, "normalize-url": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz", @@ -1553,6 +1587,16 @@ "resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz", "integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4=" }, + "optimist": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", + "integrity": 
"sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", + "dev": true, + "requires": { + "minimist": "~0.0.1", + "wordwrap": "~0.0.2" + } + }, "os-locale": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", @@ -1897,6 +1941,12 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "dev": true }, + "semaphore": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz", + "integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==", + "dev": true + }, "semver": { "version": "5.7.0", "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz", @@ -2147,6 +2197,12 @@ "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" }, + "ultron": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz", + "integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==", + "dev": true + }, "underscore": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz", @@ -2273,6 +2329,12 @@ } } }, + "wordwrap": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", + "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=", + "dev": true + }, "wrap-ansi": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", diff --git a/package.json b/package.json index b958075..d110373 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "chai": "^4.2.0", "chai-string": "^1.5.0", "express": "^4.17.1", + "http-mitm-proxy": "^0.8.2", "key-cert": "^1.0.1", "mocha": "^6.1.4" } diff --git a/test/mocks/bing/index.html b/test/mocks/bing/index.html new file mode 100644 index 0000000..02b3071 --- /dev/null +++ b/test/mocks/bing/index.html @@ -0,0 +1,23 @@ 
+Bing

Image of the day

janv. 8, 2020
Avoir le souffle coupé, ça se mérite
© Bogdan Dyiakonovych/Shutterstock
S’il vous prenait l’envie de gravir un petit millier de marches, comme ça, juste pour le plaisir, on vous conseille de vous rendre au mont Tianmen, en Chine (littéralement « la porte du Paradis », rien que ça). Situé 1 500 mètres au-dessus du niveau de la mer, ce trou dans la montagne, que vous voyez sur l’image d’aujourd’hui, est la plus haute arche naturelle au monde. A l’origine il y avait une grotte mais cette dernière se transforma en arche en 263 avant J.C quand l’arrière de la montagne s’effondra, créant alors cette cavité béante. Pour atteindre le sommet, il vous faudra escalader les 999 marches qui y mènent. Mais la vue en vaut la peine, c’est promis.
Learn more
\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page1.html b/test/mocks/bing/test keyword_page1.html new file mode 100644 index 0000000..445adab --- /dev/null +++ b/test/mocks/bing/test keyword_page1.html @@ -0,0 +1,42 @@ +test keyword - Bing

3 600 000 results
  1. Keyword Tests | TestComplete Documentation

    https://support.smartbear.com/testcomplete/docs/keyword-testing/index.html

    In keyword-driven tests, every action to be performed (a mouse click, keystroke, and so on) is described by a keyword. TestComplete supports this kind of tests. This section contains topics that describe how to create and use keyword tests with TestComplete.

  2. About Keyword Testing | TestComplete Documentation

    https://support.smartbear.com/testcomplete/docs/keyword-testing/overview.html

    Some keyword test operations (for instance, the Log Message, Delay and Comment operations) do not have nodes. If you add such operations to a test, they may break the current group node. To avoid this, TestComplete allows you to specify keyword test operations that do not break operation groups when they are added to a test.

  3. https://keywordtool.io/fr

    Keyword Tool vous permet d'extraire des mots-clés de 192 domaines Google et d'utiliser 83 langues pour générer des suggestions de mots clés. De cette façon, nous nous assurons que les mots clés générés seront pertinents pour le pays et / ou la langue pour laquelle vous créez votre contenu.

  4. Keyword-driven testing - Wikipedia

    https://en.wikipedia.org/wiki/Keyword-driven_testing

    A Keyword or Action Word is a defined combination of actions on a test object which describes how test lines must be executed. An action word contains arguments and is defined by a test analyst. An action word contains arguments and is defined by a test analyst.

  5. https://ads.google.com/intl/fr_fr/home/tools/keyword-planner

    Découvrez de nouveaux mots clés Cherchez des termes ou des expressions en rapport avec vos produits ou services. Notre outil de recherche de mots clés vous aidera à trouver les mots clés les plus pertinents pour votre activité.

  6. https://www.woorank.com/fr

    Un Keyword Tool performant . Vous souhaitez connaître la position exacte de vos mots-clés ? Suivez en détail la position de vos mots-clés, analysez l'historique de vos performances, la popularité des mots-clés que vous avez choisi et comparez vos résultats avec ceux de vos concurrents avec l'outil Keyword Tool. Essayez-le maintenant!

\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page2.html b/test/mocks/bing/test keyword_page2.html new file mode 100644 index 0000000..8154635 --- /dev/null +++ b/test/mocks/bing/test keyword_page2.html @@ -0,0 +1,42 @@ +test keyword - Bing

7-16 of 3 600 000 results
  1. Keywords - TestLink

    testlink.sourceforge.net/docs/docs/toc.php?page=16

    Keywords were created to gives users another level of depth when categorizing test cases. Keyword Creation. At this time keywords can only be created by users with the mgt_modify_key rights. These rights are currently held only by leads. Once a keyword or grouping of keywords have been created users may assign them to test cases.

  2. https://docs.microsoft.com/fr-fr/dotnet/csharp/language-reference/keywords/is

    Type pattern, which tests whether an expression can be converted to a specified type and, if it can be, casts it to a variable of that type. Modèle de constante : teste si une expression correspond à une valeur de constante spécifiée. Constant pattern, which tests whether an expression evaluates to …

  3. Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative

    https://keywordtool.io

    Using Keyword Tool, you can choose a specific Google domain out of 192 supported domains and one out of 83 languages that will be used to produce keyword suggestions. The free version of Keyword Tool can generate up to 750+ keywords from Google autocomplete in seconds.

  4. Parameterizing Keyword Tests with TestComplete - YouTube

    https://www.youtube.com/watch?v=3Ed8T7XcpH8

    22/06/2011 · This video demonstrates how you can create and use parameters in automated keyword-driven tests. Parameters allow you to pass data into keyword-driven tests or between tests. By replacing hard ...

    • Author: SmartBear
    • Views: 5K
  5. https://www.seopageoptimizer.fr/fr/default/2008302/SEO/Test.aspx

    Faites le test dès maintenant. Cet outil permet de tester si votre page est conforme aux exigences de Google et la compare avec les pages obtenant les meilleurs scores. SEO Page Optimizer vous donne toutes les clés pour vous approcher au maximum des chiffres indiqués. Vous pouvez ainsi obtenir un meilleur positionnement dans Google.

  6. Keyword Rank Checker - A Free online Google keyword ...

    https://smallseotools.com/keyword-position

    Keyword Position Checker is a tool used to detect the position of a website or URL in the search engine (particularly, Google) for a given keyword as per competing with other websites for the same keyword.

  7. Choose the Right Keywords with Our Research Tools - Google Ads

    https://ads.google.com/home/tools/keyword-planner

    Our keyword research tool gives you insight into how often certain words are searched and how those searches have changed over time. This can help you narrow your …

  8. Search Results for - Tests.com

    https://tests.com/wordsearch?keyword=+

    Test Product/Service Organization Description Cost; Save Checked Listings | Compare Checked Listings. About Us | Contact | Terms of Sale / Refunds

  9. https://www.rtbf.be/tendance/mot-cle_test?keyword=1158186

  10. Keyword Density Checker | Online Keyword Density Tool Free

    https://smallseotools.com/keyword-density-checker

    Keyword Density Checker is a tool built solely for the purpose of calculating the keyword density of any web page. The dev team at Small SEO Tools created the tool after finding out that some marketers were still stuffing their content with loads of keywords even without realizing it.

\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page3.html b/test/mocks/bing/test keyword_page3.html new file mode 100644 index 0000000..7f1d6fd --- /dev/null +++ b/test/mocks/bing/test keyword_page3.html @@ -0,0 +1,40 @@ +test keyword - Bing

17-26 of 3 600 000 results
  1. Keyword Driven Testing | TestComplete

    https://smartbear.com/product/testcomplete/features/keyword-driven-testing

    Keyword-driven testing is an approach to software testing that separates test case design from execution. Each automated UI test is built on a series of operations, specified by keywords, that simulates a user action, such as a mouse click or keystroke.

  2. Keywords Usage Test | SEO Site Checkup

    https://seositecheckup.com/tools/keywords-usage-test

    Keywords Usage Test What is it? This will check if your most common keywords are used in the webpage's title and description. Check your URL: × Would you like to analyze more competitors simultaneously? Sign up for our free trial. Checkup! How do I fix it? First of all, you must make sure that your page is using the title and meta-description tags. Second, you must adjust these tags content ...

  3. https://www.rtbf.be/tendance/mot-cle_test?keyword=1158186

  4. https://fr.sputniknews.com/tags/keyword_test

    Les tests de la fusée Starhopper de SpaceX ont été une nouvelle fois interrompus après que sa partie supérieure a pris feu dès les premières secondes.

  5. Keywords Cloud Test | SEO Site Checkup

    https://seositecheckup.com/tools/keywords-cloud-test

    The Keyword Cloud is a visual representation of keywords used on your website. This will show you which words are frequently used in the content of your webpage. Keywords having higher density are presented in larger fonts and displayed in alphabetic order.

  6. MTest keywords - Generation

    tfel.sourceforge.net/MTest-keywords.html

    The keyword @ImposedOpeningDisplacement is not documented yet. The @ImposedStrain keyword. The @ImposedStrain keyword allows the user to impose the evolution of a component of the strains. This keyword may have one option, which is the way the evolution will be defined. Two values are accepted: evolution and function.

  7. https://deusyss.developpez.com/tutoriels/Python/Robotframework

    La version complète convient aux développeurs en leur offrant la possibilité de coder des tests complets et/ou des keywords directement en Python. Dans tous les cas, il permet l'écriture rapide de jeux de tests, et ses rapports, à la fois complets et explicites ne pourront …

  8. YouTube Keyword Tool: Generate YouTube Tags & Keywords for ...

    https://kparser.com/youtube-keyword-tool

    Get 7x more the best YouTube keywords with YouTube Keyword Tool alternative! Explore the step-by-step algorithm for using Kparser for blogging: find new ideas for own videos, generate thousands of long-tail suggestions for the most profitable keywords in YouTube title, description, tags.

  9. KWFinder: Keyword Research and Analysis Tool

    https://kwfinder.com

    Keywords with exact search volumes Get search volumes with historical data. Timing is the key! Be ready and create content based on historical search volumes and long-term trends. Identify seasonal keywords and hot topics 🌶️ that will boost the organic traffic of your website.

  10. Take the SEO Expert Quiz - Moz

    https://moz.com/seo-expert-quiz

    The SEO Expert Quiz has 50 action-packed questions and takes 15 minutes to complete. You have nothing to lose and a lot of prestige to gain. Let the games …

\ No newline at end of file diff --git a/test/modules/bing.js b/test/modules/bing.js index ec76cde..bbc9dd0 100644 --- a/test/modules/bing.js +++ b/test/modules/bing.js @@ -1,68 +1,57 @@ 'use strict'; const express = require('express'); const puppeteer = require('puppeteer'); -// TODO add a test logger in place of default winston logger -const logger = require('winston'); -const net = require('net'); +const { createLogger, transports } = require('winston'); const http = require('http'); const https = require('https'); -const url = require('url'); const assert = require('assert'); const path = require('path'); const keyCert = require('key-cert'); const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); const debug = require('debug')('se-scraper:test'); const { BingScraper } = require('../../src/modules/bing'); const httpPort = 3012; const httpsPort = httpPort + 1; +const proxyPort = httpPort + 2; const fakeSearchEngine = express(); fakeSearchEngine.get('/search', (req, res, next) => { debug('q=%s', req.query.q); - const pageNumber = ((req.query.start/10) || 0) + 1; + const pageNumber = Math.round((req.query.first || 0) /10) + 1; res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html')); }); fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']})); describe('Module Bing', function(){ - let httpServerAndProxy, httpsServer; + let httpServer, httpsServer, proxy; before(async function(){ // Here mount our fake engine in both http and https listen server - httpServerAndProxy = http.createServer(fakeSearchEngine); + httpServer = http.createServer(fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine); - /** - * express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy - * here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine - */ - httpServerAndProxy.on('connect', (req, 
clientSocket, head) => { - const parsedUrl = url.parse('http://' + req.url); - const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; - const serverSocket = net.connect(destPort, 'localhost', () => { - debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort); - clientSocket.write('HTTP/1.1 200 Connection Established\r\n' + - 'Proxy-agent: Node.js-Proxy\r\n' + - '\r\n'); - serverSocket.write(head); - serverSocket.pipe(clientSocket); - clientSocket.pipe(serverSocket); - serverSocket.on('error', (err)=>{ - console.error(err); - }); - }); + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); + return callback(); }); - await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); + await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); debug('Fake http search engine servers started'); }); after(function(){ + proxy.close(); httpsServer.close(); - httpServerAndProxy.close(); + httpServer.close(); }); let browser; @@ -71,8 +60,9 @@ describe('Module Bing', function(){ debug('Start a new browser'); browser = await puppeteer.launch({ //dumpio: true, + //headless: false, ignoreHTTPSErrors: true, - args: [ '--proxy-server=http://localhost:' + httpPort ] + args: [ '--proxy-server=http://localhost:' + proxyPort ] }); debug('Open a fresh page'); page = await browser.newPage(); @@ -82,20 +72,28 @@ describe('Module Bing', function(){ await browser.close(); }); + const testLogger = 
createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + it('one keyword one page', function(){ const bingScraper = new BingScraper({ config: { search_engine_name: 'bing', throw_on_detection: true, keywords: ['test keyword'], - logger, + logger: testLogger, scrape_from_file: '', } }); bingScraper.STANDARD_TIMEOUT = 500; return bingScraper.run({page}).then(({results, metadata, num_requests}) => { assert.strictEqual(num_requests, 1, 'Must do one request'); - assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); + assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed'); }); }); @@ -105,7 +103,7 @@ describe('Module Bing', function(){ search_engine_name: 'bing', throw_on_detection: true, keywords: ['test keyword'], - logger, + logger: testLogger, scrape_from_file: '', num_pages: 3, } @@ -113,12 +111,12 @@ describe('Module Bing', function(){ bingScraper.STANDARD_TIMEOUT = 500; return bingScraper.run({page}).then(({results, metadata, num_requests}) => { assert.strictEqual(num_requests, 3, 'Must three requests'); - assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); - assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1'); + assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1'); assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); - assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title 
not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2'); assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); - assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3'); }); }); diff --git a/test/modules/google.js b/test/modules/google.js index 7587f97..83c2ae3 100644 --- a/test/modules/google.js +++ b/test/modules/google.js @@ -1,25 +1,24 @@ 'use strict'; const express = require('express'); const puppeteer = require('puppeteer'); -// TODO add a test logger in place of default winston logger -const logger = require('winston'); -const net = require('net'); +const { createLogger, transports } = require('winston'); const http = require('http'); const https = require('https'); -const url = require('url'); const assert = require('assert'); const path = require('path'); const keyCert = require('key-cert'); const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); const debug = require('debug')('se-scraper:test'); const { GoogleScraper } = require('../../src/modules/google'); const httpPort = 3012; const httpsPort = httpPort + 1; +const proxyPort = httpPort + 2; const fakeSearchEngine = express(); -fakeSearchEngine.get('/search', (req, res, next) => { +fakeSearchEngine.get('/search', (req, res) => { debug('q=%s', req.query.q); const pageNumber = ((req.query.start/10) || 0) + 1; res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); @@ -28,41 +27,31 @@ fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}) 
describe('Module Google', function(){ - let httpServerAndProxy, httpsServer; + let httpServer, httpsServer, proxy; before(async function(){ // Here mount our fake engine in both http and https listen server - httpServerAndProxy = http.createServer(fakeSearchEngine); + httpServer = http.createServer(fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine); - /** - * express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy - * here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine - */ - httpServerAndProxy.on('connect', (req, clientSocket, head) => { - const parsedUrl = url.parse('http://' + req.url); - const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; - const serverSocket = net.connect(destPort, 'localhost', () => { - debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort); - clientSocket.write('HTTP/1.1 200 Connection Established\r\n' + - 'Proxy-agent: Node.js-Proxy\r\n' + - '\r\n'); - serverSocket.write(head); - serverSocket.pipe(clientSocket); - clientSocket.pipe(serverSocket); - serverSocket.on('error', (err)=>{ - console.error(err); - }); - }); + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); + return callback(); }); - await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); + await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); debug('Fake http search engine servers started'); }); after(function(){ + proxy.close(); httpsServer.close(); - httpServerAndProxy.close(); + httpServer.close(); }); let browser; @@ -71,8 +60,9 @@ describe('Module Google', function(){ debug('Start a new browser'); browser = await puppeteer.launch({ //dumpio: true, + //headless: false, ignoreHTTPSErrors: true, - args: [ '--proxy-server=http://localhost:' + httpPort ] + args: [ '--proxy-server=http://localhost:' + proxyPort ] }); debug('Open a fresh page'); page = await browser.newPage(); @@ -82,13 +72,21 @@ describe('Module Google', function(){ await browser.close(); }); + const testLogger = createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + it('one keyword one page', function(){ const googleScraper = new GoogleScraper({ config: { search_engine_name: 'google', throw_on_detection: true, keywords: ['test keyword'], - logger, + logger: testLogger, scrape_from_file: '', } }); @@ -105,7 +103,7 @@ describe('Module Google', function(){ search_engine_name: 'google', throw_on_detection: true, keywords: ['test keyword'], - logger, + logger: testLogger, scrape_from_file: '', num_pages: 3, } diff --git a/test/proxy.js b/test/proxy.js index ef5d092..d95e1d2 100644 --- a/test/proxy.js +++ b/test/proxy.js @@ -1,16 +1,12 @@ 'use strict'; const express = require('express'); -const puppeteer = 
require('puppeteer'); -// TODO add a test logger in place of default winston logger -const logger = require('winston'); -const net = require('net'); +const { createLogger, transports } = require('winston'); const http = require('http'); const https = require('https'); -const url = require('url'); const assert = require('assert'); -const path = require('path'); const keyCert = require('key-cert'); const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); const debug = require('debug')('se-scraper:test'); const se_scraper = require('../'); @@ -18,64 +14,55 @@ const Scraper = require('../src/modules/se_scraper'); const httpPort = 3012; const httpsPort = httpPort + 1; -const httpOtherPort = httpPort + 2; +const proxyPort = httpPort + 2; const fakeSearchEngine = express(); -fakeSearchEngine.get('/test', (req, res) => { - debug(req.ip, req.ips, req.protocol, req.hostname); - debug(req.socket.localAddress, req.socket.localPort); - res.send('OK'); +fakeSearchEngine.set('trust proxy', 'loopback'); +fakeSearchEngine.get('/test-proxy', (req, res) => { + debug('fake-search-engine req.hostname=%s', req.hostname); + //debug('req to', req.socket.localAddress, req.socket.localPort); + res.send(req.hostname); }); describe('Config', function(){ - let httpServerAndProxy, httpsServer, httpOtherServer; + let httpServer, httpsServer, proxy; before(async function(){ // Here mount our fake engine in both http and https listen server - httpServerAndProxy = http.createServer(fakeSearchEngine); + httpServer = http.createServer(fakeSearchEngine); httpsServer = https.createServer(await keyCert(), fakeSearchEngine); - - /** - * express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy - * here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine - */ - httpServerAndProxy.on('connect', (req, clientSocket, head) => { - const parsedUrl = url.parse('http://' + req.url); - const destPort = 
(parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; - const serverSocket = net.connect(destPort, 'localhost', () => { - debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort); - clientSocket.write('HTTP/1.1 200 Connection Established\r\n' + - 'Proxy-agent: Node.js-Proxy\r\n' + - '\r\n'); - serverSocket.write(head); - serverSocket.pipe(clientSocket); - clientSocket.pipe(serverSocket); - serverSocket.on('error', (err)=>{ - console.error(err); - }); - }); + + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); + return callback(); }); - await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); + await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); debug('Fake http search engine servers started'); }); after(function(){ httpsServer.close(); - httpServerAndProxy.close(); + httpServer.close(); + proxy.close(); }); describe('proxies', function(){ - class MockScraper extends Scraper { + class MockScraperTestProxy extends Scraper { async load_start_page(){ return true; } async search_keyword(){ - await this.page.goto('http://void:' + httpPort + '/test'); + await this.page.goto('http://test.local:' + httpPort + '/test-proxy'); } async parse_async(){ @@ -84,29 +71,67 @@ describe('Config', function(){ } } + const testLogger = createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + /** - * Jobs will be executed 1 by 1 through the proxy + * Jobs will be executed 2 by 2 through the proxy and direct connection + 
* THIS TEST NEEDS the entry `127.0.0.1 test.local` in /etc/hosts, because Chrome bypasses the proxy for localhost even when a proxy is set */ - it('one proxy given', async function () { + it('one proxy given, use_proxies_only=false', async function () { const scrape_job = { - search_engine: MockScraper, + search_engine: MockScraperTestProxy, keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], }; var scraper = new se_scraper.ScrapeManager({ throw_on_detection: true, - proxies: ['http://localhost:' + httpPort], - use_proxies_only: true, + proxies: ['http://localhost:' + proxyPort], + // default is use_proxies_only: false, + logger: testLogger, }); await scraper.start(); const { results } = await scraper.scrape(scrape_job); - assert.strictEqual(results['news']['1'], 'OK'); - assert.strictEqual(results['some stuff']['1'], 'OK'); - assert.strictEqual(results['i work too much']['1'], 'OK'); - assert.strictEqual(results['what to do?']['1'], 'OK'); - assert.strictEqual(results['javascript is hard']['1'], 'OK'); + assert.strictEqual(results['news']['1'], 'test.local'); + assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['i work too much']['1'], 'test.local'); + assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['javascript is hard']['1'], 'test.local'); + + await scraper.quit(); + }); + + /** + * Jobs will be executed 1 by 1 through the proxy + */ + it('one proxy given, use_proxies_only=true', async function () { + + const scrape_job = { + search_engine: MockScraperTestProxy, + keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], + }; + + var scraper = new se_scraper.ScrapeManager({ + throw_on_detection: true, + proxies: ['http://localhost:' + proxyPort], + use_proxies_only: true, + logger: testLogger, + }); + await scraper.start(); + + const { results } = await scraper.scrape(scrape_job); + 
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); + assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine'); await scraper.quit(); });