From 3ab8e46126d95d8a8c3af648260390426aea5369 Mon Sep 17 00:00:00 2001 From: HugoPoi Date: Tue, 7 Jan 2020 09:48:46 +0100 Subject: [PATCH] test: add bing module test --- src/modules/bing.js | 9 +- .../{search.html => test keyword_page1.html} | 0 test/mocks/google/test keyword_page2.html | 206 ++++++++++++++++++ test/mocks/google/test keyword_page3.html | 191 ++++++++++++++++ test/modules/bing.js | 125 +++++++++++ .../google.js} | 44 +++- 6 files changed, 561 insertions(+), 14 deletions(-) rename test/mocks/google/{search.html => test keyword_page1.html} (100%) create mode 100644 test/mocks/google/test keyword_page2.html create mode 100644 test/mocks/google/test keyword_page3.html create mode 100644 test/modules/bing.js rename test/{mock_search_engine.js => modules/google.js} (57%) diff --git a/src/modules/bing.js b/src/modules/bing.js index 0cda19a..78f2d2a 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -123,12 +123,9 @@ class BingScraper extends Scraper { } } - try { - await this.page.goto(startUrl); - await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - } catch (e) { - return false; - } + await this.page.goto(startUrl); + await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); + return true; } diff --git a/test/mocks/google/search.html b/test/mocks/google/test keyword_page1.html similarity index 100% rename from test/mocks/google/search.html rename to test/mocks/google/test keyword_page1.html diff --git a/test/mocks/google/test keyword_page2.html b/test/mocks/google/test keyword_page2.html new file mode 100644 index 0000000..ab36724 --- /dev/null +++ b/test/mocks/google/test keyword_page2.html @@ -0,0 +1,206 @@ +test keyword - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Page 2 of about 234,000,000 results (0.38 seconds) 

Ad

  1. Improve Your Quality Score. Boost Your CTR. Reduce Your CPC. 5X Faster Than The Editor. Automatic Negative Keyword Generation, DKI, 24/7 Chat Support & More. Sign Up...

    People also search for

Google apps
\ No newline at end of file diff --git a/test/mocks/google/test keyword_page3.html b/test/mocks/google/test keyword_page3.html new file mode 100644 index 0000000..b27b638 --- /dev/null +++ b/test/mocks/google/test keyword_page3.html @@ -0,0 +1,191 @@ +test keyword - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Page 3 of about 234,000,000 results (0.54 seconds) 
Google apps
\ No newline at end of file diff --git a/test/modules/bing.js b/test/modules/bing.js new file mode 100644 index 0000000..ec76cde --- /dev/null +++ b/test/modules/bing.js @@ -0,0 +1,125 @@ +'use strict'; +const express = require('express'); +const puppeteer = require('puppeteer'); +// TODO add a test logger in place of default winston logger +const logger = require('winston'); +const net = require('net'); +const http = require('http'); +const https = require('https'); +const url = require('url'); +const assert = require('assert'); +const path = require('path'); +const keyCert = require('key-cert'); +const Promise = require('bluebird'); + +const debug = require('debug')('se-scraper:test'); +const { BingScraper } = require('../../src/modules/bing'); + +const httpPort = 3012; +const httpsPort = httpPort + 1; + +const fakeSearchEngine = express(); +fakeSearchEngine.get('/search', (req, res, next) => { + debug('q=%s', req.query.q); + const pageNumber = ((req.query.start/10) || 0) + 1; + res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html')); +}); +fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']})); + +describe('Module Bing', function(){ + + let httpServerAndProxy, httpsServer; + before(async function(){ + // Here mount our fake engine in both http and https listen server + httpServerAndProxy = http.createServer(fakeSearchEngine); + httpsServer = https.createServer(await keyCert(), fakeSearchEngine); + + /** + * express doesn't handle HTTP CONNECT method, this implement a basic MITM http proxy + * here we use our http server to also act as a http proxy and rewrite all http/s request to our fake engine + */ + httpServerAndProxy.on('connect', (req, clientSocket, head) => { + const parsedUrl = url.parse('http://' + req.url); + const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort; + const serverSocket = net.connect(destPort, 'localhost', () => { + debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort); + clientSocket.write('HTTP/1.1 200 Connection Established\r\n' + + 'Proxy-agent: Node.js-Proxy\r\n' + + '\r\n'); + serverSocket.write(head); + serverSocket.pipe(clientSocket); + clientSocket.pipe(serverSocket); + serverSocket.on('error', (err)=>{ + console.error(err); + }); + }); + }); + + await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort); + await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); + debug('Fake http search engine servers started'); + }); + + after(function(){ + httpsServer.close(); + httpServerAndProxy.close(); + }); + + let browser; + let page; + beforeEach(async function(){ + debug('Start a new browser'); + browser = await puppeteer.launch({ + //dumpio: true, + ignoreHTTPSErrors: true, + args: [ '--proxy-server=http://localhost:' + httpPort ] + }); + debug('Open a fresh page'); + page = await browser.newPage(); + }); + + afterEach(async function(){ + await browser.close(); + }); + + it('one keyword one page', function(){ + const bingScraper = new BingScraper({ + config: { + search_engine_name: 'bing', + throw_on_detection: true, + keywords: ['test keyword'], + logger, + scrape_from_file: '', + } + }); + bingScraper.STANDARD_TIMEOUT = 500; + return bingScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 1, 'Must do one request'); + assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); + }); + }); + + it('one keyword 3 pages', function () { + const bingScraper = new BingScraper({ + config: { + search_engine_name: 'bing', + throw_on_detection: true, + keywords: ['test keyword'], + logger, + scrape_from_file: '', + num_pages: 3, + } + }); + bingScraper.STANDARD_TIMEOUT = 500; + return bingScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 3, 'Must three requests'); + assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); + assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); + assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); + assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); + }); + }); + +}); \ No newline at end of file diff --git a/test/mock_search_engine.js b/test/modules/google.js similarity index 57% rename from test/mock_search_engine.js rename to test/modules/google.js index 48493fe..7587f97 100644 --- a/test/mock_search_engine.js +++ b/test/modules/google.js @@ -1,22 +1,28 @@ +'use strict'; const express = require('express'); const puppeteer = require('puppeteer'); +// TODO add a test logger in place of default winston logger const logger = require('winston'); const net = require('net'); const http = require('http'); const https = require('https'); const url = require('url'); +const assert = require('assert'); +const path = require('path'); const keyCert = require('key-cert'); const Promise = require('bluebird'); const debug = require('debug')('se-scraper:test'); -const { GoogleScraper } = require('../src/modules/google'); +const { GoogleScraper } = require('../../src/modules/google'); const httpPort = 3012; const httpsPort = httpPort + 1; const fakeSearchEngine = express(); -fakeSearchEngine.get("/about", (req, res) => { - res.status(500).send("This is the About page"); +fakeSearchEngine.get('/search', (req, res, next) => { + debug('q=%s', req.query.q); + const pageNumber = ((req.query.start/10) || 0) + 1; + res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); }); fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); @@ -76,7 +82,7 @@ describe('Module Google', function(){ await browser.close(); }); - it('one keyword', function(){ + it('one keyword one page', function(){ const googleScraper = new GoogleScraper({ config: { search_engine_name: 'google', @@ -84,14 +90,36 @@ describe('Module Google', function(){ keywords: ['test keyword'], logger, scrape_from_file: '', - google_settings: { - //start_url: 'http://www.google.com/' - } } }); googleScraper.STANDARD_TIMEOUT = 500; - return googleScraper.run({page}); + return googleScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 1, 'Must do one request'); + assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); + }); }); + it('one keyword 3 pages', function () { + const googleScraper = new GoogleScraper({ + config: { + search_engine_name: 'google', + throw_on_detection: true, + keywords: ['test keyword'], + logger, + scrape_from_file: '', + num_pages: 3, + } + }); + googleScraper.STANDARD_TIMEOUT = 500; + return googleScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 3, 'Must three requests'); + assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); + assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); + assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1'); + assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); + assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); + }); + }); }); \ No newline at end of file