2020-01-07 09:48:46 +01:00
'use strict' ;
2020-01-03 19:21:34 +01:00
const express = require ( 'express' ) ;
const puppeteer = require ( 'puppeteer' ) ;
2020-01-08 14:40:28 +01:00
const { createLogger , transports } = require ( 'winston' ) ;
2020-01-03 19:21:34 +01:00
const http = require ( 'http' ) ;
const https = require ( 'https' ) ;
2020-01-07 09:48:46 +01:00
const assert = require ( 'assert' ) ;
const path = require ( 'path' ) ;
2020-01-03 19:21:34 +01:00
const keyCert = require ( 'key-cert' ) ;
const Promise = require ( 'bluebird' ) ;
2020-01-08 14:40:28 +01:00
const Proxy = require ( 'http-mitm-proxy' ) ;
2020-01-03 19:21:34 +01:00
const debug = require ( 'debug' ) ( 'se-scraper:test' ) ;
2020-01-07 09:48:46 +01:00
const { GoogleScraper } = require ( '../../src/modules/google' ) ;
2020-01-03 19:21:34 +01:00
const httpPort = 3012 ;
const httpsPort = httpPort + 1 ;
2020-01-08 14:40:28 +01:00
const proxyPort = httpPort + 2 ;
2020-01-03 19:21:34 +01:00
// Fake search engine: serves pre-recorded Google result pages from the mocks
// directory, keyed by the query string and the requested page number.
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    // `start` is Google's result offset (0, 10, 20, …); missing/0 maps to page 1.
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', { extensions: ['html'] }));
describe('Module Google', function () {

    let httpServer;
    let httpsServer;
    let proxy;

    before(async function () {
        // Mount the fake engine on both an HTTP and an HTTPS listener.
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // The MITM proxy redirects every browser request to the fake engine,
        // whatever host the page actually asked for. The extra header lets
        // the mock (or debugging output) see that traffic was proxied.
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
            return callback();
        });

        // listen() on these servers is callback-based; promisify so the hook
        // only resolves once all three are accepting connections.
        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
        await Promise.promisify(httpServer.listen, { context: httpServer })(httpPort);
        await Promise.promisify(httpsServer.listen, { context: httpsServer })(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function () {
        proxy.close();
        httpsServer.close();
        httpServer.close();
    });

    let browser;
    let page;

    // A fresh browser per test keeps state (cookies, cache) from leaking
    // between test cases.
    beforeEach(async function () {
        debug('Start a new browser');
        browser = await puppeteer.launch({
            //dumpio: true,
            //headless: false,
            ignoreHTTPSErrors: true,
            args: ['--proxy-server=http://localhost:' + proxyPort]
        });
        debug('Open a fresh page');
        page = await browser.newPage();
    });

    afterEach(async function () {
        await browser.close();
    });

    // Only surface scraper errors during the test run.
    const testLogger = createLogger({
        transports: [
            new transports.Console({
                level: 'error'
            })
        ]
    });

    it('one keyword one page', function () {
        const googleScraper = new GoogleScraper({
            config: {
                search_engine_name: 'google',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
            }
        });
        // Shorten internal waits so the test stays fast against the mock.
        googleScraper.STANDARD_TIMEOUT = 500;
        return googleScraper.run({ page }).then(({ results, metadata, num_requests }) => {
            assert.strictEqual(num_requests, 1, 'Must do one request');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
        });
    });

    it('one keyword 3 pages', function () {
        const googleScraper = new GoogleScraper({
            config: {
                search_engine_name: 'google',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
                num_pages: 3,
            }
        });
        // Shorten internal waits so the test stays fast against the mock.
        googleScraper.STANDARD_TIMEOUT = 500;
        return googleScraper.run({ page }).then(({ results, metadata, num_requests }) => {
            assert.strictEqual(num_requests, 3, 'Must do three requests');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
            assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
        });
    });

});