'use strict';

// scraper module under test
const se_scraper = require('./../index.js');

// chai supplies the assertions; the chai-string plugin is registered on top of it
const chai = require('chai');
const chai_string = require('chai-string');
chai.use(chai_string);
const { assert } = chai;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */
// keywords scraped by the normal search test; each of them must appear in the results
const normal_search_keywords = [
    'apple tree',
    'weather tomorrow',
];
/**
 * Scrapes three Google result pages for every normal search keyword and
 * passes the response to normal_search_test_case() for validation.
 */
async function normal_search_test() {
    const browser_config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    const scrape_job = {
        search_engine: 'google',
        keywords: normal_search_keywords,
        num_pages: 3,
    };

    console.log('normal_search_test()');

    const response = await se_scraper.scrape(browser_config, scrape_job);
    normal_search_test_case(response);
}
// the awaited scraper response is passed to this handler, which runs the assertions
/**
 * Asserts that the response of the normal search test is well formed.
 *
 * Checks the request count, that every keyword was scraped and, for each
 * result page, the page metadata plus every SERP entry (link, visible_link,
 * title, snippet and a rank that keeps counting across the pages of a keyword).
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.metadata.num_requests and response.results[keyword][page_number].
 * @throws {AssertionError} when any expectation is not met.
 */
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 6);

    // Checked once, up front: inside the loop it would never run for an empty result set.
    assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        // the expected rank is not reset between the pages of one keyword
        let total_rank = 1;

        for (const page_number in response.results[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number; only NaN signals an unparsable date
            assert(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            for (const res of obj.results) {
                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank ist wrong');
            }
        }
    }
}
// nonsense keyword for which the no-results test expects an empty result page
const keywords_no_results = [
    'fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',
];

/**
 * Scrapes one Google result page for the no-results keyword and passes the
 * response to test_case_no_results() for validation.
 */
async function no_results_test() {
    const browser_config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    const scrape_job = {
        search_engine: 'google',
        keywords: keywords_no_results,
        num_pages: 1,
    };

    console.log('no_results_test()');

    const response = await se_scraper.scrape(browser_config, scrape_job);
    test_case_no_results(response);
}
// the awaited scraper response is passed to this handler, which runs the assertions
/**
 * Asserts that the response of the no-results test describes an empty page.
 *
 * Every scraped page must carry zero SERP entries, no_results === true, an
 * empty num_results and a parsable time.
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.metadata.num_requests and response.results[keyword][page_number].
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    // Checked once, up front: inside the loop it would never run for an empty result set.
    assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be a empty string');
            // Date.parse() always returns a number; only NaN signals an unparsable date
            assert(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
        }
    }
}
// misspelled keyword; the effective-query test expects a different effective query to be reported for it
const effective_query_keywords = [
    'mount evverrest',
];
/**
 * Scrapes one Google result page for the misspelled keyword and passes the
 * response to test_case_effective_query() for validation.
 */
async function effective_query_test() {
    const browser_config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    const scrape_job = {
        search_engine: 'google',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('effective_query_test()');

    const response = await se_scraper.scrape(browser_config, scrape_job);
    test_case_effective_query(response);
}
// the awaited scraper response is passed to this handler, which runs the assertions
/**
 * Asserts that the response of the effective-query test is well formed.
 *
 * Besides the usual page metadata, every page must report a non-empty
 * effective_query that differs from the keyword that was searched.
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.metadata.num_requests and response.results[keyword][page_number].
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    // Checked once, up front: inside the loop it would never run for an empty result set.
    assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            // effective query must be different to the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number; only NaN signals an unparsable date
            assert(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
        }
    }
}

/**
 * Runs the normal search scrape with html_output switched on, then validates
 * the parsed results (normal_search_test_case) and the raw html that came
 * back with them (check_html_output_test_case).
 */
async function html_output_query_test() {
    const browser_config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    const scrape_job = {
        search_engine: 'google',
        keywords: normal_search_keywords,
        num_pages: 3,
        html_output: true,
    };

    const response = await se_scraper.scrape(browser_config, scrape_job);

    normal_search_test_case(response);
    check_html_output_test_case(response);
}
/**
 * Asserts that the response carries raw html for every normal search keyword
 * and that each stored page starts like an html document.
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.html_output[keyword][page_number].
 * @throws {AssertionError} when html_output is missing or any page is malformed.
 */
function check_html_output_test_case(response) {
    // `for...in` over a missing html_output iterates nothing, which would let
    // the whole check pass silently, so its presence is asserted first
    assert.isOk(response.html_output, 'html_output is missing from the response');
    assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');

    for (const query in response.html_output) {
        for (const page_number in response.html_output[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
        }
    }
}

// keywords for which the ads test expects at least one top or bottom ad
const ads_keywords = [
    'cloud services',
    'auto kaufen',
];

/**
 * Scrapes one Google result page per ads keyword (assets are not blocked)
 * and passes the response to test_case_ads_test() for validation.
 */
async function ads_test() {
    const browser_config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        // dont try to trick google with ads
        random_user_agent: false,
    };

    const scrape_job = {
        search_engine: 'google',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');

    const response = await se_scraper.scrape(browser_config, scrape_job);
    test_case_ads_test(response);
}
/**
 * Asserts that the response of the ads test is well formed.
 *
 * Every page must carry the usual metadata, at least one top or bottom ad,
 * and each ad must have non-trivial tracking_link, visible_link, link, title
 * and snippet strings plus a `links` array.
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.metadata.num_requests and response.results[keyword][page_number].
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_ads_test(response) {
    // [field, minimum length] pairs every ad object has to satisfy
    const required_ad_strings = [
        ['tracking_link', 5],
        ['visible_link', 5],
        ['link', 5],
        ['title', 10],
        ['snippet', 10],
    ];

    assert.equal(response.metadata.num_requests, 2);

    // Checked once, up front: inside the loop it would never run for an empty result set.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number; only NaN signals an unparsable date
            assert(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');

            // top and bottom ads are validated identically, top ads first
            for (const res of [...obj.top_ads, ...obj.bottom_ads]) {
                for (const [field, min_length] of required_ad_strings) {
                    // the failure message names the field that was actually checked
                    assert.isOk(res[field], `${field} must be ok`);
                    assert.typeOf(res[field], 'string', `${field} must be string`);
                    assert.isAtLeast(res[field].length, min_length, `${field} must have at least ${min_length} chars`);
                }

                assert.typeOf(res.links, 'array', 'links must be array');
            }
        }
    }
}
// keyword for which the products test expects product listings
const product_keywords = ['autoreifen bmw'];

/**
 * Scrapes one Google result page per product keyword (assets are not blocked)
 * and passes the response to test_case_products_test() for validation.
 *
 * The product keywords are scraped here; the ads keywords used previously
 * were a copy-paste slip that left product_keywords unused.
 */
async function products_test() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false, // dont try to trick google with ads
    };

    const scrape_config = {
        search_engine: 'google',
        keywords: product_keywords,
        num_pages: 1,
    };

    console.log('products_test()');
    test_case_products_test(await se_scraper.scrape(config, scrape_config));
}

/**
 * Asserts that the response of the products test is well formed.
 *
 * Every page must carry the usual metadata, at least one top or right product,
 * and each product must have non-trivial tracking_link, link, price, title
 * and vendor_link strings.
 *
 * @param {Object} response - object returned by se_scraper.scrape(); read via
 *     response.metadata.num_requests and response.results[keyword][page_number].
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_products_test(response) {
    // [field, minimum length] pairs every product object has to satisfy
    const required_product_strings = [
        ['tracking_link', 5],
        ['link', 5],
        ['price', 5],
        ['title', 10],
        ['vendor_link', 10],
    ];

    // one page is requested per product keyword
    assert.equal(response.metadata.num_requests, product_keywords.length);

    // Checked once, up front: inside the loop it would never run for an empty result set.
    assert.containsAllKeys(response.results, product_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() returns NaN for garbage and NaN is still a number,
            // so the key itself is matched instead
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');

            const obj = response.results[query][page_number];

            // top_products / right_products are dereferenced below, so they are
            // required here to fail with a readable message instead of a TypeError
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places', 'top_products', 'right_products'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number; only NaN signals an unparsable date
            assert(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');

            // top and right products are validated identically, top products first
            for (const res of [...obj.top_products, ...obj.right_products]) {
                for (const [field, min_length] of required_product_strings) {
                    // the failure message names the field that was actually checked
                    assert.isOk(res[field], `${field} must be ok`);
                    assert.typeOf(res[field], 'string', `${field} must be string`);
                    assert.isAtLeast(res[field].length, min_length, `${field} must have at least ${min_length} chars`);
                }
            }
        }
    }
}

// mocha suite that registers every Google scenario defined in this file
describe('Google', function () {
    // a regular function (not an arrow) is needed so that `this` is the mocha context
    this.timeout(30000);

    // [test title, test function], registered in this order
    const scenarios = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['html output query', html_output_query_test],
        ['ads', ads_test],
        ['products test', products_test],
    ];

    for (const [title, run] of scenarios) {
        it(title, run);
    }
});