Merge branch 'master' of github.com:NikolaiT/se-scraper

This commit is contained in: branchy
Nikolai Tschacher 2020-05-17 22:06:57 +02:00
commit 5a0eea201d
59 changed files with 2705 additions and 3187 deletions

.gitignore (vendored, 2 lines changed)

@ -79,3 +79,5 @@ typings/
.idea/
GoogleScraperPup.iml
.http-mitm-proxy

.gitmodules (vendored, 3 lines changed)

@ -1,3 +0,0 @@
[submodule "src/puppeteer-cluster"]
path = src/puppeteer-cluster
url = https://github.com/NikolaiT/puppeteer-cluster

View File

@ -32,12 +32,6 @@ let browser_config = {
verbose: true,
// whether to start the browser in headless mode
headless: true,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
is_local: false,
throw_on_detection: false,
puppeteer_cluster_config: {

View File

@ -30,12 +30,6 @@ let browser_config = {
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
headless: true,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
is_local: false,
throw_on_detection: false,
puppeteer_cluster_config: {

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: '',
};

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 2,
output_file: 'examples/results/gnold.json',
google_news_old_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/maps.json',
test_evasion: false,
block_assets: false,

View File

@ -3,7 +3,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
search_engine: 'google',
debug_level: 1,
random_user_agent: true,
is_local: false,
html_output: false,

View File

@ -5,7 +5,6 @@ const se_scraper = require('./../src/node_scraper.js');
random_user_agent: true,
write_meta_data: true,
sleep_range: '[1,1]',
debug_level: 1,
headless: true,
output_file: `examples/results/multiple_search_engines.json`
};

View File

@ -3,7 +3,6 @@ const resolve = require('path').resolve;
(async () => {
let browser_config = {
debug_level: 1,
test_evasion: false,
log_http_headers: true,
log_ip_address: true,

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/proxyresults.json',
log_ip_address: true,
// a file with one proxy per line. Example:

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
test_evasion: false,
log_http_headers: false,
log_ip_address: false,

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/data.json',
};
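
Every example diff above removes the same line: the numeric debug_level option in browser_config. Verbosity is now driven by the winston logger that node_scraper.js installs by default (see its diff further below) and by the debug package. A minimal sketch of the new style, assuming a hypothetical example file:

const { createLogger, transports } = require('winston');
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        // instead of debug_level, pass a winston logger (or keep the default 'info' one)
        logger: createLogger({ level: 'warn', transports: [new transports.Console()] }),
    };
    let scrape_job = {
        search_engine: 'google',
        keywords: ['news'],
        num_pages: 1,
    };
    let scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();
    let results = await scraper.scrape(scrape_job);
    console.dir(results, { depth: null, colors: true });
    await scraper.quit();
})();

Module-level tracing comes from the debug package, e.g. DEBUG=se-scraper:* node examples/my_example.js (the filename is hypothetical).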

package-lock.json (generated, 478 lines changed)

@ -26,6 +26,7 @@
"version": "1.3.7",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz",
"integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==",
"dev": true,
"requires": {
"mime-types": "~2.1.24",
"negotiator": "0.6.2"
@ -77,7 +78,8 @@
"array-flatten": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
"integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
"integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=",
"dev": true
},
"assertion-error": {
"version": "1.1.0",
@ -85,6 +87,14 @@
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==",
"dev": true
},
"async": {
"version": "2.6.3",
"resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
"integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
"requires": {
"lodash": "^4.17.14"
}
},
"async-limiter": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz",
@ -95,10 +105,17 @@
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
"integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c="
},
"bluebird": {
"version": "3.7.2",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz",
"integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==",
"dev": true
},
"body-parser": {
"version": "1.19.0",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz",
"integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==",
"dev": true,
"requires": {
"bytes": "3.1.0",
"content-type": "~1.0.4",
@ -116,6 +133,7 @@
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"dev": true,
"requires": {
"ms": "2.0.0"
}
@ -123,7 +141,8 @@
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=",
"dev": true
}
}
},
@ -155,7 +174,8 @@
"bytes": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz",
"integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg=="
"integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==",
"dev": true
},
"cacheable-request": {
"version": "6.0.0",
@ -219,6 +239,12 @@
}
}
},
"charenc": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz",
"integrity": "sha1-wKHS86cJLgN3S/qD8UwPxXkKhmc=",
"dev": true
},
"check-error": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz",
@ -275,11 +301,19 @@
"integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=",
"dev": true
},
"color": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/color/-/color-3.0.0.tgz",
"integrity": "sha512-jCpd5+s0s0t7p3pHQKpnJ0TpQKKdleP71LWcA0aqiljpiuAkOSUFN/dyH8ZwF0hRmFlrIuRhufds1QyEP9EB+w==",
"requires": {
"color-convert": "^1.9.1",
"color-string": "^1.5.2"
}
},
"color-convert": {
"version": "1.9.3",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
"dev": true,
"requires": {
"color-name": "1.1.3"
}
@ -287,8 +321,35 @@
"color-name": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=",
"dev": true
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU="
},
"color-string": {
"version": "1.5.3",
"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.5.3.tgz",
"integrity": "sha512-dC2C5qeWoYkxki5UAXapdjqO672AM4vZuPGRQfO8b5HKuKGBbKWpITyDYN7TOFKvRW7kOgAn3746clDBMDJyQw==",
"requires": {
"color-name": "^1.0.0",
"simple-swizzle": "^0.2.2"
}
},
"colornames": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/colornames/-/colornames-1.1.1.tgz",
"integrity": "sha1-+IiQMGhcfE/54qVZ9Qd+t2qBb5Y="
},
"colors": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz",
"integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA=="
},
"colorspace": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.2.tgz",
"integrity": "sha512-vt+OoIP2d76xLhjwbBaucYlNSpPsrJWPlBTtwCpQKIu6/CSMutyzX93O/Do0qzpH3YoHEes8YEFXyZ797rEhzQ==",
"requires": {
"color": "3.0.x",
"text-hex": "1.0.x"
}
},
"concat-map": {
"version": "0.0.1",
@ -297,7 +358,7 @@
},
"concat-stream": {
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "^1.0.0",
@ -308,7 +369,7 @@
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
@ -322,7 +383,7 @@
},
"string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
@ -334,6 +395,7 @@
"version": "0.5.3",
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz",
"integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==",
"dev": true,
"requires": {
"safe-buffer": "5.1.2"
}
@ -341,17 +403,20 @@
"content-type": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==",
"dev": true
},
"cookie": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz",
"integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg=="
"integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg==",
"dev": true
},
"cookie-signature": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
"integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
"integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=",
"dev": true
},
"core-util-is": {
"version": "1.0.2",
@ -371,6 +436,12 @@
"which": "^1.2.9"
}
},
"crypt": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz",
"integrity": "sha1-iNf/fsDfuG9xPch7u0LQRNPmxBs=",
"dev": true
},
"css-select": {
"version": "1.2.0",
"resolved": "http://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz",
@ -440,12 +511,24 @@
"depd": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
"integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
"integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=",
"dev": true
},
"destroy": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
"integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
"integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=",
"dev": true
},
"diagnostics": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/diagnostics/-/diagnostics-1.1.1.tgz",
"integrity": "sha512-8wn1PmdunLJ9Tqbx+Fx/ZEuHfJf4NKSN2ZBj7SJC/OWRWha843+WsTjqMe1B5E3p28jqBlp+mJ2fPVxPyNgYKQ==",
"requires": {
"colorspace": "1.1.x",
"enabled": "1.0.x",
"kuler": "1.0.x"
}
},
"diff": {
"version": "3.5.0",
@ -506,7 +589,8 @@
"ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
"integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
"integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=",
"dev": true
},
"emoji-regex": {
"version": "7.0.3",
@ -514,10 +598,19 @@
"integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
"dev": true
},
"enabled": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/enabled/-/enabled-1.0.2.tgz",
"integrity": "sha1-ll9lE9LC0cX0ZStkouM5ZGf8L5M=",
"requires": {
"env-variable": "0.0.x"
}
},
"encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
"integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
"integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=",
"dev": true
},
"end-of-stream": {
"version": "1.4.1",
@ -532,6 +625,11 @@
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
},
"env-variable": {
"version": "0.0.5",
"resolved": "https://registry.npmjs.org/env-variable/-/env-variable-0.0.5.tgz",
"integrity": "sha512-zoB603vQReOFvTg5xMl9I1P2PnHsHQQKTEowsKKD7nseUfJq6UWzK+4YtlWUO1nhiQUxe6XMkk+JleSZD1NZFA=="
},
"es-abstract": {
"version": "1.13.0",
"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz",
@ -564,7 +662,7 @@
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "^4.0.3"
@ -573,7 +671,8 @@
"escape-html": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
"integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
"integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=",
"dev": true
},
"escape-string-regexp": {
"version": "1.0.5",
@ -590,7 +689,8 @@
"etag": {
"version": "1.8.1",
"resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
"integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
"integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=",
"dev": true
},
"execa": {
"version": "1.0.0",
@ -611,6 +711,7 @@
"version": "4.17.1",
"resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz",
"integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==",
"dev": true,
"requires": {
"accepts": "~1.3.7",
"array-flatten": "1.1.1",
@ -648,6 +749,7 @@
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"dev": true,
"requires": {
"ms": "2.0.0"
}
@ -655,7 +757,8 @@
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=",
"dev": true
}
}
},
@ -685,6 +788,11 @@
}
}
},
"fast-safe-stringify": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz",
"integrity": "sha512-Utm6CdzT+6xsDk2m8S6uL8VHxNwI6Jub+e9NYTcAms28T84pTa25GJQV9j0CY0N1rM8hK4x6grpF2BQf+2qwVA=="
},
"fd-slicer": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
@ -693,10 +801,16 @@
"pend": "~1.2.0"
}
},
"fecha": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/fecha/-/fecha-2.3.3.tgz",
"integrity": "sha512-lUGBnIamTAwk4znq5BcqsDaxSmZ9nDVJaij6NvRt/Tg4R69gERA+otPKbS86ROw9nxVMw2/mp1fnaiWqbs6Sdg=="
},
"finalhandler": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz",
"integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==",
"dev": true,
"requires": {
"debug": "2.6.9",
"encodeurl": "~1.0.2",
@ -711,6 +825,7 @@
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"dev": true,
"requires": {
"ms": "2.0.0"
}
@ -718,7 +833,8 @@
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=",
"dev": true
}
}
},
@ -756,12 +872,14 @@
"forwarded": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
"integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
"integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=",
"dev": true
},
"fresh": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
"integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
"integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=",
"dev": true
},
"fs.realpath": {
"version": "1.0.0",
@ -880,6 +998,7 @@
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz",
"integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==",
"dev": true,
"requires": {
"depd": "~1.1.2",
"inherits": "2.0.3",
@ -888,6 +1007,34 @@
"toidentifier": "1.0.0"
}
},
"http-mitm-proxy": {
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz",
"integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==",
"dev": true,
"requires": {
"async": "^2.6.2",
"debug": "^4.1.0",
"mkdirp": "^0.5.1",
"node-forge": "^0.8.4",
"optimist": "^0.6.1",
"semaphore": "^1.1.0",
"ws": "^3.2.0"
},
"dependencies": {
"ws": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz",
"integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==",
"dev": true,
"requires": {
"async-limiter": "~1.0.0",
"safe-buffer": "~5.1.0",
"ultron": "~1.1.0"
}
}
}
},
"https-proxy-agent": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
@ -911,6 +1058,7 @@
"version": "0.4.24",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
"integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
"dev": true,
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
@ -938,7 +1086,13 @@
"ipaddr.js": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.0.tgz",
"integrity": "sha512-M4Sjn6N/+O6/IXSJseKqHoFc+5FdGJ22sXqnjTpdZweHK64MzEPAyQZyEU3R/KRv2GLoa7nNtg/C2Ev6m7z+eA=="
"integrity": "sha512-M4Sjn6N/+O6/IXSJseKqHoFc+5FdGJ22sXqnjTpdZweHK64MzEPAyQZyEU3R/KRv2GLoa7nNtg/C2Ev6m7z+eA==",
"dev": true
},
"is-arrayish": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
},
"is-buffer": {
"version": "2.0.3",
@ -989,8 +1143,7 @@
"is-stream": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz",
"integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=",
"dev": true
"integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ="
},
"is-symbol": {
"version": "1.0.2",
@ -1032,6 +1185,15 @@
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz",
"integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg="
},
"key-cert": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/key-cert/-/key-cert-1.0.1.tgz",
"integrity": "sha512-WiaPESfEzsztL9KIxbX6mNAU34NcEOyLVrpajrTkXeVc2tAZDx3lcLQlIE+bUqEoaIl0InBoiIy6C5ToLJ7i0g==",
"dev": true,
"requires": {
"pem": "^1.12.5"
}
},
"keyv": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz",
@ -1055,6 +1217,14 @@
}
}
},
"kuler": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/kuler/-/kuler-1.0.1.tgz",
"integrity": "sha512-J9nVUucG1p/skKul6DU3PUZrhs0LPulNaeUOox0IyXDi8S4CztTHs1gQphhuZmzXG7VOQSf6NJfKuzteQLv9gQ==",
"requires": {
"colornames": "^1.1.1"
}
},
"lazy-cache": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
@ -1098,6 +1268,18 @@
"chalk": "^2.0.1"
}
},
"logform": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/logform/-/logform-2.1.2.tgz",
"integrity": "sha512-+lZh4OpERDBLqjiwDLpAWNQu6KMjnlXH2ByZwCuSqVPJletw0kTWJf5CgSNAUKn1KUkv3m2cUz/LK8zyEy7wzQ==",
"requires": {
"colors": "^1.2.1",
"fast-safe-stringify": "^2.0.4",
"fecha": "^2.3.3",
"ms": "^2.1.1",
"triple-beam": "^1.3.0"
}
},
"lowercase-keys": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
@ -1112,10 +1294,30 @@
"p-defer": "^1.0.0"
}
},
"md5": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/md5/-/md5-2.2.1.tgz",
"integrity": "sha1-U6s41f48iJG6RlMp6iP6wFQBJvk=",
"dev": true,
"requires": {
"charenc": "~0.0.1",
"crypt": "~0.0.1",
"is-buffer": "~1.1.1"
},
"dependencies": {
"is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==",
"dev": true
}
}
},
"media-typer": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
"integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
"integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=",
"dev": true
},
"mem": {
"version": "4.3.0",
@ -1141,12 +1343,14 @@
"merge-descriptors": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
"integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
"integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=",
"dev": true
},
"methods": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
"integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
"integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=",
"dev": true
},
"mime": {
"version": "2.4.4",
@ -1154,16 +1358,18 @@
"integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA=="
},
"mime-db": {
"version": "1.40.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz",
"integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA=="
"version": "1.42.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.42.0.tgz",
"integrity": "sha512-UbfJCR4UAVRNgMpfImz05smAXK7+c+ZntjaA26ANtkXLlOe947Aag5zdIcKQULAiF9Cq4WxBi9jUs5zkA84bYQ==",
"dev": true
},
"mime-types": {
"version": "2.1.24",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz",
"integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==",
"version": "2.1.25",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.25.tgz",
"integrity": "sha512-5KhStqB5xpTAeGqKBAMgwaYMnQik7teQN4IAzC7npDv6kzeU6prfkR67bc87J1kWMPGkoaZSq1npmexMgkmEVg==",
"dev": true,
"requires": {
"mime-db": "1.40.0"
"mime-db": "1.42.0"
}
},
"mimic-fn": {
@ -1278,7 +1484,8 @@
"negotiator": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
"integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw=="
"integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==",
"dev": true
},
"nice-try": {
"version": "1.0.5",
@ -1296,6 +1503,12 @@
"semver": "^5.7.0"
}
},
"node-forge": {
"version": "0.8.5",
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
"integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==",
"dev": true
},
"normalize-url": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
@ -1356,6 +1569,7 @@
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
"integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
"dev": true,
"requires": {
"ee-first": "1.1.1"
}
@ -1368,6 +1582,21 @@
"wrappy": "1"
}
},
"one-time": {
"version": "0.0.4",
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
},
"optimist": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
"dev": true,
"requires": {
"minimist": "~0.0.1",
"wordwrap": "~0.0.2"
}
},
"os-locale": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
@ -1379,6 +1608,12 @@
"mem": "^4.0.0"
}
},
"os-tmpdir": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz",
"integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=",
"dev": true
},
"p-cancelable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz",
@ -1437,7 +1672,8 @@
"parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
"integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="
"integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==",
"dev": true
},
"path-exists": {
"version": "3.0.0",
@ -1459,7 +1695,8 @@
"path-to-regexp": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=",
"dev": true
},
"pathval": {
"version": "1.1.0",
@ -1467,6 +1704,26 @@
"integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA=",
"dev": true
},
"pem": {
"version": "1.14.3",
"resolved": "https://registry.npmjs.org/pem/-/pem-1.14.3.tgz",
"integrity": "sha512-Q+AMVMD3fzeVvZs5PHeI+pVt0hgZY2fjhkliBW43qyONLgCXPVk1ryim43F9eupHlNGLJNT5T/NNrzhUdiC5Zg==",
"dev": true,
"requires": {
"es6-promisify": "^6.0.0",
"md5": "^2.2.1",
"os-tmpdir": "^1.0.1",
"which": "^1.3.1"
},
"dependencies": {
"es6-promisify": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-6.0.2.tgz",
"integrity": "sha512-eO6vFm0JvqGzjWIQA6QVKjxpmELfhWbDUWHm1rPfIbn55mhKPiAa5xpLmQWJrNa629ZIeQ8ZvMAi13kvrjK6Mg==",
"dev": true
}
}
},
"pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
@ -1491,6 +1748,7 @@
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.5.tgz",
"integrity": "sha512-t/7RxHXPH6cJtP0pRG6smSr9QJidhB+3kXu0KgXnbGYMgzEnUxRQ4/LDdfOwZEMyIh3/xHb8PX3t+lfL9z+YVQ==",
"dev": true,
"requires": {
"forwarded": "~0.1.2",
"ipaddr.js": "1.9.0"
@ -1525,6 +1783,14 @@
"ws": "^6.1.0"
}
},
"puppeteer-cluster": {
"version": "0.18.0",
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz",
"integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==",
"requires": {
"debug": "^4.1.1"
}
},
"puppeteer-extra": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
@ -1605,17 +1871,20 @@
"qs": {
"version": "6.7.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz",
"integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ=="
"integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ==",
"dev": true
},
"range-parser": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
"integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="
"integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==",
"dev": true
},
"raw-body": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz",
"integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==",
"dev": true,
"requires": {
"bytes": "3.1.0",
"http-errors": "1.7.2",
@ -1669,7 +1938,14 @@
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true
},
"semaphore": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz",
"integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==",
"dev": true
},
"semver": {
"version": "5.7.0",
@ -1681,6 +1957,7 @@
"version": "0.17.1",
"resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz",
"integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==",
"dev": true,
"requires": {
"debug": "2.6.9",
"depd": "~1.1.2",
@ -1701,6 +1978,7 @@
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"dev": true,
"requires": {
"ms": "2.0.0"
},
@ -1708,14 +1986,16 @@
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=",
"dev": true
}
}
},
"mime": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg=="
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==",
"dev": true
}
}
},
@ -1723,6 +2003,7 @@
"version": "1.14.1",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz",
"integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==",
"dev": true,
"requires": {
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
@ -1739,7 +2020,8 @@
"setprototypeof": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz",
"integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw=="
"integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==",
"dev": true
},
"shallow-clone": {
"version": "0.1.2",
@ -1793,16 +2075,30 @@
"integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=",
"dev": true
},
"simple-swizzle": {
"version": "0.2.2",
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
"integrity": "sha1-pNprY1/8zMoz9w0Xy5JZLeleVXo=",
"requires": {
"is-arrayish": "^0.3.1"
}
},
"sprintf-js": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
"dev": true
},
"stack-trace": {
"version": "0.0.10",
"resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
"integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA="
},
"statuses": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
"integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow="
"integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=",
"dev": true
},
"string-width": {
"version": "2.1.1",
@ -1859,6 +2155,11 @@
"has-flag": "^3.0.0"
}
},
"text-hex": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"to-readable-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
@ -1867,7 +2168,13 @@
"toidentifier": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz",
"integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw=="
"integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==",
"dev": true
},
"triple-beam": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.3.0.tgz",
"integrity": "sha512-XrHUvV5HpdLmIj4uVMxHggLbFSZYIn7HEWsqePZcI50pco+MPqJ50wMGY794X7AOOhxOBAjbkqfAbEe/QMp2Lw=="
},
"type-detect": {
"version": "4.0.8",
@ -1879,6 +2186,7 @@
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
"integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==",
"dev": true,
"requires": {
"media-typer": "0.3.0",
"mime-types": "~2.1.24"
@ -1889,6 +2197,18 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"ua-parser-js": {
"version": "0.7.21",
"resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.21.tgz",
"integrity": "sha512-+O8/qh/Qj8CgC6eYBVBykMrNtp5Gebn4dlGD/kKXVkJNDwyrAwSIqwz8CDf+tsAIWVycKcku6gIXJ0qwx/ZXaQ==",
"dev": true
},
"ultron": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
"integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==",
"dev": true
},
"underscore": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
@ -1905,7 +2225,8 @@
"unpipe": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=",
"dev": true
},
"url-parse-lax": {
"version": "3.0.0",
@ -1932,12 +2253,14 @@
"utils-merge": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
"integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
"integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=",
"dev": true
},
"vary": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
"integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
"integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=",
"dev": true
},
"which": {
"version": "1.3.1",
@ -1963,6 +2286,61 @@
"string-width": "^1.0.2 || 2"
}
},
"winston": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/winston/-/winston-3.2.1.tgz",
"integrity": "sha512-zU6vgnS9dAWCEKg/QYigd6cgMVVNwyTzKs81XZtTFuRwJOcDdBg7AU0mXVyNbs7O5RH2zdv+BdNZUlx7mXPuOw==",
"requires": {
"async": "^2.6.1",
"diagnostics": "^1.1.1",
"is-stream": "^1.1.0",
"logform": "^2.1.1",
"one-time": "0.0.4",
"readable-stream": "^3.1.1",
"stack-trace": "0.0.x",
"triple-beam": "^1.3.0",
"winston-transport": "^4.3.0"
}
},
"winston-transport": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.3.0.tgz",
"integrity": "sha512-B2wPuwUi3vhzn/51Uukcao4dIduEiPOcOt9HJ3QeaXgkJ5Z7UwpBzxS4ZGNHtrxrUvTwemsQiSys0ihOf8Mp1A==",
"requires": {
"readable-stream": "^2.3.6",
"triple-beam": "^1.2.0"
},
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
"inherits": "~2.0.3",
"isarray": "~1.0.0",
"process-nextick-args": "~2.0.0",
"safe-buffer": "~5.1.1",
"string_decoder": "~1.1.1",
"util-deprecate": "~1.0.1"
}
},
"string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
}
}
}
},
"wordwrap": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
"dev": true
},
"wrap-ansi": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",

View File

@ -5,7 +5,7 @@
"homepage": "https://scrapeulous.com/",
"main": "index.js",
"scripts": {
"test": "mocha test/static_tests/"
"test": "mocha test test/modules"
},
"keywords": [
"scraping",
@ -23,17 +23,23 @@
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"debug": "^4.1.1",
"express": "^4.17.1",
"got": "^9.6.0",
"lodash": "^4.17.14",
"puppeteer": "^2.0.0",
"puppeteer-cluster": "^0.18.0",
"puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.378"
"user-agents": "^1.0.378",
"winston": "^3.2.1"
},
"devDependencies": {
"bluebird": "^3.7.2",
"chai": "^4.2.0",
"chai-string": "^1.5.0",
"mocha": "^6.1.4"
"express": "^4.17.1",
"http-mitm-proxy": "^0.8.2",
"key-cert": "^1.0.1",
"mocha": "^6.1.4",
"ua-parser-js": "^0.7.21"
}
}
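
The dependency shuffle tells the story of this commit: winston (logging) and puppeteer-cluster (now pulled from npm instead of the git submodule removed in .gitmodules above) join dependencies, while express, http-mitm-proxy, key-cert and ua-parser-js move to devDependencies, meaning they are only needed by the test suite. That combination suggests the tests drive the scraper through a local intercepting proxy (with key-cert supplying certificates) instead of live search engines. A rough sketch of such a fixture; the handler body is an assumption, not code from this commit:

const Proxy = require('http-mitm-proxy');

const proxy = Proxy();
proxy.onError((ctx, err) => console.error('proxy error:', err));
proxy.onRequest((ctx, callback) => {
    // answer every request with a canned SERP so test assertions are deterministic
    ctx.proxyToClientResponse.end('<html><!-- canned SERP --></html>');
    // not calling callback() keeps the request from reaching the real engine
});
proxy.listen({ port: 3128 });

// the scraper is then pointed at it, for example:
// let browser_config = { proxies: ['http://localhost:3128'], use_proxies_only: true };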

View File

@ -1,66 +0,0 @@
/**
Test server with:
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
-d '{
"browser_config": {
"random_user_agent": true
},
"scrape_config": {
"search_engine": "google",
"keywords": ["test"],
"num_pages": 1
}
}'
*/
const se_scraper = require('../index.js');
'use strict';
const express = require('express');
// Constants
const PORT = process.env.PORT || 3000;
const HOST = process.env.HOST || '0.0.0.0';
// App
const app = express();
app.use(express.json());
let browser_config = {
random_user_agent: true,
headless : true,
debug_level: 1,
sleep_range: '',
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: 1, // one scraper per tab
maxConcurrency: 1, // scrape with 5 tabs
}
};
app.post('/', async (req, res) => {
if (!req.body.browser_config || !req.body.scrape_config) {
res.json({
'status': 400,
'msg': 'please specify browser_config and scrape_config'
});
} else {
// overwrite standard browser config
Object.assign(browser_config, req.body.browser_config);
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(req.body.scrape_config);
// console.dir(results, {depth: null, colors: true});
await scraper.quit();
res.send(results);
}
});
app.listen(PORT, HOST);
console.log(`Running on http://${HOST}:${PORT}`);

View File

@ -0,0 +1,55 @@
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
const debug = require('debug')('se-scraper:CustomConcurrency');
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
const BROWSER_TIMEOUT = 5000;
class CustomConcurrency extends Browser {
async init() {}
async close() {}
async workerInstance() {
const options = this.options.perBrowserOptions.shift();
debug('Launch puppeteer instance with options=%o', options);
let chrome = await this.puppeteer.launch(options);
let page;
let context;
return {
jobInstance: async () => {
await timeoutExecute(BROWSER_TIMEOUT, (async () => {
context = await chrome.createIncognitoBrowserContext();
page = await context.newPage();
})());
return {
resources: {
page,
},
close: async () => {
await timeoutExecute(BROWSER_TIMEOUT, context.close());
},
};
},
close: async () => {
await chrome.close();
},
repair: async () => {
debug('Starting repair');
try {
// will probably fail, but just in case the repair was not necessary
await chrome.close();
} catch (e) {}
// just relaunch as there is only one page per browser
chrome = await this.puppeteer.launch(options);
},
};
}
};
module.exports = CustomConcurrency;
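
The trick in this new file is that workerInstance() shifts one entry off options.perBrowserOptions, so every browser in the cluster launches with its own flags: its own user agent and, when proxies are configured, its own --proxy-server. Condensed from the node_scraper.js diff further below, wiring it up looks roughly like this:

const { Cluster } = require('puppeteer-cluster');
const CustomConcurrencyImpl = require('./concurrency-implementation');

(async () => {
    // one launch-options object per browser in the cluster
    const perBrowserOptions = [
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=agent-one'] },
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=agent-two', '--proxy-server=socks5://78.94.172.42:1080'] },
    ];
    const cluster = await Cluster.launch({
        concurrency: CustomConcurrencyImpl,
        maxConcurrency: perBrowserOptions.length,
        puppeteerOptions: { perBrowserOptions },
    });
})();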

View File

@ -123,12 +123,9 @@ class BingScraper extends Scraper {
}
}
try {
await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
return true;
}
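
With the try/catch gone, a failed goto or a missing search box rejects instead of silently returning false. Under puppeteer-cluster the rejection surfaces as a task error, so callers can still observe it, for example with the handler this commit removes from node_scraper.js:

cluster.on('taskerror', (err, data) => {
    console.log(`Error while scraping ${data}: ${err.message}`);
});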

View File

@ -1,21 +0,0 @@
function log(config, loglevel, msg = null, cb = null) {
if (typeof loglevel != "number") {
throw Error('loglevel must be numeric.');
}
if (loglevel <= config.debug_level) {
if (msg) {
if (typeof msg == 'object') {
console.dir(msg, {depth: null, colors: false});
} else {
console.log('[i] ' + msg);
}
} else if (cb) {
cb();
}
}
}
module.exports = {
log: log,
};
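
This deleted helper is the counterpart of all the debug_level removals above. Its level-gated calls are replaced throughout the modules by a winston logger for user-facing messages and the debug package for developer tracing; the mapping, taken from the module diffs below:

// before (common.js):
log(this.config, 1, 'Using startUrl: ' + startUrl);
// after, for user-facing info (winston, default level 'info'):
this.logger.info('Using startUrl: ' + startUrl);
// noisy level 2+ output becomes debug-package tracing:
debug('this.metadata.http_headers=%O', this.metadata.http_headers);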

View File

@ -1,15 +1,18 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const debug = require('debug')('se-scraper:DuckduckgoScraper');
class DuckduckgoScraper extends Scraper {
parse(html) {
debug('parse');
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#links .result__body').each((i, link) => {
const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
$(organicSelector).each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper {
}
async load_start_page() {
debug('load_start_page');
let startUrl = 'https://duckduckgo.com/';
let startUrl = 'https://duckduckgo.com/?q=test';
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
try {
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
debug('search_keyword');
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
@ -63,21 +64,19 @@ class DuckduckgoScraper extends Scraper {
}
async next_page() {
let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT});
debug('next_page');
let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
if (!next_page_link) {
return false;
}
await next_page_link.click();
try {
await this.page.waitForNavigation({timeout: this.STANDARD_TIMEOUT});
} catch(e) {
return false;
}
await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
return true;
}
async wait_for_results() {
debug('wait_for_results');
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
}

View File

@ -2,8 +2,6 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;
class GoogleScraper extends Scraper {
@ -243,7 +241,7 @@ class GoogleScraper extends Scraper {
}
}
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
@ -642,7 +640,7 @@ class GoogleMapsScraper extends Scraper {
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
}
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
@ -681,7 +679,7 @@ class GoogleMapsScraper extends Scraper {
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`);
await this.page.waitForFunction((last_title) => {
const res = document.querySelectorAll('.section-result .section-result-title span');
@ -775,7 +773,7 @@ class GoogleShoppingScraper extends Scraper {
}
}
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
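
The GoogleMapsScraper hunk also shows how next-page rendering is detected: waitForFunction polls until the last visible result title differs from the previous page's last title. The diff is cut off mid-callback, so the comparison below is a hedged reconstruction, not the commit's exact code:

await this.page.waitForFunction((last_title) => {
    const res = document.querySelectorAll('.section-result .section-result-title span');
    // reconstructed: the page has advanced once the final visible title no longer matches
    return res.length > 0 && res[res.length - 1].innerText !== last_title;
}, { timeout: this.STANDARD_TIMEOUT }, last_title_last_result);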

View File

@ -1,8 +1,6 @@
'use strict';
const meta = require('./metadata.js');
const common = require('./common.js');
var log = common.log;
const debug = require('debug')('se-scraper:Scraper');
/*
Get useful JS knowledge and get awesome...
@ -12,6 +10,7 @@ var log = common.log;
module.exports = class Scraper {
constructor(options = {}) {
debug('constructor');
const {
config = {},
context = {},
@ -26,6 +25,7 @@ module.exports = class Scraper {
};
this.pluggable = pluggable;
this.config = config;
this.logger = this.config.logger;
this.context = context;
this.proxy = config.proxy;
@ -50,7 +50,9 @@ module.exports = class Scraper {
}
}
async run({page, data}) {
async run({page, data, worker}) {
debug('worker=%o', worker, this.config.keywords);
if (page) {
this.page = page;
@ -113,25 +115,25 @@ module.exports = class Scraper {
if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page);
log(this.config, 2, this.metadata.http_headers);
debug('this.metadata.http_headers=%O', this.metadata.http_headers);
}
if (this.config.log_ip_address === true) {
let ipinfo = await meta.get_ip_data(this.page);
this.metadata.ipinfo = ipinfo;
log(this.config, 2, this.metadata.ipinfo);
debug('this.metadata.ipinfo', this.metadata.ipinfo);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (this.proxy && this.config.log_ip_address === true) {
log(this.config, 3, `${this.metadata.ipinfo.ip} vs ${this.proxy}`);
debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`);
} else {
log(this.config, 1, `Using valid Proxy: ${this.proxy}`);
this.logger.info(`Using valid Proxy: ${this.proxy}`);
}
}
@ -179,7 +181,7 @@ module.exports = class Scraper {
do {
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
await this.wait_for_results();
@ -191,6 +193,13 @@ module.exports = class Scraper {
let parsed = this.parse(html);
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
if (this.config.screen_output) {
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
encoding: 'base64',
fullPage: false,
});
}
if (this.config.html_output) {
if (this.config.clean_html_output) {
@ -237,13 +246,6 @@ module.exports = class Scraper {
this.results[keyword][this.page_num].html = html_contents;
}
if (this.config.screen_output) {
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
encoding: 'base64',
fullPage: false,
});
}
this.page_num += 1;
// only load the next page when we will pass the next iteration
@ -263,28 +265,21 @@ module.exports = class Scraper {
} catch (e) {
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);
this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
debug('this.last_response=%O', this.last_response);
if (this.last_response) {
log(this.config, 2, this.last_response);
}
if (this.config.debug_level > 2) {
try {
// Try to save a screenshot of the error
await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`});
} catch (e) {
}
if (this.config.take_screenshot_on_error) {
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}
this.metadata.scraping_detected = await this.detected();
if (this.metadata.scraping_detected === true) {
console.error(`${this.config.search_engine_name} detected the scraping!`);
this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);
if (this.config.is_local === true) {
await this.sleep(this.SOLVE_CAPTCHA_TIME);
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
// expect that user filled out necessary captcha
} else {
if (this.config.throw_on_detection === true) {
@ -318,7 +313,7 @@ module.exports = class Scraper {
baseUrl += `${key}=${settings[key]}&`
}
log(this.config, 1, 'Using startUrl: ' + baseUrl);
this.logger.info('Using startUrl: ' + baseUrl);
return baseUrl;
}
@ -335,7 +330,7 @@ module.exports = class Scraper {
async random_sleep() {
const [min, max] = this.config.sleep_range;
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
log(this.config, 1, `Sleeping for ${rand}s`);
this.logger.info(`Sleeping for ${rand}s`);
await this.sleep(rand * 1000);
}
@ -349,7 +344,7 @@ module.exports = class Scraper {
no_results(needles, html) {
for (let needle of needles) {
if (html.includes(needle)) {
console.log(this.config, 2, `HTML contains needle ${needle}. no_results=true`);
this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
return true;
}
}
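
Two behavioural notes fall out of this hunk: the screen_output screenshot block now runs before the html_output handling, and the old "save a debug screenshot when debug_level > 2" fallback becomes an explicit take_screenshot_on_error flag. Enabling both from a scrape config might look like this (a sketch; the result access path is an assumption):

let scrape_job = {
    search_engine: 'google',
    keywords: ['winston'],
    num_pages: 1,
    screen_output: true,             // base64 screenshot stored per keyword and page
    take_screenshot_on_error: true,  // replaces the debug_level > 2 heuristic
};

let results = await scraper.scrape(scrape_job);
// assumed shape, mirroring this.results[keyword][page_num].screenshot above:
// results['winston']['1'].screenshot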

View File

@ -1,8 +1,6 @@
'use strict';
const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;
class YandexScraper extends Scraper {
@ -75,7 +73,7 @@ class YandexScraper extends Scraper {
async load_start_page() {
let startUrl = 'https://yandex.com';
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);

View File

@ -1,7 +1,12 @@
'use strict';
var fs = require('fs');
var os = require("os");
const fs = require('fs');
const os = require('os');
const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const { Cluster } = require('puppeteer-cluster');
const UserAgent = require('user-agents');
const google = require('./modules/google.js');
@ -9,9 +14,7 @@ const bing = require('./modules/bing.js');
const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js');
const duckduckgo = require('./modules/duckduckgo.js');
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const common = require('./modules/common.js');
var log = common.log;
const CustomConcurrencyImpl = require('./concurrency-implementation');
const MAX_ALLOWED_BROWSERS = 6;
@ -63,7 +66,7 @@ class ScrapeManager {
this.scraper = null;
this.context = context;
this.config = {
this.config = _.defaults(config, {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
@ -80,17 +83,38 @@ class ScrapeManager {
// which search engine to scrape
search_engine: 'google',
search_engine_name: 'google',
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
logger: createLogger({
level: 'info',
format: combine(
timestamp(),
printf(({ level, message, timestamp }) => {
return `${timestamp} [${level}] ${message}`;
})
),
transports: [
new transports.Console()
]
}),
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,
// specify flags passed to chrome here
chrome_flags: [],
// About our defaults values https://peter.sh/experiments/chromium-command-line-switches/
chrome_flags: [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1040',
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
],
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
@ -115,10 +139,8 @@ class ScrapeManager {
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
@ -138,14 +160,9 @@ class ScrapeManager {
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 1,
}
};
});
this.config.proxies = [];
// overwrite default config
for (var key in config) {
this.config[key] = config[key];
}
this.logger = this.config.logger;
if (config.sleep_range) {
// parse an array
@ -160,12 +177,20 @@ class ScrapeManager {
this.config.keywords = read_keywords_from_file(this.config.keyword_file);
}
if (fs.existsSync(this.config.proxy_file)) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
log(this.config, 1, `${this.config.proxies.length} proxies read from file.`);
if (this.config.proxies && this.config.proxy_file) {
throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
}
log(this.config, 2, this.config);
if (this.config.proxy_file) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
this.logger.info(`${this.config.proxies.length} proxies read from file.`);
}
if (!this.config.proxies && this.config.use_proxies_only) {
throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
}
debug('this.config=%O', this.config);
}
/*
@ -193,135 +218,72 @@ class ScrapeManager {
}
}
// See here: https://peter.sh/experiments/chromium-command-line-switches/
var default_chrome_flags = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1040',
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
];
var chrome_flags = default_chrome_flags.slice(); // copy that
if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
chrome_flags = this.config.chrome_flags;
}
var user_agent = null;
if (this.config.user_agent) {
user_agent = this.config.user_agent;
}
if (this.config.random_user_agent) {
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
user_agent = userAgent.toString();
}
if (user_agent) {
chrome_flags.push(
`--user-agent=${user_agent}`
)
}
if (this.config.proxy) {
if (this.config.proxies && this.config.proxies.length > 0) {
console.error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
return false;
}
chrome_flags.push(
'--proxy-server=' + this.config.proxy,
)
}
var launch_args = {
args: chrome_flags,
headless: this.config.headless,
ignoreHTTPSErrors: true,
};
log(this.config, 2, `Using the following puppeteer configuration: ${launch_args}`);
const chrome_flags = _.clone(this.config.chrome_flags);
if (this.pluggable && this.pluggable.start_browser) {
launch_args.config = this.config;
this.browser = await this.pluggable.start_browser(launch_args);
this.browser = await this.pluggable.start_browser({
config: this.config,
});
this.page = await this.browser.newPage();
} else {
// if no custom start_browser functionality was given
// use puppeteer-cluster for scraping
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
var perBrowserOptions = [];
// the first browser this.config with home IP
if (!this.config.use_proxies_only) {
perBrowserOptions.push(launch_args);
}
let proxies;
// if we have at least one proxy, always use CONCURRENCY_BROWSER
// and set maxConcurrency to this.config.proxies.length + 1
// else use whatever this.configuration was passed
if (this.config.proxies.length > 0) {
this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER;
if (this.config.proxies && this.config.proxies.length > 0) {
// because we use real browsers, we ran out of memory on normal laptops
// when using more than maybe 5 or 6 browsers.
// therefore hardcode a limit here
// TODO not sure this what we want
this.numClusters = Math.min(
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
MAX_ALLOWED_BROWSERS
);
proxies = _.clone(this.config.proxies);
log(this.config, 1, `Using ${this.numClusters} clusters.`);
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
for (var proxy of this.config.proxies) {
perBrowserOptions.push({
headless: this.config.headless,
ignoreHTTPSErrors: true,
args: chrome_flags.concat(`--proxy-server=${proxy}`)
})
// Insert a first config without proxy if use_proxies_only is false
if (this.config.use_proxies_only === false) {
proxies.unshift(null);
}
} else {
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
proxies = _.times(this.numClusters, () => null); // one null per cluster; _.times(n, null) would yield indices, not nulls
}
// Give the per browser options each a random user agent when random user agent is set
while (perBrowserOptions.length < this.numClusters) {
const userAgent = new UserAgent();
perBrowserOptions.push({
this.logger.info(`Using ${this.numClusters} clusters.`);
// Build the per-browser launch options
const perBrowserOptions = _.map(proxies, (proxy) => {
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
if (proxy) {
args = args.concat([`--proxy-server=${proxy}`]);
}
return {
headless: this.config.headless,
ignoreHTTPSErrors: true,
args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`)
})
}
args
};
});
if (this.config.debug_level >= 2) {
console.dir(perBrowserOptions)
}
debug('perBrowserOptions=%O', perBrowserOptions)
this.cluster = await Cluster.launch({
monitor: this.config.puppeteer_cluster_config.monitor,
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
concurrency: this.config.puppeteer_cluster_config.concurrency,
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
puppeteerOptions: launch_args,
perBrowserOptions: perBrowserOptions,
});
this.cluster.on('taskerror', (err, data) => {
console.log(`Error while scraping ${data}: ${err.message}`);
console.log(err);
concurrency: CustomConcurrencyImpl,
maxConcurrency: this.numClusters,
puppeteerOptions: {
perBrowserOptions: perBrowserOptions
}
});
}
}
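To make the per-browser proxy mapping concrete: with two proxies and use_proxies_only left at false, the map runs over [null, proxy1, proxy2] and yields three launch configurations, the first of which uses the machine's own IP. A minimal standalone sketch (hypothetical flags and proxy addresses):

const _ = require('lodash');

const chrome_flags = ['--no-sandbox']; // hypothetical flag list
const proxies = [null, 'http://1.2.3.4:3128', 'http://5.6.7.8:3128'];

const perBrowserOptions = _.map(proxies, (proxy) => {
    let args = chrome_flags.slice();
    if (proxy) {
        args = args.concat([`--proxy-server=${proxy}`]);
    }
    return { headless: true, ignoreHTTPSErrors: true, args };
});
// perBrowserOptions[0] carries no --proxy-server flag (own IP);
// each remaining entry pins one browser to one proxy.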
@ -332,8 +294,7 @@ class ScrapeManager {
async scrape(scrape_config = {}) {
if (!scrape_config.keywords && !scrape_config.keyword_file) {
console.error('Either keywords or keyword_file must be supplied to scrape()');
return false;
throw new Error('Either keywords or keyword_file must be supplied to scrape()');
}
Object.assign(this.config, scrape_config);
@ -345,10 +306,7 @@ class ScrapeManager {
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
if (this.config.keywords && this.config.search_engine) {
log(this.config, 1,
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
}
this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
if (this.pluggable && this.pluggable.start_browser) {
@ -377,26 +335,21 @@ class ScrapeManager {
chunks[k % this.numClusters].push(this.config.keywords[k]);
}
let execPromises = [];
let scraperInstances = [];
for (var c = 0; c < chunks.length; c++) {
this.config.keywords = chunks[c];
debug('chunks=%o', chunks);
if (this.config.use_proxies_only) {
this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy
} else if(c > 0) {
this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address
}
let execPromises = [];
for (var c = 0; c < chunks.length; c++) {
const config = _.clone(this.config);
config.keywords = chunks[c];
var obj = getScraper(this.config.search_engine, {
config: this.config,
config: config,
context: {},
pluggable: this.pluggable,
});
var boundMethod = obj.run.bind(obj);
execPromises.push(this.cluster.execute({}, boundMethod));
scraperInstances.push(obj);
}
let promiseReturns = await Promise.all(execPromises);
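The chunking above deals keywords out round-robin, so every cluster receives an (almost) equal share. A small sketch of the same k % numClusters scheme:

const keywords = ['a', 'b', 'c', 'd', 'e'];
const numClusters = 2;
const chunks = [[], []];
for (let k = 0; k < keywords.length; k++) {
    chunks[k % numClusters].push(keywords[k]);
}
// chunks => [ ['a', 'c', 'e'], ['b', 'd'] ]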
@ -412,8 +365,8 @@ class ScrapeManager {
let timeDelta = Date.now() - startTime;
let ms_per_request = timeDelta/num_requests;
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
if (this.pluggable && this.pluggable.handle_results) {
await this.pluggable.handle_results(results);
@ -423,14 +376,14 @@ class ScrapeManager {
metadata.ms_per_keyword = ms_per_request.toString();
metadata.num_requests = num_requests;
log(this.config, 2, metadata);
debug('metadata=%O', metadata);
if (this.pluggable && this.pluggable.handle_metadata) {
await this.pluggable.handle_metadata(metadata);
}
if (this.config.output_file) {
log(this.config, 1, `Writing results to ${this.config.output_file}`);
this.logger.info(`Writing results to ${this.config.output_file}`);
write_results(this.config.output_file, JSON.stringify(results, null, 4));
}

@ -1 +0,0 @@
Subproject commit 221e6821d1d5d8c57bdf7b2cfef71d64dbf006a2

test/html_output.js Normal file

@ -0,0 +1,101 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both http and https listening servers
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('html_output', function(){
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Test html_output option
*/
it('html_output single page single keyword', async function () {
const scrape_job = {
search_engine: 'google',
/* TODO refactor start_url
google_settings: {
start_url: 'http://localhost:' + httpPort
},
*/
keywords: ['test keyword'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
html_output: true,
//clean_html_output: false,
//clean_data_images: false,
// TODO refactor start_url so we can use it instead of depending on the proxy for this test
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();
assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');
});
});
});
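Note the pattern, reused by the module tests below: the scraper never talks to a real search engine. Every request is forced through http-mitm-proxy, which rewrites the target host/port to a local express app serving saved SERP HTML. Assuming a mocha test runner (implied by the describe/it blocks), a single file can be run with the test's debug logging enabled via:

    DEBUG=se-scraper:test npx mocha test/html_output.js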


@ -0,0 +1,148 @@
<!DOCTYPE html>
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
<meta name="HandheldFriendly" content="true"/>
<link rel="canonical" href="https://duckduckgo.com/">
<link rel="stylesheet" href="/s1847.css" type="text/css">
<link rel="stylesheet" href="/o1847.css" type="text/css">
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
<link rel="manifest" href="/manifest.json"/>
<meta name="twitter:card" content="summary">
<meta name="twitter:site" value="@duckduckgo">
<meta property="og:url" content="https://duckduckgo.com/" />
<meta property="og:site_name" content="DuckDuckGo" />
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">
<title>DuckDuckGo — Privacy, simplified.</title>
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />
<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
</head>
<body id="pg-index" class="page-index body--home">
<script type="text/javascript">
var settings_js_version = "/s2475.js",
locale = "en_US";
</script>
<script type="text/javascript" src="/lib/l113.js"></script>
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
<script type="text/javascript" src="/util/u418.js"></script>
<script type="text/javascript" src="/d2727.js"></script>
<script type="text/javascript">
DDG.page = new DDG.Pages.Home();
</script>
<div class="site-wrapper site-wrapper--home js-site-wrapper">
<div class="header-wrap--home js-header-wrap">
<div class="header--aside js-header-aside"></div>
<div class="js-header-home-search header-wrap--home__search">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>
<form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
<input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div class="search__hidden js-search-hidden"></div>
</form>
</div>
</div>
<div id="" class="content-wrap--home">
<div id="content_homepage" class="content--home">
<div class="cw--c">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>
<div class="search-wrap--home">
<form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
<input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
</form>
</div>
<!-- en_US All Settings -->
<noscript>
<div class="tag-home">
<div class="tag-home__wrapper">
<div class="tag-home__item">
The search engine that doesn't track you.
<span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
</div>
</div>
</div>
</noscript>
<div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
<div id="error_homepage"></div>
</div> <!-- cw -->
</div> <!-- content_homepage //-->
</div> <!-- content_wrapper_homepage //-->
<div id="footer_homepage" class="foot-home js-foot-home"></div>
<script type="text/javascript">
{function seterr(str) {
var error=document.getElementById('error_homepage');
error.innerHTML=str;
$(error).css('display','block');
}
var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' &nbsp;Please try again');};
if (kurl) {
document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
}
</script>
</div> <!-- site-wrapper -->
</body>
</html>

test/modules/bing.js Normal file

@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
debug('q=%s', req.query.q);
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
describe('Module Bing', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both http and https listening servers
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
});
});
it('one keyword 3 pages', function () {
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
});
});
});

test/modules/duckduckgo.js Normal file

@ -0,0 +1,140 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.use(express.urlencoded({ extended: true }))
fakeSearchEngine.get('/', (req, res, next) => {
if(!req.query.q){
return next();
}
debug('q=%s page=%d', req.query.q, req.query.page);
const pageNumber = req.query.page;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.post('/html', (req, res) => {
debug('body=%o', req.body);
const pageNumber = 1;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']}));
describe('Module DuckDuckGo', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both http and https listening servers
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('proxy askedHost=%s method=%s url=%s toPort=%s',
ctx.clientToProxyRequest.headers.host,
ctx.clientToProxyRequest.method,
ctx.clientToProxyRequest.url,
ctx.proxyToServerRequestOptions.port
);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'duckduckgo',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});
it('one keyword 3 pages', function () {
this.timeout(4000);
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
debug('results page 1 %O',results['test keyword']['1'].results);
debug('results page 2 %O', results['test keyword']['2'].results);
assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
});
});
});
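The growing per-page counts here (10, then 19, then 48) suggest the DuckDuckGo scraper parses the cumulative result list after each additional page load rather than a fixed-size page; the assertions simply pin down what the saved mock pages contain.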

test/modules/google.js Normal file

@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
describe('Module Google', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both http and https listening servers
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});
it('one keyword 3 pages', function () {
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
});
});
});

test/proxy.js Normal file

@ -0,0 +1,161 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
debug('fake-search-engine req.hostname=%s', req.hostname);
//debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
});
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both http and https listening servers
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('proxies', function(){
class MockScraperTestProxy extends Scraper {
async load_start_page(){
return true;
}
async search_keyword(){
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
}
async parse_async(){
const bodyHandle = await this.page.$('body');
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
}
}
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Jobs will be executed 2 by 2, alternating between the proxy and a direct connection
* THIS TEST NEEDS test.local mapped to 127.0.0.1 in /etc/hosts because Chrome bypasses the proxy for localhost even when one is set
*/
it('one proxy given, use_proxies_only=false', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
// default is use_proxies_only: false,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'test.local');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'test.local');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
await scraper.quit();
});
/**
* Jobs will be executed 1 by 1 through the proxy
*/
it('one proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
await scraper.quit();
});
it('zero proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
await assert.rejects(async () => {
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();
}, /Must provide at least one proxy in proxies if you enable use_proxies_only/);
});
});
});


@ -1,15 +0,0 @@
## Test with static HTML
Dynamic testing of se-scraper takes too much time.
Save some HTML and initialize se-scraper by loading the search results from disk.
### Disadvantage
Static HTML gets outdated after some time.
### Advantages
1. Lets us test corner cases that are otherwise easily missed
2. Makes testing deterministic: live search engines do not always return the same results for the same query
3. As said, much faster
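A minimal sketch of such a static test setup, mirroring the bing test below (assumes a saved SERP under ./html/):

const se_scraper = require('./../../index.js');
const path = require('path');

(async () => {
    const scraper = new se_scraper.ScrapeManager({ headless: true });
    await scraper.start();
    const response = await scraper.scrape({
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    });
    // response.results now holds the parsed SERP of the saved page
    await scraper.quit();
})();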


@ -1,222 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
async function bing_ads() {
let config = {
compress: false,
debug_level: 1,
headless: true,
};
let scrape_config = {
search_engine: 'bing',
keywords: ['kaffeemaschine kaufen'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
bing_search_with_ads( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['best cloud services'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');
bing_search_with_ads2( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['car tires cheap'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');
bing_search_with_ads3( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['service auto garage'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');
bing_search_with_ads4( await scraper.scrape(scrape_config) );
await scraper.quit();
}
// we test with a callback function to our handler
function bing_search_with_ads(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '1100000', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
assert.isAtLeast(obj.ads.length, 7, 'there are at least 7 ads');
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are at least 5 right side ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function bing_search_with_ads2(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '44300000', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
assert.isAtLeast(obj.ads.length, 7, 'there are at least 7 ads');
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are at least 5 right side ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function bing_search_with_ads3(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.isAtLeast(obj.ads.length, 3, 'there are at least 3 ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function bing_search_with_ads4(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
assert.isAtLeast(obj.ads.length, 3, 'there are at least 3 ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function confirm_results_ok(obj) {
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
}
for (let res of obj.ads) {
assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
for (let res of obj.right_side_ads) {
assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
describe('Bing', function(){
this.timeout(15000);
it('static bing searches with ads', bing_ads);
});


@ -1,173 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
const cheerio = require('cheerio');
async function test_html_output() {
let config = {
debug_level: 1,
headless: true,
html_output: true,
// whether to strip JS and CSS from the html_output
// has only an effect if `html_output` is true
clean_html_output: true,
// remove all data images from the html
clean_data_images: true,
// test compression
compress: false,
};
let scrape_config = {
search_engine: 'bing',
keywords: ['kaffeemaschine kaufen'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
var response = await scraper.scrape(scrape_config);
scrape_config.clean_html_output = false;
scrape_config.clean_data_images = false;
var response_no_cleaned = await scraper.scrape(scrape_config);
test(response, response_no_cleaned, 'bing');
scrape_config.search_engine = 'google';
scrape_config.keywords = ['rückspiegel schwarz'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
scrape_config.clean_html_output = true;
scrape_config.clean_data_images = true;
var responseGoogle = await scraper.scrape(scrape_config);
scrape_config.clean_html_output = false;
scrape_config.clean_data_images = false;
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
test(responseGoogle, response_no_cleanedGoogle, 'google');
scrape_config.keywords = ['cloud services'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
scrape_config.clean_html_output = true;
scrape_config.clean_data_images = true;
var responseGoogle = await scraper.scrape(scrape_config);
scrape_config.clean_html_output = false;
scrape_config.clean_data_images = false;
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
test(responseGoogle, response_no_cleanedGoogle, 'google');
await scraper.quit();
}
function test(response, response_no_cleaned, se='google') {
for (let query in response.results) {
for (let page_number in response.results[query]) {
let obj = response.results[query][page_number];
let obj_no_cleaned = response_no_cleaned.results[query][page_number];
console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length);
console.log('html length of cleaned SERP: ' + obj.html.length);
assert.isOk(obj.html, 'Html must be ok!');
assert.isAtLeast(obj.html.length, 100, 'html must be a string of at least 100 chars');
assert.isOk(obj_no_cleaned.html, 'Html must be ok!');
assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a string of at least 100 chars');
assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');
// test that we can parse the html of both the cleaned and no cleaned versions
// with cheerio and that serp results are roughly the same
const cleaned$ = cheerio.load(obj.html);
const no_cleaned$ = cheerio.load(obj_no_cleaned.html);
var resCleaned = parseResults(cleaned$, se);
var resNoCleaned = parseResults(no_cleaned$, se);
assert.equal(resCleaned.length, resNoCleaned.length);
assert.equal(resCleaned.length, obj.results.length);
assert.equal(resNoCleaned.length, obj.results.length);
// unset the rank (map must return the element, otherwise these arrays contain only undefined)
resCleaned = resCleaned.map((el) => { el.rank = undefined; return el; });
resNoCleaned = resNoCleaned.map((el) => { el.rank = undefined; return el; });
obj.results = obj.results.map((el) => { el.rank = undefined; return el; });
assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results');
assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results');
}
}
}
function parseResults(s$, se) {
var results = [];
if (se === 'google') {
s$('#center_col .g').each((i, link) => {
results.push({
link: s$(link).find('.r a').attr('href'),
title: s$(link).find('.r a').text(),
snippet: s$(link).find('span.st').text(),
visible_link: s$(link).find('.r cite').text(),
date: s$(link).find('span.f').text() || '',
})
});
} else if (se === 'bing') {
s$('#b_content #b_results .b_algo').each((i, link) => {
results.push({
link: s$(link).find('h2 a').attr('href'),
title: s$(link).find('h2').text(),
snippet: s$(link).find('.b_caption p').text(),
visible_link: s$(link).find('cite').text(),
})
});
} else {
throw "no such search engine";
}
results = clean_results(results, ['title', 'link', 'snippet']);
return results;
}
function clean_results(results, attributes) {
const cleaned = [];
var rank = 1;
for (var res of results) {
let goodboy = true;
for (var attr of attributes) {
if (!res[attr] || !res[attr].trim()) {
goodboy = false;
break;
}
}
if (goodboy) {
res.rank = rank++;
cleaned.push(res);
}
}
return cleaned;
}
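For instance, an entry missing a required attribute is dropped and ranks are assigned to the survivors:

// clean_results([{title: 'A', link: '/a', snippet: 's'},
//                {title: '',  link: '/b', snippet: 's'}],
//               ['title', 'link', 'snippet'])
// => [{title: 'A', link: '/a', snippet: 's', rank: 1}]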
describe('html output', function(){
this.timeout(15000);
it('static html output test', test_html_output);
});


@ -1,24 +0,0 @@
'use strict';
const zlib = require('zlib');
const fs = require('fs');
const path = require('path');
var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];
for (var file of files) {
var html = fs.readFileSync(path.resolve(__dirname, './html/' + file));
var compressed = zlib.gzipSync(html);
var deflated = zlib.deflateSync(html);
var compressed_encoded = compressed.toString('base64');
var deflated_encoded = deflated.toString('base64');
console.log(file)
console.log('Normal length: ' + html.length/1000);
console.log('GZIP Compressed length: ' + compressed.length/1000);
console.log('Deflate Compressed length: ' + deflated.length/1000);
console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length/1000);
console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length/1000);
console.log('------\n')
}
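The reverse direction, recovering the HTML from a stored base64 string, is the usual zlib round-trip (a sketch, not part of this script):

const original = zlib.gunzipSync(Buffer.from(compressed_encoded, 'base64')).toString();
// or, for the deflate variant:
const inflated = zlib.inflateSync(Buffer.from(deflated_encoded, 'base64')).toString();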


@ -1,99 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
async function duckduckgo() {
let config = {
compress: false,
debug_level: 1,
headless: true,
};
let scrape_config = {
search_engine: 'duckduckgo',
keywords: ['cloud service'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
duckduckgo_normal( await scraper.scrape(scrape_config) );
await scraper.quit();
}
function duckduckgo_normal(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function confirm_results_ok(obj) {
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
}
for (let res of obj.ads) {
assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
describe('Duckduckgo', function(){
this.timeout(10000);
it('static duckduckgo search', duckduckgo);
});


@ -1,410 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
async function normal_search_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
};
let scrape_config = {
search_engine: 'google',
keywords: ['rückspiegel schwarz'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
google_search_with_products( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
scrape_config.keywords = ['autoreifen mercedes c-klasse'];
google_search_with_products2( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
scrape_config.keywords = ['kaffeemaschine kaufen'];
google_places( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
scrape_config.keywords = ['MODEL MARKET SW18 4ES'];
right_side_info_text( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];
right_side_info_text2( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
scrape_config.keywords = ['car tires for sale'];
google_places_and_ads( await scraper.scrape(scrape_config) );
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
scrape_config.keywords = ['bmw felgen'];
google_ads2( await scraper.scrape(scrape_config) );
await scraper.quit();
}
// we test with a callback function to our handler
function google_search_with_products(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '1780000', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
assert.isAtLeast(obj.bottom_ads.length, 3, 'there are at least 3 bottom ads');
assert.isAtLeast(obj.top_products.length, 15, 'there are at least 15 top products');
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function google_search_with_products2(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '437000 Ergebnisse (0.41 Sekunden)', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
assert.isAtLeast(obj.bottom_ads.length, 1, 'there is at least 1 bottom ad');
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
assert.equal(obj.right_products.length, 4, 'there are 4 right products');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function google_places(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '6750000 Ergebnisse (0.52 Sekunden)', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
assert.equal(obj.top_products.length, 0, 'there are 0 top products');
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
assert.equal(obj.places.length, 3, 'there are 3 places');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function right_side_info_text(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text must have at least 50 chars');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
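// Same check against a second fixture with a right-side info text.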
function right_side_info_text2(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text must have at least 50 chars');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
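// Checks a fixture that combines two places entries with at least 13 top products.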
function google_places_and_ads(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
assert.isAtLeast(obj.top_products.length, 13, 'there are at least 13 top products');
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
assert.equal(obj.places.length, 2, 'there are 2 places');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
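// Checks a fixture with three top ads and nine right-side products.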
function google_ads2(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.equal(obj.top_ads.length, 3, 'there are 3 top ads');
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
assert.isAtLeast(obj.top_products.length, 0, 'top_products may be empty');
assert.equal(obj.right_products.length, 9, 'there are 9 right products');
assert.equal(obj.places.length, 0, 'there are 0 places');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
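// Shared sanity checks: every organic result, ad, product and place entry must expose well-formed fields.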
function confirm_results_ok(obj) {
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
}
for (let res of obj.top_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
for (let res of obj.bottom_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
for (let res of obj.top_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
}
for (let res of obj.right_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
}
for (let res of obj.places) {
assert.isOk(res.heading, 'heading must be ok');
assert.typeOf(res.heading, 'string', 'heading must be string');
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
assert.isOk(res.rating, 'rating must be ok');
assert.typeOf(res.rating, 'string', 'rating must be string');
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
assert.isOk(res.contact, 'contact must be ok');
assert.typeOf(res.contact, 'string', 'contact must be string');
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
assert.typeOf(res.hours, 'string', 'hours must be string');
if (res.hours) {
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
}
}
}
describe('Google', function() {
this.timeout(25000);
it('static google searches with products,ads and places', normal_search_test);
});
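// These suites run against saved HTML fixtures (scrape_from_file), so they need no
// network access. A typical invocation, assuming a standard mocha setup (the test
// path below is an assumption, not taken from this repository):
//   npx mocha --timeout 25000 test/static_tests/google.js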


@ -1,213 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
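// Checks title, snippet and date extraction against a saved Google fixture of LinkedIn results.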
async function normal_search_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
};
let scrape_config = {
search_engine: 'google',
keywords: ['in.linkedin.com/in/altanai'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
google_test_title( await scraper.scrape(scrape_config) );
await scraper.quit();
}
// we test with a callback function to our handler
function google_test_title(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '7.600', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects (result index 9 is checked below)');
assert.isAtLeast(obj.top_ads.length, 0, 'top_ads may be empty');
assert.isAtLeast(obj.bottom_ads.length, 0, 'bottom_ads may be empty');
assert.isAtLeast(obj.top_products.length, 0, 'top_products may be empty');
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
assert.equal( obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn' );
assert.equal( obj.results[1].title, 'ALTANAI BISHT | LinkedIn' );
assert.equal( obj.results[2].title, 'ALTANAI BISHT SD2 at Voice Engineering Plivo | LinkedIn' );
assert.equal( obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn' );
assert.equal( obj.results[4].title, 'ALTANAI BISHT | LinkedIn' );
assert.equal( obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');
assert.equal( obj.results[0].date, '27.07.2016');
assert.equal( obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');
assert.equal( obj.results[2].date, '27.07.2016');
}
}
}
function confirm_results_ok(obj) {
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
}
for (let res of obj.top_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
for (let res of obj.bottom_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
for (let res of obj.top_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
}
for (let res of obj.right_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
}
for (let res of obj.places) {
assert.isOk(res.heading, 'heading must be ok');
assert.typeOf(res.heading, 'string', 'heading must be string');
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
assert.isOk(res.rating, 'rating must be ok');
assert.typeOf(res.rating, 'string', 'rating must be string');
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
assert.isOk(res.contact, 'contact must be ok');
assert.typeOf(res.contact, 'string', 'contact must be string');
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
assert.typeOf(res.hours, 'string', 'hours must be string');
if (res.hours) {
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
}
}
}
describe('Google2', function(){
this.timeout(10000);
it('static google searches testing various details', normal_search_test);
});


@ -1,152 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
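// Runs three saved Yandex fixtures through a single ScrapeManager instance.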
async function yandex_ads() {
let config = {
compress: false,
debug_level: 1,
headless: true,
};
let scrape_config = {
search_engine: 'yandex',
keywords: ['cloud service'],
num_pages: 1,
scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
};
var scraper = new se_scraper.ScrapeManager(config);
await scraper.start();
yandex_search_with_ads( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['car tires cheap'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');
yandex_search_with_ads2( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['купить деревянные окна'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');
yandex_search_with_ads3( await scraper.scrape(scrape_config) );
await scraper.quit();
}
// we test with a callback function to our handler
function yandex_search_with_ads(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '2 million results', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function yandex_search_with_ads2(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.include(obj.num_results, '5 million results', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
confirm_results_ok(obj);
}
}
}
function yandex_search_with_ads3(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
// console.dir(obj.results, {depth: null, colors: true});
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
// at least 4 ads
let cnt = 0;
obj.results.forEach((res) => {
if (res.is_ad) {
cnt++;
}
});
assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');
confirm_results_ok(obj);
}
}
}
function confirm_results_ok(obj) {
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
}
}
describe('Yandex', function(){
this.timeout(10000);
it('static yandex searches with ads', yandex_ads);
});


@ -1,141 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['iphone', 'clock'];
async function normal_search_test() {
let config = {
compress: false,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
random_user_agent: false,
};
let scrape_config = {
search_engine: 'amazon',
num_pages: 1,
keywords: normal_search_keywords,
};
console.log('normal_search_test()');
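// se_scraper.scrape() is the one-shot convenience API: it effectively creates a
// ScrapeManager, starts it, runs the job and quits again, in contrast to the manual
// start()/scrape()/quit() flow used in the static fixture tests above.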
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function normal_search_test_case(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.seller, 'seller must be ok');
assert.typeOf(res.seller, 'string', 'seller must be string');
assert.isAtLeast(res.seller.length, 5, 'seller must have at least 5 chars');
assert.isOk(res.stars, 'stars must be ok');
assert.typeOf(res.stars, 'string', 'stars must be string');
assert.isAtLeast(res.stars.length, 5, 'stars must have at least 5 chars');
assert.include(res.stars, ' out of ', 'stars must include " out of "');
assert.isOk(res.num_reviews, 'num_reviews must be ok');
assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 char');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isNumber(res.rank, 'rank must be a number');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];
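// Deliberately nonsensical keyword for which Amazon should return an empty result page.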
async function no_results_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
random_user_agent: false,
};
let scrape_config = {
search_engine: 'amazon',
num_pages: 1,
keywords: keywords_no_results,
};
console.log('no_results_test()');
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_no_results(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert(obj.results.length === 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'num_results should be an empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
describe('Amazon', function(){
this.timeout(30000);
it('normal search test', normal_search_test);
it('no results test', no_results_test);
});


@ -1,87 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['mouse', 'cat'];
async function normal_search_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
};
let scrape_config = {
search_engine: 'baidu',
keywords: normal_search_keywords,
num_pages: 2,
};
console.log('normal_search_test()');
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function normal_search_test_case(response) {
assert.equal(response.metadata.num_requests, 4);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.equal(obj.no_results, false, 'no results should be false');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
describe('Baidu', function(){
this.timeout(30000);
it('normal search test', normal_search_test);
});


@ -1,271 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
search_engine: 'bing',
compress: false,
debug_level: 1,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 3,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'bing',
keywords: normal_search_keywords,
num_pages: 3,
};
console.log('normal_search_test()');
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function normal_search_test_case(response) {
assert.equal(response.metadata.num_requests, 6);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
if (res.snippet) {
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
assert.isNumber(res.rank, 'rank must be a number');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];
async function no_results_test() {
let config = {
search_engine: 'bing',
compress: false,
debug_level: 1,
keywords: keywords_no_results,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'bing',
keywords: keywords_no_results,
num_pages: 1,
};
console.log('no_results_test()');
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_no_results(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert(obj.results.length === 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'num_results should be an empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
const effective_query_keywords = ['mount everrest'];
async function effective_query_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'bing',
keywords: effective_query_keywords,
num_pages: 1,
};
console.log('effective_query_test()');
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_effective_query(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
const ads_keywords = ['cloud services', 'buy shoes'];
async function ads_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: true,
};
let scrape_config = {
search_engine: 'bing',
keywords: ads_keywords,
num_pages: 1,
};
console.log('ads_test()');
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}
function test_case_ads_test(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
for (let res of obj.ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
// assert.isOk(res.link, 'link must be ok');
// assert.typeOf(res.link, 'string', 'link must be string');
// assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
}
}
describe('Bing', function(){
this.timeout(30000);
it('normal search', normal_search_test);
it('no results', no_results_test);
it('effective query', effective_query_test);
it('finds ads', ads_test);
});


@ -1,192 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: true,
};
let scrape_config = {
search_engine: 'duckduckgo',
keywords: normal_search_keywords,
num_pages: 2,
};
console.log('normal_search_test()');
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function normal_search_test_case(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
const effective_query_keywords = ['mount everrest'];
async function effective_query_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: true,
random_user_agent: true,
};
let scrape_config = {
search_engine: 'duckduckgo',
keywords: effective_query_keywords,
num_pages: 1,
};
console.log('test_case_effective_query()');
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_effective_query(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
const ads_keywords = ['cloud services', 'buy shoes'];
async function ads_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: false,
};
let scrape_config = {
search_engine: 'duckduckgo',
keywords: ads_keywords,
num_pages: 1,
};
console.log('ads_test()');
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}
function test_case_ads_test(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
for (let res of obj.ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
}
}
describe('Duckduckgo', function(){
this.timeout(30000);
it('normal search', normal_search_test);
it('effective query', effective_query_test);
it('finds ads', ads_test);
});


@ -1,424 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'google',
keywords: normal_search_keywords,
num_pages: 3,
};
console.log('normal_search_test()');
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function normal_search_test_case(response) {
assert.equal(response.metadata.num_requests, 6);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be a number');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
async function no_results_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'google',
keywords: keywords_no_results,
num_pages: 1,
};
console.log('no_results_test()');
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_no_results(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'num_results should be an empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
const effective_query_keywords = ['mount evverrest'];
async function effective_query_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'google',
keywords: effective_query_keywords,
num_pages: 1,
};
console.log('effective_query_test()');
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}
// we test with a callback function to our handler
function test_case_effective_query(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
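// With html_output enabled, the response additionally carries the raw HTML of every scraped page.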
async function html_output_query_test() {
let config = {
compress: false,
debug_level: 1,
keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'google',
keywords: normal_search_keywords,
num_pages: 3,
html_output: true,
};
let output = await se_scraper.scrape(config, scrape_config);
normal_search_test_case( output );
check_html_output_test_case( output );
}
function check_html_output_test_case( response ) {
for (let query in response.html_output) {
assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.html_output[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
}
}
}
const ads_keywords = ['cloud services', 'auto kaufen'];
async function ads_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: false, // don't try to trick google with ads
};
let scrape_config = {
search_engine: 'google',
keywords: ads_keywords,
num_pages: 1,
};
console.log('ads_test()');
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}
function test_case_ads_test(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');
for (let res of obj.top_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
for (let res of obj.bottom_ads) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'links must be array');
}
}
}
}
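// Products test: scrape a shopping keyword and verify that Google's top and right-hand product blocks are parsed.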
const product_keywords = ['autoreifen bmw'];
async function products_test() {
let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: false, // don't try to trick google when scraping shopping results
};
let scrape_config = {
search_engine: 'google',
keywords: product_keywords,
num_pages: 1,
};
console.log('products_test()');
test_case_products_test( await se_scraper.scrape(config, scrape_config) );
}
function test_case_products_test(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
assert.containsAllKeys(response.results, product_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_products', 'right_products', 'places'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');
for (let res of obj.top_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
}
for (let res of obj.right_products) {
assert.isOk(res.tracking_link, 'tracking_link must be ok');
assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.price, 'price must be ok');
assert.typeOf(res.price, 'string', 'price must be string');
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.vendor_link, 'vendor_link must be ok');
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
}
}
}
}
describe('Google', function(){
this.timeout(30000);
it('normal search', normal_search_test);
it('no results', no_results_test);
it('effective query', effective_query_test);
it('html output query', html_output_query_test);
it('ads', ads_test);
it('products test', products_test);
});
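// Run with mocha (see https://mochajs.org); the 30 second timeout above leaves room for live scrapes against Google.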


@ -1,80 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple', 'rain'];
async function normal_image_search_test() {
let config = {
compress: false,
debug_level: 0,
headless: true,
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
let scrape_config = {
search_engine: 'google_image',
keywords: normal_search_keywords,
num_pages: 2,
};
console.log('normal_image_search_test()');
normal_image_search_test_case( await se_scraper.scrape(config, scrape_config) );
}
// verify the response object returned by the scraper
function normal_image_search_test_case(response) {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.clean_link, 'clean_link must be ok');
assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
describe('Google Image', function(){
this.timeout(30000);
it('normal image search test', normal_image_search_test);
});


@ -1,91 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;
const normal_search_keywords = ['apple juice'];
async function queryargs_search_test() {
let config = {
search_engine: 'google',
compress: false,
debug: true,
verbose: true,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 2,
headless: true,
output_file: '',
block_assets: true,
// use specific search engine parameters for various search engines
google_settings: {
google_domain: 'google.com',
gl: 'fr', // The gl parameter determines the Google country to use for the query.
hl: 'fr', // The hl parameter determines the Google UI language to return results.
start: 30, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
};
console.log('queryargs_search_test()');
await se_scraper.scrape(config, queryargs_search_test_case);
}
// we test with a callback function to our handler
function queryargs_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
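// google_settings requested num=100 results per page, so a single page should be close to full.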
assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank is wrong');
}
}
}
}
}
describe('Google with query arguments', function(){
this.timeout(30000);
it('query args search test', queryargs_search_test);
});


@ -1,217 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const quote_search_keywords = ['MSFT', 'AAPL'];
async function reuters_search_test() {
let config = {
search_engine: 'reuters',
compress: false,
debug: false,
verbose: false,
keywords: quote_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('reuters_search_test()');
await se_scraper.scrape(config, reuters_search_test_case);
}
// we test with a callback function to our handler
function reuters_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
async function cnbc_search_test() {
let config = {
search_engine: 'cnbc',
compress: false,
debug: false,
verbose: false,
keywords: quote_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('cnbc_search_test()');
await se_scraper.scrape(config, cnbc_search_test_case);
}
// we test with a callback function to our handler
function cnbc_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
const marketwatch_search_keywords = ['MSFT'];
async function marketwatch_search_test() {
let config = {
search_engine: 'marketwatch',
compress: false,
debug: false,
verbose: false,
keywords: marketwatch_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('marketwatch_search_test()');
await se_scraper.scrape(config, marketwatch_search_test_case);
}
// we test with a callback function to our handler
function marketwatch_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.author, 'author must be ok');
assert.typeOf(res.author, 'string', 'author must be string');
assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
describe('Ticker', function(){
this.timeout(30000);
it('Reuters search test', reuters_search_test);
it('CNBC search test', cnbc_search_test);
it('Marketwatch search test', marketwatch_search_test);
});

test/user_agent.js Normal file

@ -0,0 +1,144 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
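// Three adjacent local ports: the HTTP echo server, its HTTPS twin, and the MITM proxy in front of both.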
const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
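// The fake search engine simply echoes the request's User-Agent header back in the response body.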
fakeSearchEngine.get('/test-user_agent', (req, res) => {
debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
res.send(req.headers['user-agent']);
});
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
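// Re-route every proxied request to the local fake engine, whatever host the browser asked for.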
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('user_agent', function(){
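// Minimal Scraper subclass: it skips the real start page, loads the fake engine's echo endpoint
// and returns the page body, i.e. the User-Agent the browser actually sent.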
class MockScraperTestUserAgent extends Scraper {
async load_start_page(){
return true;
}
async search_keyword(){
await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
}
async parse_async(){
const bodyHandle = await this.page.$('body');
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
}
}
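// Keep the test output quiet; only scraper errors are logged.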
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Test user_agent option
*/
it('fixed user_agent', async function () {
const scrape_job = {
search_engine: MockScraperTestUserAgent,
keywords: ['javascript is hard'],
};
const scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
user_agent: 'THIS IS A USERAGENT 42.0'
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');
await scraper.quit();
});
/**
* Test random_user_agent option
* TODO generated user_agent should be different for each keyword
* TODO this test will sometimes fail because the generated user agents are not very random :-(
*/
it('random_user_agent', async function () {
const scrape_job = {
search_engine: MockScraperTestUserAgent,
keywords: ['news'],
};
const NUMBER_OF_EXEC = 10;
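// Start a fresh ScrapeManager for every run so that a new random user agent is drawn each time.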
const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
const scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
random_user_agent: true,
});
await scraper.start();
const { results: { news } } = await scraper.scrape(scrape_job);
await scraper.quit();
return news['1'];
});
uaList.forEach((userAgent) => {
const uaParsed = UAParser(userAgent);
assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
assert(uaParsed.os.name, 'UserAgent should have a os name detected');
});
assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );
});
});
});