mirror of https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 15:13:13 +01:00

Merge branch 'master' of github.com:NikolaiT/se-scraper

This commit is contained in: branchy
commit 5a0eea201d
2 .gitignore (vendored)

@@ -79,3 +79,5 @@ typings/

.idea/
GoogleScraperPup.iml

.http-mitm-proxy
3 .gitmodules (vendored)

@@ -1,3 +0,0 @@
[submodule "src/puppeteer-cluster"]
	path = src/puppeteer-cluster
	url = https://github.com/NikolaiT/puppeteer-cluster
@@ -32,12 +32,6 @@ let browser_config = {
    verbose: true,
    // whether to start the browser in headless mode
    headless: true,
    // whether debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print all shit nobody wants to know
    debug_level: 1,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
@@ -30,12 +30,6 @@ let browser_config = {
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: true,
    headless: true,
    // whether debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print all shit nobody wants to know
    debug_level: 1,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: '',
    };
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 2,
        output_file: 'examples/results/gnold.json',
        google_news_old_settings: {
            gl: 'us', // The gl parameter determines the Google country to use for the query.
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/maps.json',
        test_evasion: false,
        block_assets: false,
@@ -3,7 +3,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
    let browser_config = {
        search_engine: 'google',
        debug_level: 1,
        random_user_agent: true,
        is_local: false,
        html_output: false,
@@ -5,7 +5,6 @@ const se_scraper = require('./../src/node_scraper.js');
        random_user_agent: true,
        write_meta_data: true,
        sleep_range: '[1,1]',
        debug_level: 1,
        headless: true,
        output_file: `examples/results/multiple_search_engines.json`
    };
@@ -3,7 +3,6 @@ const resolve = require('path').resolve;

(async () => {
    let browser_config = {
        debug_level: 1,
        test_evasion: false,
        log_http_headers: true,
        log_ip_address: true,
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/proxyresults.json',
        log_ip_address: true,
        // a file with one proxy per line. Example:
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        test_evasion: false,
        log_http_headers: false,
        log_ip_address: false,
@@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/data.json',
    };
478 package-lock.json (generated)

[Generated lockfile churn, summarized: express and its transitive dependencies (accepts, array-flatten, body-parser, content-disposition, content-type, cookie, cookie-signature, debug 2.6.9 / ms 2.0.0, depd, destroy, ee-first, encodeurl, escape-html, etag, finalhandler, forwarded, fresh, http-errors, iconv-lite, ipaddr.js, media-typer, merge-descriptors, methods, negotiator, on-finished, parseurl, path-to-regexp, proxy-addr, qs, range-parser, raw-body, safer-buffer, send, serve-static, setprototypeof, statuses, toidentifier, type-is, unpipe, utils-merge, vary) are now marked "dev": true. New runtime entries back the winston logger: winston 3.2.1, winston-transport, logform, triple-beam, one-time, stack-trace, text-hex, colors, color, color-string, colornames, colorspace, diagnostics, enabled, env-variable, fecha, fast-safe-stringify, kuler, simple-swizzle, is-arrayish, async 2.6.3 (with is-stream and color-name losing their dev flags accordingly); puppeteer-cluster 0.18.0 is now resolved from the npm registry instead of the removed git submodule. New dev entries back the test suite: http-mitm-proxy 0.8.2 (with node-forge, optimist, semaphore, ws 3.3.3, ultron), key-cert 1.0.1 (with pem, md5, charenc, crypt, os-tmpdir), bluebird 3.7.2, ua-parser-js 0.7.21. mime-db is bumped 1.40.0 → 1.42.0 and mime-types 2.1.24 → 2.1.25.]
14 package.json
@@ -5,7 +5,7 @@
  "homepage": "https://scrapeulous.com/",
  "main": "index.js",
  "scripts": {
    "test": "mocha test/static_tests/"
    "test": "mocha test test/modules"
  },
  "keywords": [
    "scraping",
@@ -23,17 +23,23 @@
  "dependencies": {
    "cheerio": "^1.0.0-rc.3",
    "debug": "^4.1.1",
    "express": "^4.17.1",
    "got": "^9.6.0",
    "lodash": "^4.17.14",
    "puppeteer": "^2.0.0",
    "puppeteer-cluster": "^0.18.0",
    "puppeteer-extra": "^2.1.3",
    "puppeteer-extra-plugin-stealth": "^2.2.2",
    "user-agents": "^1.0.378"
    "user-agents": "^1.0.378",
    "winston": "^3.2.1"
  },
  "devDependencies": {
    "bluebird": "^3.7.2",
    "chai": "^4.2.0",
    "chai-string": "^1.5.0",
    "mocha": "^6.1.4"
    "express": "^4.17.1",
    "http-mitm-proxy": "^0.8.2",
    "key-cert": "^1.0.1",
    "mocha": "^6.1.4",
    "ua-parser-js": "^0.7.21"
  }
}
@@ -1,66 +0,0 @@
/**

Test server with:

curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
  -d '{
    "browser_config": {
        "random_user_agent": true
    },
    "scrape_config": {
        "search_engine": "google",
        "keywords": ["test"],
        "num_pages": 1
    }
  }'

*/

const se_scraper = require('../index.js');
'use strict';
const express = require('express');

// Constants
const PORT = process.env.PORT || 3000;
const HOST = process.env.HOST || '0.0.0.0';

// App
const app = express();
app.use(express.json());

let browser_config = {
    random_user_agent: true,
    headless: true,
    debug_level: 1,
    sleep_range: '',
    puppeteer_cluster_config: {
        timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
        monitor: false,
        concurrency: 1, // one scraper per tab
        maxConcurrency: 1, // scrape with 5 tabs
    }
};

app.post('/', async (req, res) => {
    if (!req.body.browser_config || !req.body.scrape_config) {
        res.json({
            'status': 400,
            'msg': 'please specify browser_config and scrape_config'
        });
    } else {
        // overwrite standard browser config
        Object.assign(browser_config, req.body.browser_config);

        var scraper = new se_scraper.ScrapeManager(browser_config);
        await scraper.start();
        var results = await scraper.scrape(req.body.scrape_config);
        // console.dir(results, {depth: null, colors: true});
        await scraper.quit();

        res.send(results);
    }
});

app.listen(PORT, HOST);

console.log(`Running on http://${HOST}:${PORT}`);
55 src/concurrency-implementation.js (new file)
@@ -0,0 +1,55 @@
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
const debug = require('debug')('se-scraper:CustomConcurrency');
const { timeoutExecute } = require('puppeteer-cluster/dist/util');

const BROWSER_TIMEOUT = 5000;

class CustomConcurrency extends Browser {

    async init() {}
    async close() {}

    async workerInstance() {
        const options = this.options.perBrowserOptions.shift();
        debug('Launch puppeteer instance with options=%o', options);
        let chrome = await this.puppeteer.launch(options);
        let page;
        let context;

        return {
            jobInstance: async () => {
                await timeoutExecute(BROWSER_TIMEOUT, (async () => {
                    context = await chrome.createIncognitoBrowserContext();
                    page = await context.newPage();
                })());

                return {
                    resources: {
                        page,
                    },

                    close: async () => {
                        await timeoutExecute(BROWSER_TIMEOUT, context.close());
                    },
                };
            },

            close: async () => {
                await chrome.close();
            },

            repair: async () => {
                debug('Starting repair');
                try {
                    // will probably fail, but just in case the repair was not necessary
                    await chrome.close();
                } catch (e) {}

                // just relaunch as there is only one page per browser
                chrome = await this.puppeteer.launch(options);
            },
        };
    }
};

module.exports = CustomConcurrency;
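The class above follows puppeteer-cluster's custom-concurrency contract: init()/close() on the implementation itself, and workerInstance() returning a jobInstance factory plus close and repair handlers, with one browser per worker and a fresh incognito page per job. A minimal sketch of how such a class might be wired into a cluster, assuming puppeteer-cluster's documented support for passing a concurrency class and a perBrowserOptions array (which workerInstance() above consumes via shift()); the proxy addresses and URL are illustrative:

    const { Cluster } = require('puppeteer-cluster');
    const CustomConcurrency = require('./src/concurrency-implementation');

    (async () => {
        // A concurrency *class* can be passed in place of the built-in modes;
        // each worker pops one launch-options object off perBrowserOptions.
        const cluster = await Cluster.launch({
            concurrency: CustomConcurrency,
            maxConcurrency: 2,
            perBrowserOptions: [
                { headless: true, args: ['--proxy-server=http://127.0.0.1:3128'] }, // illustrative
                { headless: true, args: ['--proxy-server=http://127.0.0.1:3129'] },
            ],
        });

        await cluster.task(async ({ page, data: keyword }) => {
            await page.goto('https://example.com/?q=' + encodeURIComponent(keyword));
        });

        cluster.queue('some keyword');
        await cluster.idle();
        await cluster.close();
    })();

The practical payoff of this variant is that each worker gets its own launch options (for example a distinct proxy per browser), and repair() simply relaunches the whole browser, which stays cheap because every browser only ever hosts a single page.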
@@ -123,12 +123,9 @@ class BingScraper extends Scraper {
        }
    }

    try {
        await this.page.goto(startUrl);
        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
    } catch (e) {
        return false;
    }
    await this.page.goto(startUrl);
    await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });

    return true;
}
@@ -1,21 +0,0 @@
function log(config, loglevel, msg = null, cb = null) {
    if (typeof loglevel != "number") {
        throw Error('loglevel must be numeric.');
    }

    if (loglevel <= config.debug_level) {
        if (msg) {
            if (typeof msg == 'object') {
                console.dir(msg, {depth: null, colors: false});
            } else {
                console.log('[i] ' + msg);
            }
        } else if (cb) {
            cb();
        }
    }
}

module.exports = {
    log: log,
};
@@ -1,15 +1,18 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const debug = require('debug')('se-scraper:DuckduckgoScraper');

class DuckduckgoScraper extends Scraper {

    parse(html) {
        debug('parse');
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // perform queries
        const results = [];
        $('#links .result__body').each((i, link) => {
        const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
        $(organicSelector).each((i, link) => {
            results.push({
                link: $(link).find('.result__title .result__a').attr('href'),
                title: $(link).find('.result__title .result__a').text(),
@@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper {
    }

    async load_start_page() {
        debug('load_start_page');
        let startUrl = 'https://duckduckgo.com/';

        let startUrl = 'https://duckduckgo.com/?q=test';

        try {
            this.last_response = await this.page.goto(startUrl);
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
        this.last_response = await this.page.goto(startUrl);
        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });

        return true;
    }

    async search_keyword(keyword) {
        debug('search_keyword');
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
@@ -63,21 +64,19 @@ class DuckduckgoScraper extends Scraper {
    }

    async next_page() {
        let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT});
        debug('next_page');
        let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();
        try {
            await this.page.waitForNavigation({timeout: this.STANDARD_TIMEOUT});
        } catch(e) {
            return false;
        }
        await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });

        return true;
    }

    async wait_for_results() {
        debug('wait_for_results');
        await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
    }
@@ -2,8 +2,6 @@

const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;

class GoogleScraper extends Scraper {

@@ -243,7 +241,7 @@ class GoogleScraper extends Scraper {
        }
    }

    log(this.config, 1, 'Using startUrl: ' + startUrl);
    this.logger.info('Using startUrl: ' + startUrl);

    this.last_response = await this.page.goto(startUrl);

@@ -642,7 +640,7 @@ class GoogleMapsScraper extends Scraper {
        this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
    }

    log(this.config, 1, 'Using startUrl: ' + startUrl);
    this.logger.info('Using startUrl: ' + startUrl);

    this.last_response = await this.page.goto(startUrl);

@@ -681,7 +679,7 @@ class GoogleMapsScraper extends Scraper {

    let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;

    log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
    this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`);

    await this.page.waitForFunction((last_title) => {
        const res = document.querySelectorAll('.section-result .section-result-title span');
@@ -775,7 +773,7 @@ class GoogleShoppingScraper extends Scraper {
        }
    }

    log(this.config, 1, 'Using startUrl: ' + startUrl);
    this.logger.info('Using startUrl: ' + startUrl);

    this.last_response = await this.page.goto(startUrl);
@@ -1,8 +1,6 @@
'use strict';
const meta = require('./metadata.js');
const common = require('./common.js');
var log = common.log;

const debug = require('debug')('se-scraper:Scraper');
/*
Get useful JS knowledge and get awesome...

@@ -12,6 +10,7 @@ var log = common.log;

module.exports = class Scraper {
    constructor(options = {}) {
        debug('constructor');
        const {
            config = {},
            context = {},
@@ -26,6 +25,7 @@ module.exports = class Scraper {
        };
        this.pluggable = pluggable;
        this.config = config;
        this.logger = this.config.logger;
        this.context = context;

        this.proxy = config.proxy;
@@ -50,7 +50,9 @@ module.exports = class Scraper {
        }
    }

    async run({page, data}) {
    async run({page, data, worker}) {

        debug('worker=%o', worker, this.config.keywords);

        if (page) {
            this.page = page;
@@ -113,25 +115,25 @@ module.exports = class Scraper {

        if (this.config.log_http_headers === true) {
            this.metadata.http_headers = await meta.get_http_headers(this.page);
            log(this.config, 2, this.metadata.http_headers);
            debug('this.metadata.http_headers=%O', this.metadata.http_headers);
        }

        if (this.config.log_ip_address === true) {
            let ipinfo = await meta.get_ip_data(this.page);
            this.metadata.ipinfo = ipinfo;
            log(this.config, 2, this.metadata.ipinfo);
            debug('this.metadata.ipinfo', this.metadata.ipinfo);
        }

        // check that our proxy is working by confirming
        // that ipinfo.io sees the proxy IP address
        if (this.proxy && this.config.log_ip_address === true) {
            log(this.config, 3, `${this.metadata.ipinfo.ip} vs ${this.proxy}`);
            debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);

            // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
            if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
                throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`);
            } else {
                log(this.config, 1, `Using valid Proxy: ${this.proxy}`);
                this.logger.info(`Using valid Proxy: ${this.proxy}`);
            }

        }
@@ -179,7 +181,7 @@ module.exports = class Scraper {

        do {

            log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
            this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);

            await this.wait_for_results();

@@ -191,6 +193,13 @@ module.exports = class Scraper {
            let parsed = this.parse(html);
            this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);

            if (this.config.screen_output) {
                this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
                    encoding: 'base64',
                    fullPage: false,
                });
            }

            if (this.config.html_output) {

                if (this.config.clean_html_output) {
@@ -237,13 +246,6 @@ module.exports = class Scraper {
                this.results[keyword][this.page_num].html = html_contents;
            }

            if (this.config.screen_output) {
                this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
                    encoding: 'base64',
                    fullPage: false,
                });
            }

            this.page_num += 1;

            // only load the next page when we will pass the next iteration
@@ -263,28 +265,21 @@ module.exports = class Scraper {

        } catch (e) {

            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);
            this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
            debug('this.last_response=%O', this.last_response);

            if (this.last_response) {
                log(this.config, 2, this.last_response);
            }

            if (this.config.debug_level > 2) {
                try {
                    // Try to save a screenshot of the error
                    await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`});
                } catch (e) {
                }
            if (this.config.take_screenshot_on_error) {
                await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
            }

            this.metadata.scraping_detected = await this.detected();

            if (this.metadata.scraping_detected === true) {
                console.error(`${this.config.search_engine_name} detected the scraping!`);
                this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);

                if (this.config.is_local === true) {
                    await this.sleep(this.SOLVE_CAPTCHA_TIME);
                    console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
                    this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
                    // expect that user filled out necessary captcha
                } else {
                    if (this.config.throw_on_detection === true) {
@@ -318,7 +313,7 @@ module.exports = class Scraper {
                baseUrl += `${key}=${settings[key]}&`
            }

            log(this.config, 1, 'Using startUrl: ' + baseUrl);
            this.logger.info('Using startUrl: ' + baseUrl);

            return baseUrl;
        }
@@ -335,7 +330,7 @@ module.exports = class Scraper {
    async random_sleep() {
        const [min, max] = this.config.sleep_range;
        let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
        log(this.config, 1, `Sleeping for ${rand}s`);
        this.logger.info(`Sleeping for ${rand}s`);
        await this.sleep(rand * 1000);
    }

@@ -349,7 +344,7 @@ module.exports = class Scraper {
    no_results(needles, html) {
        for (let needle of needles) {
            if (html.includes(needle)) {
                console.log(this.config, 2, `HTML contains needle ${needle}. no_results=true`);
                this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
                return true;
            }
        }
|
||||
|
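The se_scraper.js changes above split logging into two channels: user-facing messages go through the injected winston logger (this.logger), while developer traces use the debug package and stay silent unless the matching namespace is enabled. A minimal sketch of how the two channels behave, assuming only the winston and debug modules required above:

    const debug = require('debug')('se-scraper:Scraper');
    const { createLogger, transports } = require('winston');

    const logger = createLogger({ transports: [new transports.Console({ level: 'info' })] });

    logger.info('user-facing message, printed by default');
    debug('developer trace, printed only when DEBUG=se-scraper:* is set');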
@ -1,8 +1,6 @@
'use strict';
const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;
class YandexScraper extends Scraper {
@ -75,7 +73,7 @@ class YandexScraper extends Scraper {
async load_start_page() {
let startUrl = 'https://yandex.com';
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
@ -1,7 +1,12 @@
'use strict';
var fs = require('fs');
var os = require("os");
const fs = require('fs');
const os = require('os');
const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const { Cluster } = require('puppeteer-cluster');
const UserAgent = require('user-agents');
const google = require('./modules/google.js');
@ -9,9 +14,7 @@ const bing = require('./modules/bing.js');
const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js');
const duckduckgo = require('./modules/duckduckgo.js');
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const common = require('./modules/common.js');
var log = common.log;
const CustomConcurrencyImpl = require('./concurrency-implementation');
const MAX_ALLOWED_BROWSERS = 6;
@ -63,7 +66,7 @@ class ScrapeManager {
this.scraper = null;
this.context = context;
this.config = {
this.config = _.defaults(config, {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
@ -80,17 +83,38 @@ class ScrapeManager {
// which search engine to scrape
search_engine: 'google',
search_engine_name: 'google',
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
logger: createLogger({
level: 'info',
format: combine(
timestamp(),
printf(({ level, message, timestamp }) => {
return `${timestamp} [${level}] ${message}`;
})
),
transports: [
new transports.Console()
]
}),
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,
// specify flags passed to chrome here
chrome_flags: [],
// About our default values see https://peter.sh/experiments/chromium-command-line-switches/
chrome_flags: [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certificate-errors',
'--ignore-certificate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1040',
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
],
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
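Because the configuration object is now merged with _.defaults, any key the caller supplies wins over the defaults above, including the logger itself. A minimal sketch of overriding the default console logger (assumed usage; ScrapeManager construction appears in the tests later in this diff):

    const { createLogger, transports } = require('winston');
    const se_scraper = require('se-scraper'); // assumed package name; the tests use require('../')

    const scraper = new se_scraper.ScrapeManager({
        // replaces the default createLogger(...) shown above, thanks to _.defaults
        logger: createLogger({ transports: [new transports.Console({ level: 'warn' })] }),
        keywords: ['nodejs rocks'],
    });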
@ -115,10 +139,8 @@ class ScrapeManager {
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
@ -138,14 +160,9 @@ class ScrapeManager {
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 1,
}
};
});
this.config.proxies = [];
// overwrite default config
for (var key in config) {
this.config[key] = config[key];
}
this.logger = this.config.logger;
if (config.sleep_range) {
// parse an array
@ -160,12 +177,20 @@ class ScrapeManager {
this.config.keywords = read_keywords_from_file(this.config.keyword_file);
}
if (fs.existsSync(this.config.proxy_file)) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
log(this.config, 1, `${this.config.proxies.length} proxies read from file.`);
if (this.config.proxies && this.config.proxy_file) {
throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
}
log(this.config, 2, this.config);
if (this.config.proxy_file) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
this.logger.info(`${this.config.proxies.length} proxies read from file.`);
}
if (!this.config.proxies && this.config.use_proxies_only) {
throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
}
debug('this.config=%O', this.config);
}
/*
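A short sketch of the two proxy configuration paths this new validation enforces (the file path is hypothetical):

    // Option 1: pass the list directly
    let config_a = { proxies: ['socks5://78.94.172.42:1080', 'http://118.174.233.10:48400'] };

    // Option 2: read one proxy per line from a file
    let config_b = { proxy_file: '/tmp/proxies.txt' }; // hypothetical path

    // Supplying both now throws:
    // Error: Either use a proxy_file or specify a proxy for all connections. Do not use both options.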
@ -193,135 +218,72 @@ class ScrapeManager {
}
}
// See here: https://peter.sh/experiments/chromium-command-line-switches/
var default_chrome_flags = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1040',
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
];
var chrome_flags = default_chrome_flags.slice(); // copy that
if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
chrome_flags = this.config.chrome_flags;
}
var user_agent = null;
if (this.config.user_agent) {
user_agent = this.config.user_agent;
}
if (this.config.random_user_agent) {
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
user_agent = userAgent.toString();
}
if (user_agent) {
chrome_flags.push(
`--user-agent=${user_agent}`
)
}
if (this.config.proxy) {
if (this.config.proxies && this.config.proxies.length > 0) {
console.error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
return false;
}
chrome_flags.push(
'--proxy-server=' + this.config.proxy,
)
}
var launch_args = {
args: chrome_flags,
headless: this.config.headless,
ignoreHTTPSErrors: true,
};
log(this.config, 2, `Using the following puppeteer configuration: ${launch_args}`);
const chrome_flags = _.clone(this.config.chrome_flags);
if (this.pluggable && this.pluggable.start_browser) {
launch_args.config = this.config;
this.browser = await this.pluggable.start_browser(launch_args);
this.browser = await this.pluggable.start_browser({
config: this.config,
});
this.page = await this.browser.newPage();
} else {
// if no custom start_browser functionality was given
// use puppeteer-cluster for scraping
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
var perBrowserOptions = [];
// the first browser this.config with home IP
if (!this.config.use_proxies_only) {
perBrowserOptions.push(launch_args);
}
let proxies;
// if we have at least one proxy, always use CONCURRENCY_BROWSER
// and set maxConcurrency to this.config.proxies.length + 1
// else use whatever this.configuration was passed
if (this.config.proxies.length > 0) {
this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER;
if (this.config.proxies && this.config.proxies.length > 0) {
// because we use real browsers, we ran out of memory on normal laptops
// when using more than maybe 5 or 6 browsers.
// therefore hardcode a limit here
// TODO not sure this is what we want
this.numClusters = Math.min(
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
MAX_ALLOWED_BROWSERS
);
proxies = _.clone(this.config.proxies);
log(this.config, 1, `Using ${this.numClusters} clusters.`);
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
for (var proxy of this.config.proxies) {
perBrowserOptions.push({
headless: this.config.headless,
ignoreHTTPSErrors: true,
args: chrome_flags.concat(`--proxy-server=${proxy}`)
})
// Insert a first config without proxy if use_proxies_only is false
if (this.config.use_proxies_only === false) {
proxies.unshift(null);
}
} else {
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
proxies = _.times(this.numClusters, () => null);
}
// Give the per browser options each a random user agent when random user agent is set
while (perBrowserOptions.length < this.numClusters) {
const userAgent = new UserAgent();
perBrowserOptions.push({
this.logger.info(`Using ${this.numClusters} clusters.`);
// Give the per browser options
const perBrowserOptions = _.map(proxies, (proxy) => {
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
if (proxy) {
args = args.concat([`--proxy-server=${proxy}`]);
}
return {
headless: this.config.headless,
ignoreHTTPSErrors: true,
args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`)
})
}
args
};
});
if (this.config.debug_level >= 2) {
console.dir(perBrowserOptions)
}
debug('perBrowserOptions=%O', perBrowserOptions)
this.cluster = await Cluster.launch({
monitor: this.config.puppeteer_cluster_config.monitor,
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
concurrency: this.config.puppeteer_cluster_config.concurrency,
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
puppeteerOptions: launch_args,
perBrowserOptions: perBrowserOptions,
});
this.cluster.on('taskerror', (err, data) => {
console.log(`Error while scraping ${data}: ${err.message}`);
console.log(err);
concurrency: CustomConcurrencyImpl,
maxConcurrency: this.numClusters,
puppeteerOptions: {
perBrowserOptions: perBrowserOptions
}
});
}
}
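For clarity, this is roughly what the _.map over proxies above yields for one proxy with use_proxies_only set to false (illustrative values; the user agents are random when random_user_agent is on):

    // proxies = [null, 'socks5://78.94.172.42:1080']
    // perBrowserOptions = [
    //   { headless: true, ignoreHTTPSErrors: true,
    //     args: [...chrome_flags, '--user-agent=<UA 1>'] },                                             // direct connection
    //   { headless: true, ignoreHTTPSErrors: true,
    //     args: [...chrome_flags, '--user-agent=<UA 2>', '--proxy-server=socks5://78.94.172.42:1080'] } // proxied browser
    // ]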
@ -332,8 +294,7 @@ class ScrapeManager {
async scrape(scrape_config = {}) {
if (!scrape_config.keywords && !scrape_config.keyword_file) {
console.error('Either keywords or keyword_file must be supplied to scrape()');
return false;
throw new Error('Either keywords or keyword_file must be supplied to scrape()');
}
Object.assign(this.config, scrape_config);
@ -345,10 +306,7 @@ class ScrapeManager {
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
if (this.config.keywords && this.config.search_engine) {
log(this.config, 1,
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
}
this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
if (this.pluggable && this.pluggable.start_browser) {
@ -377,26 +335,21 @@ class ScrapeManager {
chunks[k % this.numClusters].push(this.config.keywords[k]);
}
let execPromises = [];
let scraperInstances = [];
for (var c = 0; c < chunks.length; c++) {
this.config.keywords = chunks[c];
debug('chunks=%o', chunks);
if (this.config.use_proxies_only) {
this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy
} else if(c > 0) {
this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address
}
let execPromises = [];
for (var c = 0; c < chunks.length; c++) {
const config = _.clone(this.config);
config.keywords = chunks[c];
var obj = getScraper(this.config.search_engine, {
config: this.config,
config: config,
context: {},
pluggable: this.pluggable,
});
var boundMethod = obj.run.bind(obj);
execPromises.push(this.cluster.execute({}, boundMethod));
scraperInstances.push(obj);
}
let promiseReturns = await Promise.all(execPromises);
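A worked example of the round-robin chunking above, assuming numClusters = 2 and keywords = ['a', 'b', 'c', 'd', 'e'] (illustrative values):

    // chunks[0] = ['a', 'c', 'e']   // indices with k % 2 === 0
    // chunks[1] = ['b', 'd']        // indices with k % 2 === 1
    // each chunk is handed to one cluster task together with its own cloned config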
@ -412,8 +365,8 @@ class ScrapeManager {
let timeDelta = Date.now() - startTime;
let ms_per_request = timeDelta/num_requests;
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
if (this.pluggable && this.pluggable.handle_results) {
await this.pluggable.handle_results(results);
@ -423,14 +376,14 @@ class ScrapeManager {
metadata.ms_per_keyword = ms_per_request.toString();
metadata.num_requests = num_requests;
log(this.config, 2, metadata);
debug('metadata=%O', metadata);
if (this.pluggable && this.pluggable.handle_metadata) {
await this.pluggable.handle_metadata(metadata);
}
if (this.config.output_file) {
log(this.config, 1, `Writing results to ${this.config.output_file}`);
this.logger.info(`Writing results to ${this.config.output_file}`);
write_results(this.config.output_file, JSON.stringify(results, null, 4));
}
@ -1 +0,0 @@
Subproject commit 221e6821d1d5d8c57bdf7b2cfef71d64dbf006a2
101
test/html_output.js
Normal file
@ -0,0 +1,101 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both an http and an https server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});

await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});

describe('html_output', function(){

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

/**
* Test html_output option
*/
it('html_output single page single keyword', async function () {

const scrape_job = {
search_engine: 'google',
/* TODO refactor start_url
google_settings: {
start_url: 'http://localhost:' + httpPort
},
*/
keywords: ['test keyword'],
};

var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
html_output: true,
//clean_html_output: false,
//clean_data_images: false,
// TODO refactor start_url so we can use it instead of depending on the proxy for this test
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();

assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');

});

});

});
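The test above never touches the real network: the MITM proxy rewrites every request the browser makes so that it lands on the local express mock. The core of that redirection, as a minimal sketch using the same http-mitm-proxy API as above:

    const Proxy = require('http-mitm-proxy');

    const proxy = Proxy();
    proxy.onRequest((ctx, callback) => {
        // send every intercepted request to the local express mock instead
        ctx.proxyToServerRequestOptions.host = 'localhost';
        ctx.proxyToServerRequestOptions.port = ctx.isSSL ? 3013 : 3012; // the ports defined above
        return callback();
    });
    proxy.listen({ port: 3014 });
    // the browser is then launched with --proxy-server=http://localhost:3014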
23
test/mocks/bing/index.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
40
test/mocks/bing/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
148
test/mocks/duckduckgo/index.html
Normal file
@ -0,0 +1,148 @@
<!DOCTYPE html>
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->

<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
<meta name="HandheldFriendly" content="true"/>

<link rel="canonical" href="https://duckduckgo.com/">

<link rel="stylesheet" href="/s1847.css" type="text/css">
<link rel="stylesheet" href="/o1847.css" type="text/css">

<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
<link rel="manifest" href="/manifest.json"/>

<meta name="twitter:card" content="summary">
<meta name="twitter:site" value="@duckduckgo">

<meta property="og:url" content="https://duckduckgo.com/" />
<meta property="og:site_name" content="DuckDuckGo" />
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">

<title>DuckDuckGo — Privacy, simplified.</title>
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />

<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">

</head>
<body id="pg-index" class="page-index body--home">
<script type="text/javascript">
var settings_js_version = "/s2475.js",
locale = "en_US";
</script>
<script type="text/javascript" src="/lib/l113.js"></script>
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
<script type="text/javascript" src="/util/u418.js"></script>
<script type="text/javascript" src="/d2727.js"></script>

<script type="text/javascript">
DDG.page = new DDG.Pages.Home();
</script>

<div class="site-wrapper site-wrapper--home js-site-wrapper">

<div class="header-wrap--home js-header-wrap">
<div class="header--aside js-header-aside"></div>
<div class="js-header-home-search header-wrap--home__search">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>

<form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
<input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div class="search__hidden js-search-hidden"></div>
</form>
</div>
</div>
<div id="" class="content-wrap--home">
<div id="content_homepage" class="content--home">
<div class="cw--c">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>

<div class="search-wrap--home">
<form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
<input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
</form>
</div>

<!-- en_US All Settings -->
<noscript>
<div class="tag-home">
<div class="tag-home__wrapper">
<div class="tag-home__item">
The search engine that doesn't track you.
<span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
</div>
</div>
</div>
</noscript>
<div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
<div id="error_homepage"></div>

</div> <!-- cw -->
</div> <!-- content_homepage //-->
</div> <!-- content_wrapper_homepage //-->
<div id="footer_homepage" class="foot-home js-foot-home"></div>

<script type="text/javascript">
{function seterr(str) {
var error=document.getElementById('error_homepage');
error.innerHTML=str;
$(error).css('display','block');
}
var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' Please try again');};

if (kurl) {
document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
}
</script>

</div> <!-- site-wrapper -->
</body>
</html>
3
test/mocks/duckduckgo/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
358
test/mocks/google/index.html
Normal file
File diff suppressed because one or more lines are too long
209
test/mocks/google/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
206
test/mocks/google/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
191
test/mocks/google/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
123
test/modules/bing.js
Normal file
@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
debug('q=%s', req.query.q);
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));

describe('Module Bing', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both an http and an https server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});

await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});

let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});

afterEach(async function(){
await browser.close();
});

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

it('one keyword one page', function(){
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
});
});

it('one keyword 3 pages', function () {
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
});
});

});
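These module tests drive a scraper class directly with run({page}) on a plain puppeteer page, without going through the ScrapeManager. A minimal standalone sketch of that pattern (the require path and keyword are assumptions; inside this repo the tests use '../../src/modules/bing'):

    const puppeteer = require('puppeteer');
    const { createLogger, transports } = require('winston');
    const { BingScraper } = require('se-scraper/src/modules/bing'); // assumed path

    (async () => {
        const browser = await puppeteer.launch();
        const page = await browser.newPage();
        const scraper = new BingScraper({
            config: {
                search_engine_name: 'bing',
                keywords: ['example keyword'], // assumed keyword
                logger: createLogger({ transports: [new transports.Console()] }),
            },
        });
        const { results, num_requests } = await scraper.run({ page });
        console.log(num_requests, results);
        await browser.close();
    })();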
140
test/modules/duckduckgo.js
Normal file
@ -0,0 +1,140 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.use(express.urlencoded({ extended: true }))
fakeSearchEngine.get('/', (req, res, next) => {
if(!req.query.q){
return next();
}
debug('q=%s page=%d', req.query.q, req.query.page);
const pageNumber = req.query.page;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.post('/html', (req, res) => {
debug('body=%o', req.body);
const pageNumber = 1;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']}));

describe('Module DuckDuckGo', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both an http and an https server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('proxy askedHost=%s method=%s url=%s toPort=%s',
ctx.clientToProxyRequest.headers.host,
ctx.clientToProxyRequest.method,
ctx.clientToProxyRequest.url,
ctx.proxyToServerRequestOptions.port
);
return callback();
});

await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});

let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});

afterEach(async function(){
await browser.close();
});

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

it('one keyword one page', function(){
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'duckduckgo',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});

it('one keyword 3 pages', function () {
this.timeout(4000);
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'duckduckgo',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
debug('results page 1 %O', results['test keyword']['1'].results);
debug('results page 2 %O', results['test keyword']['2'].results);
assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
});
});

});
123
test/modules/google.js
Normal file
@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Module Google', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both an http and an https server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});

await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});

let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});

afterEach(async function(){
await browser.close();
});

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

it('one keyword one page', function(){
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});

it('one keyword 3 pages', function () {
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
});
});

});
161
test/proxy.js
Normal file
@ -0,0 +1,161 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
debug('fake-search-engine req.hostname=%s', req.hostname);
//debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
});

describe('Config', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here we mount our fake engine on both an http and an https server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});

await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});

describe('proxies', function(){

class MockScraperTestProxy extends Scraper {

async load_start_page(){
return true;
}

async search_keyword(){
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
}

async parse_async(){
const bodyHandle = await this.page.$('body');
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
}
}

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

/**
* Jobs will be executed 2 by 2 through the proxy and the direct connection
* THIS TEST NEEDS test.local mapped to 127.0.0.1 in /etc/hosts, because Chrome bypasses the proxy for localhost even when one is set
*/
it('one proxy given, use_proxies_only=false', async function () {

const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};

var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
// default is use_proxies_only: false,
logger: testLogger,
});
await scraper.start();

const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'test.local');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'test.local');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'test.local');

await scraper.quit();
});

/**
* Jobs will be executed 1 by 1 through the proxy
*/
it('one proxy given, use_proxies_only=true', async function () {

const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};

var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();

const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');

await scraper.quit();
});

it('zero proxy given, use_proxies_only=true', async function () {

const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};

await assert.rejects(async () => {
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();
}, /Must provide at least one proxy in proxies if you enable use_proxies_only/);

});

});

});
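Why the first test expects alternating hostnames: with use_proxies_only left false, the manager starts proxies.length + 1 browsers (one direct, one per proxy), and the keywords are chunked round-robin across them, as in the node_scraper.js diff above:

    // chunk 0 (direct connection): ['news', 'i work too much', 'javascript is hard'] -> 'test.local'
    // chunk 1 (through the proxy): ['some stuff', 'what to do?']                     -> 'ProxiedThroughFakeEngine'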
@ -1,15 +0,0 @@
## Test with static HTML

Dynamic testing of se-scraper takes too much time.

Save some HTML and initialize se-scraper by loading the search results from disk.

### Disadvantage

Static HTML gets outdated after some time.

### Advantages

1. Lets us test corner cases that are otherwise easily missed
2. Makes tests reproducible, since live search engines do not always return the same results for the same query
3. As said, much faster
@ -1,222 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function bing_ads() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    bing_search_with_ads( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['best cloud services'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');

    bing_search_with_ads2( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['car tires cheap'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');

    bing_search_with_ads3( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['service auto garage'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');

    bing_search_with_ads4( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function bing_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '1’100’000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 7, 'there must be at least 7 ads');

            assert.isAtLeast(obj.right_side_ads.length, 5, 'there must be at least 5 right side ads');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '44’300’000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 7, 'there must be at least 7 ads');

            assert.isAtLeast(obj.right_side_ads.length, 5, 'there must be at least 5 right side ads');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads3(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there must be at least 3 ads');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads4(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there must be at least 3 ads');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }

    for (let res of obj.right_side_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }
}

describe('Bing', function(){
    this.timeout(15000);
    it('static bing searches with ads', bing_ads);
});
@ -1,173 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
const cheerio = require('cheerio');

async function test_html_output() {
    let config = {
        debug_level: 1,
        headless: true,
        html_output: true,
        // whether to strip JS and CSS from the html_output
        // has only an effect if `html_output` is true
        clean_html_output: true,
        // remove all data images from the html
        clean_data_images: true,
        // test compression
        compress: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    var response = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleaned = await scraper.scrape(scrape_config);

    test(response, response_no_cleaned, 'bing');

    scrape_config.search_engine = 'google';
    scrape_config.keywords = ['rückspiegel schwarz'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    var responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');

    scrape_config.keywords = ['cloud services'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    var responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');

    await scraper.quit();
}

function test(response, response_no_cleaned, se='google') {
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            let obj = response.results[query][page_number];
            let obj_no_cleaned = response_no_cleaned.results[query][page_number];

            console.log('html length of non-cleaned SERP: ' + obj_no_cleaned.html.length);
            console.log('html length of cleaned SERP: ' + obj.html.length);

            assert.isOk(obj.html, 'html must be ok');
            assert.isAtLeast(obj.html.length, 100, 'html must be a string of at least 100 chars');

            assert.isOk(obj_no_cleaned.html, 'html must be ok');
            assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a string of at least 100 chars');

            assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');

            // test that we can parse the html of both the cleaned and non-cleaned versions
            // with cheerio and that the SERP results are roughly the same

            const cleaned$ = cheerio.load(obj.html);
            const no_cleaned$ = cheerio.load(obj_no_cleaned.html);

            var resCleaned = parseResults(cleaned$, se);
            var resNoCleaned = parseResults(no_cleaned$, se);

            assert.equal(resCleaned.length, resNoCleaned.length);
            assert.equal(resCleaned.length, obj.results.length);
            assert.equal(resNoCleaned.length, obj.results.length);

            // unset the rank before comparing
            resCleaned.forEach((el) => el.rank = undefined);
            resNoCleaned.forEach((el) => el.rank = undefined);
            obj.results.forEach((el) => el.rank = undefined);

            assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
            assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results');
            assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results');
        }
    }
}

function parseResults(s$, se) {
    var results = [];

    if (se === 'google') {
        s$('#center_col .g').each((i, link) => {
            results.push({
                link: s$(link).find('.r a').attr('href'),
                title: s$(link).find('.r a').text(),
                snippet: s$(link).find('span.st').text(),
                visible_link: s$(link).find('.r cite').text(),
                date: s$(link).find('span.f').text() || '',
            })
        });
    } else if (se === 'bing') {
        s$('#b_content #b_results .b_algo').each((i, link) => {
            results.push({
                link: s$(link).find('h2 a').attr('href'),
                title: s$(link).find('h2').text(),
                snippet: s$(link).find('.b_caption p').text(),
                visible_link: s$(link).find('cite').text(),
            })
        });
    } else {
        throw new Error('no such search engine');
    }

    results = clean_results(results, ['title', 'link', 'snippet']);
    return results;
}

function clean_results(results, attributes) {
    const cleaned = [];
    var rank = 1;
    for (var res of results) {
        let goodboy = true;
        for (var attr of attributes) {
            if (!res[attr] || !res[attr].trim()) {
                goodboy = false;
                break;
            }
        }
        if (goodboy) {
            res.rank = rank++;
            cleaned.push(res);
        }
    }
    return cleaned;
}

describe('html output', function(){
    this.timeout(15000);
    it('static html output test', test_html_output);
});
@ -1,24 +0,0 @@
'use strict';
const zlib = require('zlib');
const fs = require('fs');
const path = require('path');

var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];

for (var file of files) {
    var html = fs.readFileSync(path.resolve(__dirname, './html/' + file));

    var compressed = zlib.gzipSync(html);
    var deflated = zlib.deflateSync(html);

    var compressed_encoded = compressed.toString('base64');
    var deflated_encoded = deflated.toString('base64');

    console.log(file);
    console.log('Normal length: ' + html.length / 1000 + ' kB');
    console.log('GZIP compressed length: ' + compressed.length / 1000 + ' kB');
    console.log('Deflate compressed length: ' + deflated.length / 1000 + ' kB');
    console.log('Encoded GZIP compressed length: ' + compressed_encoded.length / 1000 + ' kB');
    console.log('Encoded Deflate compressed length: ' + deflated_encoded.length / 1000 + ' kB');
    console.log('------\n');
}
@ -1,99 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function duckduckgo() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    duckduckgo_normal( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

function duckduckgo_normal(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }
}

describe('Duckduckgo', function(){
    this.timeout(10000);
    it('static duckduckgo search', duckduckgo);
});
@ -1,410 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: ['rückspiegel schwarz'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    google_search_with_products( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
    scrape_config.keywords = ['autoreifen mercedes c-klasse'];

    google_search_with_products2( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
    scrape_config.keywords = ['kaffeemaschine kaufen'];

    google_places( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
    scrape_config.keywords = ['MODEL MARKET SW18 4ES'];

    right_side_info_text( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
    scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];

    right_side_info_text2( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
    scrape_config.keywords = ['car tires for sale'];

    google_places_and_ads( await scraper.scrape(scrape_config) );

    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
    scrape_config.keywords = ['bmw felgen'];

    google_ads2( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function google_search_with_products(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '1’780’000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 3, 'there must be at least 3 bottom ads');
            assert.isAtLeast(obj.top_products.length, 15, 'there must be at least 15 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_search_with_products2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 1, 'there must be at least 1 bottom ad');
            assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 4, 'there are 4 right products');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_places(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.equal(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.places.length, 3, 'there are 3 places');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function right_side_info_text(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');

            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function right_side_info_text2(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_places_and_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 13, 'there must be at least 13 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.places.length, 2, 'there are 2 places');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 3, 'there are 3 top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 9, 'there are 9 right products');
            assert.equal(obj.places.length, 0, 'there are 0 places');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.top_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.bottom_ads) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.top_products) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
    }

    for (let res of obj.right_products) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
    }

    for (let res of obj.places) {
        assert.isOk(res.heading, 'heading must be ok');
        assert.typeOf(res.heading, 'string', 'heading must be string');
        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');

        assert.isOk(res.rating, 'rating must be ok');
        assert.typeOf(res.rating, 'string', 'rating must be string');
        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');

        assert.isOk(res.contact, 'contact must be ok');
        assert.typeOf(res.contact, 'string', 'contact must be string');
        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');

        assert.typeOf(res.hours, 'string', 'hours must be string');
        if (res.hours) {
            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}

describe('Google', function() {
    this.timeout(25000);
    it('static google searches with products, ads and places', normal_search_test);
});
@ -1,213 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: ['in.linkedin.com/in/altanai'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    google_test_title( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function google_test_title(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '7.600', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');

            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);

            assert.equal(obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn');
            assert.equal(obj.results[1].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn');
            assert.equal(obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn');
            assert.equal(obj.results[4].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');

            assert.equal(obj.results[0].date, '27.07.2016');
            assert.equal(obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');

            assert.equal(obj.results[2].date, '27.07.2016');
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.top_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.bottom_ads) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.top_products) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
    }

    for (let res of obj.right_products) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
    }

    for (let res of obj.places) {
        assert.isOk(res.heading, 'heading must be ok');
        assert.typeOf(res.heading, 'string', 'heading must be string');
        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');

        assert.isOk(res.rating, 'rating must be ok');
        assert.typeOf(res.rating, 'string', 'rating must be string');
        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');

        assert.isOk(res.contact, 'contact must be ok');
        assert.typeOf(res.contact, 'string', 'contact must be string');
        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');

        assert.typeOf(res.hours, 'string', 'hours must be string');
        if (res.hours) {
            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}

describe('Google2', function(){
    this.timeout(10000);
    it('static google searches testing various details', normal_search_test);
});
@ -1,152 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function yandex_ads() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'yandex',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    yandex_search_with_ads( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['car tires cheap'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');

    yandex_search_with_ads2( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['купить деревянные окна'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');

    yandex_search_with_ads3( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function yandex_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '2 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function yandex_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '5 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function yandex_search_with_ads3(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            // console.dir(obj.results, {depth: null, colors: true});

            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            // at least 4 ads
            let cnt = 0;
            obj.results.forEach((res) => {
                if (res.is_ad) {
                    cnt++;
                }
            });

            assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }
}

describe('Yandex', function(){
    this.timeout(10000);
    it('static yandex searches with ads', yandex_ads);
});
@ -1,141 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['iphone', 'clock'];

async function normal_search_test() {
    let config = {
        compress: false,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'amazon',
        num_pages: 1,
        keywords: normal_search_keywords,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {
        let total_rank = 1;
        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.seller, 'seller must be ok');
                assert.typeOf(res.seller, 'string', 'seller must be string');
                assert.isAtLeast(res.seller.length, 5, 'seller must have at least 5 chars');

                assert.isOk(res.stars, 'stars must be ok');
                assert.typeOf(res.stars, 'string', 'stars must be string');
                assert.isAtLeast(res.stars.length, 5, 'stars must have at least 5 chars');
                assert.include(res.stars, ' out of ', 'stars must include " out of "');

                assert.isOk(res.num_reviews, 'num_reviews must be ok');
                assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
                assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 char');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'amazon',
        num_pages: 1,
        keywords: keywords_no_results,
    };

    console.log('no_results_test()');
    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no_results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

describe('Amazon', function(){
    this.timeout(30000);
    it('normal search test', normal_search_test);
    it('no results test', no_results_test);
});
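Note: the num_requests assertions in these suites follow from request
arithmetic, not magic numbers — the scraper issues one request per keyword
per page. A quick sketch of the expected value (hypothetical helper, not
part of the library):

    // expected requests = keywords.length * num_pages
    // e.g. for the Amazon suite above: 2 keywords * 1 page = 2
    function expected_requests(keywords, num_pages) {
        return keywords.length * num_pages;
    }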
@ -1,87 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['mouse', 'cat'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    };

    let scrape_config = {
        search_engine: 'baidu',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 4);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');

            assert.equal(obj.no_results, false, 'no_results should be false');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

describe('Baidu', function(){
    this.timeout(30000);
    it('normal search test', normal_search_test);
});
@ -1,271 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug_level: 1,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: normal_search_keywords,
        num_pages: 3,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 6);

    for (let query in response.results) {
        let total_rank = 1;
        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                if (res.snippet) {
                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
                }

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug_level: 1,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: keywords_no_results,
        num_pages: 1,
    };

    console.log('no_results_test()');
    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no_results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('effective_query_test()');
    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            // the effective query must differ from the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const ads_keywords = ['cloud services', 'buy shoes'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            for (let res of obj.ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                // assert.isOk(res.link, 'link must be ok');
                // assert.typeOf(res.link, 'string', 'link must be string');
                // assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }
        }
    }
}

describe('Bing', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('no results', no_results_test);
    it('effective query', effective_query_test);
    it('finds ads', ads_test);
});
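Note: 'mount everrest' in the effective-query tests is deliberately
misspelled so the engine returns a corrected ("effective") query. The
property under test is simply:

    // obj.effective_query is non-empty and differs from the keyword:
    // obj.effective_query !== query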
@ -1,192 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: true,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('test_case_effective_query()');
    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

            // the effective query must differ from the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const ads_keywords = ['cloud services', 'buy shoes'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            for (let res of obj.ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }
        }
    }
}

describe('Duckduckgo', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('effective query', effective_query_test);
    it('finds ads', ads_test);
});
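Note: the ads tests above set block_assets: false — presumably because ad
markup may not render fully when styles and scripts are blocked — while the
plain search tests keep asset blocking on for speed.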
@ -1,424 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: normal_search_keywords,
        num_pages: 3,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 6);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: keywords_no_results,
        num_pages: 1,
    };

    console.log('no_results_test()');
    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no_results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const effective_query_keywords = ['mount evverrest'];

async function effective_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('effective_query_test()');
    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            // the effective query must differ from the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

async function html_output_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: normal_search_keywords,
        num_pages: 3,
        html_output: true,
    };

    let output = await se_scraper.scrape(config, scrape_config);
    normal_search_test_case( output );
    check_html_output_test_case( output );
}

function check_html_output_test_case( response ) {
    for (let query in response.html_output) {

        assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.html_output[query]) {
            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
            assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
        }
    }
}

const ads_keywords = ['cloud services', 'auto kaufen'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false, // don't try to trick google with ads
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');

            for (let res of obj.top_ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.typeOf(res.links, 'array', 'links must be array');
            }

            for (let res of obj.bottom_ads) {
                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.typeOf(res.links, 'array', 'links must be array');
            }

        }
    }
}

const product_keywords = ['autoreifen bmw'];

async function products_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false, // don't try to trick google with ads
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: product_keywords,
        num_pages: 1,
    };

    console.log('products_test()');
    test_case_products_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_products_test(response) {
    // one keyword, one page => one request
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, product_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');

            for (let res of obj.top_products) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.vendor_link, 'vendor_link must be ok');
                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
            }

            for (let res of obj.right_products) {
                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.vendor_link, 'vendor_link must be ok');
                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
            }

        }
    }
}

describe('Google', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('no results', no_results_test);
    it('effective query', effective_query_test);
    it('html output query', html_output_query_test);
    it('ads', ads_test);
    it('products test', products_test);
});
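Note: assert.startsWith() used in check_html_output_test_case comes from the
chai-string plugin loaded at the top of this file; plain chai has no
startsWith assertion:

    const chai = require('chai');
    chai.use(require('chai-string')); // adds assert.startsWith and friends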
@ -1,80 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple', 'rain'];

async function normal_image_search_test() {
    let config = {
        compress: false,
        debug_level: 0,
        headless: true,
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google_image',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_image_search_test()');
    normal_image_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response in a separate handler function
function normal_image_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.clean_link, 'clean_link must be ok');
                assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
                assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

describe('Google Image', function(){
    this.timeout(30000);
    it('normal image search test', normal_image_search_test);
});
@ -1,91 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

const normal_search_keywords = ['apple juice'];

async function queryargs_search_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: true,
        verbose: true,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: true,
        output_file: '',
        block_assets: true,
        // use specific search engine parameters for various search engines
        google_settings: {
            google_domain: 'google.com',
            gl: 'fr', // The gl parameter determines the Google country to use for the query.
            hl: 'fr', // The hl parameter determines the Google UI language to return results.
            start: 30, // Determines the results offset to use, defaults to 0.
            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
    };

    console.log('queryargs_search_test()');
    await se_scraper.scrape(config, queryargs_search_test_case);
}

// we test with a callback function to our handler
function queryargs_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.metadata.num_requests, 2);

        for (let query in response.results) {
            let total_rank = 1;

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
                assert.equal(obj.no_results, false, 'no_results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.visible_link, 'visible_link must be ok');
                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be a number');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

describe('Google with query arguments', function(){
    this.timeout(30000);
    it('query args search test', queryargs_search_test);
});
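Note: with these google_settings the scraper should end up requesting a URL
along the lines of the following (illustrative only — the exact URL is
assembled inside the google module):

    https://www.google.com/search?q=apple+juice&gl=fr&hl=fr&start=30&num=100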
@ -1,217 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const quote_search_keywords = ['MSFT', 'AAPL'];
|
||||
|
||||
async function reuters_search_test() {
|
||||
let config = {
|
||||
search_engine: 'reuters',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: quote_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('reuters_search_test()');
|
||||
await se_scraper.scrape(config, reuters_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function reuters_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function cnbc_search_test() {
|
||||
let config = {
|
||||
search_engine: 'cnbc',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: quote_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('cnbc_search_test()');
|
||||
await se_scraper.scrape(config, cnbc_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function cnbc_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const marketwatch_search_keywords = ['MSFT'];
|
||||
|
||||
async function marketwatch_search_test() {
|
||||
let config = {
|
||||
search_engine: 'marketwatch',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: marketwatch_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('marketwatch_search_test()');
|
||||
await se_scraper.scrape(config, marketwatch_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function marketwatch_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.author, 'author must be ok');
|
||||
assert.typeOf(res.author, 'string', 'author must be string');
|
||||
assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||

describe('Ticker', function(){
    this.timeout(30000);
    it('Reuters search test', reuters_search_test);
    it('CNBC search test', cnbc_search_test);
    it('Marketwatch search test', marketwatch_search_test);
});
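
For reference, a minimal sketch of running this suite programmatically with
Mocha (assuming mocha is installed as a dev dependency; the file path below is
illustrative, not the repository's actual file name):

    // run_ticker_tests.js -- hypothetical runner script
    const Mocha = require('mocha');

    const mocha = new Mocha();
    mocha.addFile('test/ticker_search.js'); // hypothetical path of this test file
    mocha.run((failures) => {
        process.exitCode = failures ? 1 : 0;
    });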

144
test/user_agent.js
Normal file
@ -0,0 +1,144 @@

'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-user_agent', (req, res) => {
    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
    // echo the received User-Agent header back as the response body
    res.send(req.headers['user-agent']);
});
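
// Quick sanity check of the echo endpoint (illustrative only, not part of the
// suite; assumes the server above is already listening on httpPort):
//
//     http.get({ port: httpPort, path: '/test-user_agent',
//         headers: { 'User-Agent': 'demo/1.0' } },
//         res => res.on('data', chunk => console.log(chunk.toString()))); // "demo/1.0"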

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // mount our fake search engine behind both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // the MITM proxy rewrites every outbound request to hit the local fake
        // engine instead of the real network
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('user_agent', function(){

        // a mock scraper that skips any real search engine and just loads the
        // echo endpoint, so the page body ends up being the sent User-Agent
        class MockScraperTestUserAgent extends Scraper {

            async load_start_page(){
                return true;
            }

            async search_keyword(){
                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
            }

            async parse_async(){
                const bodyHandle = await this.page.$('body');
                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
            }
        }
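
        // The three overridden methods mirror the hooks an se-scraper engine
        // module implements: load_start_page() prepares the page,
        // search_keyword() issues the query, and parse_async() extracts the
        // results (an assumption about the Scraper base class contract,
        // inferred from the overrides above). Here they are stubbed so no
        // real engine is contacted.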

        // suppress everything below error level during the test run
        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test user_agent option
         */
        it('fixed user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['javascript is hard'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                user_agent: 'THIS IS A USERAGENT 42.0'
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');

            await scraper.quit();
        });
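
        // The assertion above holds because the mock scraper returns the page
        // body, which the fake engine set to the User-Agent it received. A
        // minimal sketch of the presumed mechanism inside a puppeteer-based
        // scraper (assumed, not se-scraper's verified internals):
        //
        //     await page.setUserAgent('THIS IS A USERAGENT 42.0');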

        /**
         * Test random_user_agent option
         * TODO the generated user_agent should be different for each keyword
         * TODO this test will sometimes fail because the user_agent is not very random :-(
         */
        it('random_user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['news'],
            };

            const NUMBER_OF_EXEC = 10;

            // run the scraper NUMBER_OF_EXEC times and collect the user agent
            // seen by the fake engine on each run
            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
                const scraper = new se_scraper.ScrapeManager({
                    throw_on_detection: true,
                    logger: testLogger,
                    random_user_agent: true,
                });
                await scraper.start();
                const { results: { news } } = await scraper.scrape(scrape_job);
                await scraper.quit();
                return news['1'];
            });

            // every collected user agent must parse as a plausible browser UA
            uaList.forEach((userAgent) => {
                const uaParsed = UAParser(userAgent);
                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
                assert(uaParsed.os.name, 'UserAgent should have an os name detected');
            });

            assert(_.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time');
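
            // How the frequency check above unwinds, on illustrative sample data:
            //   countBy(['A','B','A','C']) -> { A: 2, B: 1, C: 1 }
            //   toPairs()                  -> [['A', 2], ['B', 1], ['C', 1]]
            //   sortBy by count, last()    -> ['A', 2]   (the most frequent UA)
            // so the test fails if any single user agent accounts for 40% or
            // more of the NUMBER_OF_EXEC runs.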
        });

    });

});