forked from extern/se-scraper

commit 6b806dedfe
Merge pull request #61 from Monibrand/refactor/use-original-puppeteer-cluster
Refactor/use original puppeteer cluster

.gitignore (vendored, 2 changes)
@@ -79,3 +79,5 @@ typings/
 .idea/
 GoogleScraperPup.iml
 
+.http-mitm-proxy
+
.gitmodules (vendored, 3 deletions)
@@ -1,3 +0,0 @@
-[submodule "src/puppeteer-cluster"]
-	path = src/puppeteer-cluster
-	url = https://github.com/NikolaiT/puppeteer-cluster
package-lock.json (generated, 599 changes)
(Diff collapsed: generated lock file. It adds puppeteer-cluster 0.18.0 as a runtime dependency, and bluebird, express, http-mitm-proxy, key-cert, and ua-parser-js, together with their transitive packages, as dev dependencies.)
package.json (11 changes)
@@ -5,8 +5,7 @@
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
   "scripts": {
-    "postinstall": "cd src/puppeteer-cluster && npm install && npm run build",
-    "test": "mocha test/static_tests/"
+    "test": "mocha test test/modules"
   },
   "keywords": [
     "scraping",
@@ -27,14 +26,20 @@
     "got": "^9.6.0",
     "lodash": "^4.17.14",
     "puppeteer": "^2.0.0",
+    "puppeteer-cluster": "^0.18.0",
     "puppeteer-extra": "^2.1.3",
     "puppeteer-extra-plugin-stealth": "^2.2.2",
     "user-agents": "^1.0.378",
     "winston": "^3.2.1"
   },
   "devDependencies": {
+    "bluebird": "^3.7.2",
     "chai": "^4.2.0",
     "chai-string": "^1.5.0",
-    "mocha": "^6.1.4"
+    "express": "^4.17.1",
+    "http-mitm-proxy": "^0.8.2",
+    "key-cert": "^1.0.1",
+    "mocha": "^6.1.4",
+    "ua-parser-js": "^0.7.21"
   }
 }
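Dropping the submodule also removes the postinstall build: puppeteer-cluster now arrives prebuilt from npm. For code that imports it, the change is only the require path; a minimal before/after sketch (both require lines are taken from the ScrapeManager change further down):

// before: vendored submodule, compiled at install time by the postinstall hook
// const { Cluster } = require('./src/puppeteer-cluster/dist/index.js');

// after: the package published on npm ships its own dist build
const { Cluster } = require('puppeteer-cluster');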
@@ -1,66 +0,0 @@
-/**
-
- Test server with:
-
- curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
- -d '{
-    "browser_config": {
-        "random_user_agent": true
-    },
-    "scrape_config": {
-        "search_engine": "google",
-        "keywords": ["test"],
-        "num_pages": 1
-    }
- }'
-
- */
-
-const se_scraper = require('../index.js');
-'use strict';
-const express = require('express');
-
-// Constants
-const PORT = process.env.PORT || 3000;
-const HOST = process.env.HOST || '0.0.0.0';
-
-// App
-const app = express();
-app.use(express.json());
-
-let browser_config = {
-    random_user_agent: true,
-    headless : true,
-    debug_level: 1,
-    sleep_range: '',
-    puppeteer_cluster_config: {
-        timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
-        monitor: false,
-        concurrency: 1, // one scraper per tab
-        maxConcurrency: 1, // scrape with 5 tabs
-    }
-};
-
-app.post('/', async (req, res) => {
-    if (!req.body.browser_config || !req.body.scrape_config) {
-        res.json({
-            'status': 400,
-            'msg': 'please specify browser_config and scrape_config'
-        });
-    } else {
-        // overwrite standard browser config
-        Object.assign(browser_config, req.body.browser_config);
-
-        var scraper = new se_scraper.ScrapeManager(browser_config);
-        await scraper.start();
-        var results = await scraper.scrape(req.body.scrape_config);
-        // console.dir(results, {depth: null, colors: true});
-        await scraper.quit();
-
-        res.send(results);
-    }
-});
-
-app.listen(PORT, HOST);
-
-console.log(`Running on http://${HOST}:${PORT}`);
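The file removed above wrapped se-scraper in a small Express service; the ScrapeManager lifecycle it exercised (start, scrape, quit) is unchanged. A minimal sketch of driving it directly (the require path assumes the package is installed as se-scraper, and the destructured { results } return shape follows the new test further down):

const se_scraper = require('se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        random_user_agent: true,
        headless: true,
    });
    await scraper.start();

    const { results } = await scraper.scrape({
        search_engine: 'google',
        keywords: ['test'],
        num_pages: 1,
    });
    console.dir(results, { depth: null, colors: true });

    await scraper.quit();
})();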
src/concurrency-implementation.js (new file, 55 lines)
@@ -0,0 +1,55 @@
+const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
+const debug = require('debug')('se-scraper:CustomConcurrency');
+const { timeoutExecute } = require('puppeteer-cluster/dist/util');
+
+const BROWSER_TIMEOUT = 5000;
+
+class CustomConcurrency extends Browser {
+
+    async init() {}
+    async close() {}
+
+    async workerInstance() {
+        const options = this.options.perBrowserOptions.shift();
+        debug('Launch puppeteer instance with options=%o', options);
+        let chrome = await this.puppeteer.launch(options);
+        let page;
+        let context;
+
+        return {
+            jobInstance: async () => {
+                await timeoutExecute(BROWSER_TIMEOUT, (async () => {
+                    context = await chrome.createIncognitoBrowserContext();
+                    page = await context.newPage();
+                })());
+
+                return {
+                    resources: {
+                        page,
+                    },
+
+                    close: async () => {
+                        await timeoutExecute(BROWSER_TIMEOUT, context.close());
+                    },
+                };
+            },
+
+            close: async () => {
+                await chrome.close();
+            },
+
+            repair: async () => {
+                debug('Starting repair');
+                try {
+                    // will probably fail, but just in case the repair was not necessary
+                    await chrome.close();
+                } catch (e) {}
+
+                // just relaunch as there is only one page per browser
+                chrome = await this.puppeteer.launch(options);
+            },
+        };
+    }
+};
+
+module.exports = CustomConcurrency;
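puppeteer-cluster lets a class stand in for its built-in concurrency constants, which is how this implementation is plugged in: each worker shift()s one entry off puppeteerOptions.perBrowserOptions and owns one browser launched with it. A sketch of the wiring, matching the Cluster.launch call in the ScrapeManager change further down (the option values are illustrative):

const { Cluster } = require('puppeteer-cluster');
const CustomConcurrency = require('./concurrency-implementation');

(async () => {
    // one launch-options object per worker; each worker owns one browser
    const perBrowserOptions = [
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=agent-one'] },
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=agent-two', '--proxy-server=http://localhost:3014'] },
    ];

    const cluster = await Cluster.launch({
        concurrency: CustomConcurrency,           // a class here replaces Cluster.CONCURRENCY_PAGE etc.
        maxConcurrency: perBrowserOptions.length, // workerInstance() shift()s one entry per worker
        puppeteerOptions: { perBrowserOptions },
    });

    await cluster.task(async ({ page, data, worker }) => {
        console.log('worker', worker.id, 'visiting', data);
        await page.goto(data);
    });
    cluster.queue('https://example.com/');
    await cluster.idle();
    await cluster.close();
})();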
@@ -123,12 +123,9 @@ class BingScraper extends Scraper {
             }
         }
 
-        try {
-            await this.page.goto(startUrl);
-            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-        } catch (e) {
-            return false;
-        }
+        await this.page.goto(startUrl);
+        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
 
         return true;
     }
 
@@ -1,15 +1,18 @@
 const cheerio = require('cheerio');
 const Scraper = require('./se_scraper');
+const debug = require('debug')('se-scraper:DuckduckgoScraper');
 
 class DuckduckgoScraper extends Scraper {
 
     parse(html) {
+        debug('parse');
         // load the page source into cheerio
         const $ = cheerio.load(html);
 
         // perform queries
         const results = [];
-        $('#links .result__body').each((i, link) => {
+        const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
+        $(organicSelector).each((i, link) => {
             results.push({
                 link: $(link).find('.result__title .result__a').attr('href'),
                 title: $(link).find('.result__title .result__a').text(),
@@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper {
     }
 
     async load_start_page() {
+        debug('load_start_page');
-        let startUrl = 'https://duckduckgo.com/';
-
+        let startUrl = 'https://duckduckgo.com/?q=test';
 
-        try {
-            this.last_response = await this.page.goto(startUrl);
-            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-        } catch (e) {
-            return false;
-        }
+        this.last_response = await this.page.goto(startUrl);
+        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
 
         return true;
     }
 
     async search_keyword(keyword) {
+        debug('search_keyword');
         const input = await this.page.$('input[name="q"]');
         await this.set_input_value(`input[name="q"]`, keyword);
         await this.sleep(50);
@@ -63,21 +64,19 @@ class DuckduckgoScraper extends Scraper {
     }
 
     async next_page() {
-        let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT});
+        debug('next_page');
+        let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
         if (!next_page_link) {
             return false;
         }
         await next_page_link.click();
-        try {
-            await this.page.waitForNavigation({timeout: this.STANDARD_TIMEOUT});
-        } catch(e) {
-            return false;
-        }
+        await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
 
         return true;
     }
 
     async wait_for_results() {
+        debug('wait_for_results');
         await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
     }
 
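The new organicSelector handles DuckDuckGo's append-style pagination: later result pages are injected into the same #links container, so re-running '#links .result__body' would count earlier pages again. The #rld-N marker plus the general sibling combinator (~) restricts the query to results that follow the previous page's separator. A toy cheerio illustration of that combinator (the markup is invented for the demo):

const cheerio = require('cheerio');

const $ = cheerio.load(`
    <div id="links">
        <div class="result"><div class="result__body">page 1 hit</div></div>
        <div id="rld-1" class="result result--sep"></div>
        <div class="result"><div class="result__body">page 2 hit</div></div>
    </div>`);

// only .result siblings that come after the #rld-1 marker are matched
$('#links #rld-1 ~ .result .result__body').each((i, el) => {
    console.log($(el).text()); // "page 2 hit"
});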
@@ -10,6 +10,7 @@ const debug = require('debug')('se-scraper:Scraper');
 
 module.exports = class Scraper {
     constructor(options = {}) {
+        debug('constructor');
         const {
             config = {},
             context = {},
@@ -49,7 +50,9 @@ module.exports = class Scraper {
         }
     }
 
-    async run({page, data}) {
+    async run({page, data, worker}) {
+
+        debug('worker=%o', worker, this.config.keywords);
 
         if (page) {
             this.page = page;
@@ -6,6 +6,7 @@ const _ = require('lodash');
 const { createLogger, format, transports } = require('winston');
 const { combine, timestamp, printf } = format;
 const debug = require('debug')('se-scraper:ScrapeManager');
+const { Cluster } = require('puppeteer-cluster');
 
 const UserAgent = require('user-agents');
 const google = require('./modules/google.js');
@@ -13,7 +14,7 @@ const bing = require('./modules/bing.js');
 const yandex = require('./modules/yandex.js');
 const infospace = require('./modules/infospace.js');
 const duckduckgo = require('./modules/duckduckgo.js');
-const { Cluster } = require('./puppeteer-cluster/dist/index.js');
+const CustomConcurrencyImpl = require('./concurrency-implementation');
 
 const MAX_ALLOWED_BROWSERS = 6;
 
@@ -185,6 +186,10 @@ class ScrapeManager {
             this.logger.info(`${this.config.proxies.length} proxies read from file.`);
         }
 
+        if (!this.config.proxies && this.config.use_proxies_only) {
+            throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
+        }
+
         debug('this.config=%O', this.config);
     }
 
@@ -215,92 +220,70 @@ class ScrapeManager {
 
         const chrome_flags = _.clone(this.config.chrome_flags);
 
-        if (this.config.random_user_agent) {
-            const userAgent = new UserAgent({ deviceCategory: 'desktop' });
-            this.config.user_agent = userAgent.toString();
-        }
-
-        if (this.config.user_agent) {
-            chrome_flags.push(
-                `--user-agent=${this.config.user_agent}`
-            )
-        }
-
-        var launch_args = {
-            args: chrome_flags,
-            headless: this.config.headless,
-            ignoreHTTPSErrors: true,
-        };
-
-        debug('Using the following puppeteer configuration launch_args=%O', launch_args);
-
         if (this.pluggable && this.pluggable.start_browser) {
-            launch_args.config = this.config;
-            this.browser = await this.pluggable.start_browser(launch_args);
+            this.browser = await this.pluggable.start_browser({
+                config: this.config,
+            });
            this.page = await this.browser.newPage();
         } else {
             // if no custom start_browser functionality was given
             // use puppeteer-cluster for scraping
 
-            this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
-            var perBrowserOptions = [];
-
-            // the first browser this.config with home IP
-            if (!this.config.use_proxies_only) {
-                perBrowserOptions.push(launch_args);
-            }
-
+            let proxies;
             // if we have at least one proxy, always use CONCURRENCY_BROWSER
             // and set maxConcurrency to this.config.proxies.length + 1
             // else use whatever this.configuration was passed
             if (this.config.proxies && this.config.proxies.length > 0) {
-                this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER;
-
                 // because we use real browsers, we ran out of memory on normal laptops
                 // when using more than maybe 5 or 6 browsers.
                 // therefore hardcode a limit here
+                // TODO not sure this what we want
                 this.numClusters = Math.min(
                     this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
                     MAX_ALLOWED_BROWSERS
                 );
+                proxies = _.clone(this.config.proxies);
 
-                this.logger.info(`Using ${this.numClusters} clusters.`);
-
-                this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
-
-                for (var proxy of this.config.proxies) {
-                    perBrowserOptions.push({
-                        headless: this.config.headless,
-                        ignoreHTTPSErrors: true,
-                        args: chrome_flags.concat(`--proxy-server=${proxy}`)
-                    })
+                // Insert a first config without proxy if use_proxy_only is false
+                if (this.config.use_proxies_only === false) {
+                    proxies.unshift(null);
                 }
-            }
 
-            // Give the per browser options each a random user agent when random user agent is set
-            while (perBrowserOptions.length < this.numClusters) {
-                const userAgent = new UserAgent();
-                perBrowserOptions.push({
+            } else {
+                this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
+                proxies = _.times(this.numClusters, null);
+            }
+
+            this.logger.info(`Using ${this.numClusters} clusters.`);
+
+            // Give the per browser options
+            const perBrowserOptions = _.map(proxies, (proxy) => {
+                const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
+                let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
+
+                if (proxy) {
+                    args = args.concat([`--proxy-server=${proxy}`]);
+                }
+
+                return {
                     headless: this.config.headless,
                     ignoreHTTPSErrors: true,
-                    args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`)
-                })
-            }
+                    args
+                };
+            });
+
+            debug('perBrowserOptions=%O', perBrowserOptions)
 
             this.cluster = await Cluster.launch({
                 monitor: this.config.puppeteer_cluster_config.monitor,
                 timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
-                concurrency: this.config.puppeteer_cluster_config.concurrency,
-                maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
-                puppeteerOptions: launch_args,
-                perBrowserOptions: perBrowserOptions,
-            });
-
-            this.cluster.on('taskerror', (err, data) => {
-                this.logger.error(`Error while scraping ${data}: ${err.message}`);
-                debug('Error during cluster task', err);
+                concurrency: CustomConcurrencyImpl,
+                maxConcurrency: this.numClusters,
+                puppeteerOptions: {
+                    perBrowserOptions: perBrowserOptions
+                }
             });
         }
     }
@@ -352,26 +335,21 @@ class ScrapeManager {
             chunks[k % this.numClusters].push(this.config.keywords[k]);
         }
 
-        let execPromises = [];
-        let scraperInstances = [];
-        for (var c = 0; c < chunks.length; c++) {
-            this.config.keywords = chunks[c];
+        debug('chunks=%o', chunks);
 
-            if (this.config.use_proxies_only) {
-                this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy
-            } else if(c > 0) {
-                this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address
-            }
+        let execPromises = [];
+        for (var c = 0; c < chunks.length; c++) {
+            const config = _.clone(this.config);
+            config.keywords = chunks[c];
 
             var obj = getScraper(this.config.search_engine, {
-                config: this.config,
+                config: config,
                 context: {},
                 pluggable: this.pluggable,
             });
 
             var boundMethod = obj.run.bind(obj);
             execPromises.push(this.cluster.execute({}, boundMethod));
-            scraperInstances.push(obj);
         }
 
         let promiseReturns = await Promise.all(execPromises);
 
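Two things change in scrape(): keywords are still dealt round-robin across the clusters, but each chunk now runs against its own _.clone of the config instead of mutating the shared object, so one chunk's keyword list can no longer leak into another's; per-cluster proxy selection moved into the per-browser launch options above. A standalone sketch of the distribution step, assuming lodash as in the source:

const _ = require('lodash');

const numClusters = 3;
const keywords = ['a', 'b', 'c', 'd', 'e'];

// one bucket per cluster, filled round-robin: [['a','d'], ['b','e'], ['c']]
const chunks = _.times(numClusters, () => []);
for (let k = 0; k < keywords.length; k++) {
    chunks[k % numClusters].push(keywords[k]);
}

// every scraper gets an independent config carrying just its chunk
const baseConfig = { search_engine: 'google', num_pages: 1 };
const configs = chunks.map((chunk) => Object.assign(_.clone(baseConfig), { keywords: chunk }));
console.log(configs);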
src/puppeteer-cluster (submodule, 1 deletion)
@@ -1 +0,0 @@
-Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8
test/html_output.js (new file, 101 lines)
@@ -0,0 +1,101 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('../');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here mount our fake engine in both http and https listen server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('html_output', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test html_output option
+         */
+        it('html_output single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test keyword'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                logger: testLogger,
+                html_output: true,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+            });
+            await scraper.start();
+            const { results } = await scraper.scrape(scrape_job);
+            await scraper.quit();
+
+            assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');
+
+        });
+
+    });
+
+});
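The test's setup promisifies three callback-style listen functions with Bluebird's context option; without it, proxy.listen and server.listen would be invoked with this detached from their objects. A minimal sketch of the same pattern in isolation:

const http = require('http');
const Promise = require('bluebird');

(async () => {
    const server = http.createServer((req, res) => res.end('ok'));

    // {context: server} keeps `this` bound to the server inside listen()
    const listen = Promise.promisify(server.listen, { context: server });
    await listen(3012);

    console.log('listening on 3012');
    server.close();
})();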
test/mocks/bing/index.html (new file, 23 lines)
File diff suppressed because one or more lines are too long

test/mocks/bing/test keyword_page1.html (new file, 42 lines)
File diff suppressed because one or more lines are too long

test/mocks/bing/test keyword_page2.html (new file, 42 lines)
File diff suppressed because one or more lines are too long

test/mocks/bing/test keyword_page3.html (new file, 40 lines)
File diff suppressed because one or more lines are too long
148
test/mocks/duckduckgo/index.html
Normal file
148
test/mocks/duckduckgo/index.html
Normal file
@ -0,0 +1,148 @@
<!DOCTYPE html>
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->

<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
<meta name="HandheldFriendly" content="true"/>

<link rel="canonical" href="https://duckduckgo.com/">

<link rel="stylesheet" href="/s1847.css" type="text/css">
<link rel="stylesheet" href="/o1847.css" type="text/css">

<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
<link rel="manifest" href="/manifest.json"/>

<meta name="twitter:card" content="summary">
<meta name="twitter:site" value="@duckduckgo">

<meta property="og:url" content="https://duckduckgo.com/" />
<meta property="og:site_name" content="DuckDuckGo" />
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">

<title>DuckDuckGo — Privacy, simplified.</title>
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />

<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">

</head>
<body id="pg-index" class="page-index body--home">
<script type="text/javascript">
    var settings_js_version = "/s2475.js",
        locale = "en_US";
</script>
<script type="text/javascript" src="/lib/l113.js"></script>
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
<script type="text/javascript" src="/util/u418.js"></script>
<script type="text/javascript" src="/d2727.js"></script>

<script type="text/javascript">
    DDG.page = new DDG.Pages.Home();
</script>

<div class="site-wrapper site-wrapper--home js-site-wrapper">

    <div class="header-wrap--home js-header-wrap">
        <div class="header--aside js-header-aside"></div>
        <div class="js-header-home-search header-wrap--home__search">
            <div class="logo-wrap--home">
                <a id="logo_homepage_link" class="logo_homepage" href="/about">
                    About DuckDuckGo
                    <span class="logo_homepage__tt">Duck it!</span>
                </a>
            </div>
            <form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
                <input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
                <input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
                <input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
                <div class="search__hidden js-search-hidden"></div>
            </form>
        </div>
    </div>

    <div id="" class="content-wrap--home">
        <div id="content_homepage" class="content--home">
            <div class="cw--c">
                <div class="logo-wrap--home">
                    <a id="logo_homepage_link" class="logo_homepage" href="/about">
                        About DuckDuckGo
                        <span class="logo_homepage__tt">Duck it!</span>
                    </a>
                </div>

                <div class="search-wrap--home">
                    <form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
                        <input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
                        <input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
                        <input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
                        <div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
                    </form>
                </div>

                <!-- en_US All Settings -->
                <noscript>
                    <div class="tag-home">
                        <div class="tag-home__wrapper">
                            <div class="tag-home__item">
                                The search engine that doesn't track you.
                                <span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
                            </div>
                        </div>
                    </div>
                </noscript>
                <div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
                <div id="error_homepage"></div>

            </div> <!-- cw -->
        </div> <!-- content_homepage //-->
    </div> <!-- content_wrapper_homepage //-->
    <div id="footer_homepage" class="foot-home js-foot-home"></div>

    <script type="text/javascript">
        {function seterr(str) {
            var error=document.getElementById('error_homepage');
            error.innerHTML=str;
            $(error).css('display','block');
        }
        var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' Please try again');};

        if (kurl) {
            document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
        }
    </script>

</div> <!-- site-wrapper -->
</body>
</html>
3 test/mocks/duckduckgo/test keyword_page1.html Normal file
File diff suppressed because one or more lines are too long
3 test/mocks/duckduckgo/test keyword_page2.html Normal file
File diff suppressed because one or more lines are too long
3 test/mocks/duckduckgo/test keyword_page3.html Normal file
File diff suppressed because one or more lines are too long
358 test/mocks/google/index.html Normal file
File diff suppressed because one or more lines are too long
209 test/mocks/google/test keyword_page1.html Normal file
File diff suppressed because one or more lines are too long
206 test/mocks/google/test keyword_page2.html Normal file
File diff suppressed because one or more lines are too long
191 test/mocks/google/test keyword_page3.html Normal file
File diff suppressed because one or more lines are too long
123 test/modules/bing.js Normal file
@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
    debug('q=%s', req.query.q);
    const pageNumber = Math.round((req.query.first || 0) / 10) + 1;
    res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', { extensions: ['html'] }));

describe('Module Bing', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
            return callback();
        });

        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
        await Promise.promisify(httpServer.listen, { context: httpServer })(httpPort);
        await Promise.promisify(httpsServer.listen, { context: httpsServer })(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        proxy.close();
        httpsServer.close();
        httpServer.close();
    });

    let browser;
    let page;
    beforeEach(async function(){
        debug('Start a new browser');
        browser = await puppeteer.launch({
            //dumpio: true,
            //headless: false,
            ignoreHTTPSErrors: true,
            args: [ '--proxy-server=http://localhost:' + proxyPort ]
        });
        debug('Open a fresh page');
        page = await browser.newPage();
    });

    afterEach(async function(){
        await browser.close();
    });

    const testLogger = createLogger({
        transports: [
            new transports.Console({
                level: 'error'
            })
        ]
    });

    it('one keyword one page', function(){
        const bingScraper = new BingScraper({
            config: {
                search_engine_name: 'bing',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
            }
        });
        bingScraper.STANDARD_TIMEOUT = 500;
        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 1, 'Must do one request');
            assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
        });
    });

    it('one keyword 3 pages', function () {
        const bingScraper = new BingScraper({
            config: {
                search_engine_name: 'bing',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
                num_pages: 3,
            }
        });
        bingScraper.STANDARD_TIMEOUT = 500;
        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 3, 'Must do three requests');
            assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
            assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
        });
    });

});
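A note on the `Promise.promisify(fn, { context })` calls in the `before()` hook above: `server.listen()` dispatches through `this`, so promisifying it without binding the server instance would lose the receiver. A minimal standalone sketch of the pattern (port 0 is a hypothetical choice that lets the OS pick a free port):

```js
// Sketch of bluebird's promisify-with-context pattern used in the hooks above.
// server.listen() relies on `this` being the server, so { context } binds it.
const Promise = require('bluebird');
const http = require('http');

const server = http.createServer((req, res) => res.end('ok'));
const listenAsync = Promise.promisify(server.listen, { context: server });

listenAsync(0).then(() => {
    console.log('listening on port', server.address().port);
});
```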
140 test/modules/duckduckgo.js Normal file
@ -0,0 +1,140 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.use(express.urlencoded({ extended: true }));
fakeSearchEngine.get('/', (req, res, next) => {
    if (!req.query.q) {
        return next();
    }
    debug('q=%s page=%d', req.query.q, req.query.page);
    const pageNumber = req.query.page;
    res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.post('/html', (req, res) => {
    debug('body=%o', req.body);
    const pageNumber = 1;
    res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/duckduckgo', { extensions: ['html'] }));

describe('Module DuckDuckGo', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('proxy askedHost=%s method=%s url=%s toPort=%s',
                ctx.clientToProxyRequest.headers.host,
                ctx.clientToProxyRequest.method,
                ctx.clientToProxyRequest.url,
                ctx.proxyToServerRequestOptions.port
            );
            return callback();
        });

        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
        await Promise.promisify(httpServer.listen, { context: httpServer })(httpPort);
        await Promise.promisify(httpsServer.listen, { context: httpsServer })(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        proxy.close();
        httpsServer.close();
        httpServer.close();
    });

    let browser;
    let page;
    beforeEach(async function(){
        debug('Start a new browser');
        browser = await puppeteer.launch({
            //dumpio: true,
            //headless: false,
            ignoreHTTPSErrors: true,
            args: [ '--proxy-server=http://localhost:' + proxyPort ]
        });
        debug('Open a fresh page');
        page = await browser.newPage();
    });

    afterEach(async function(){
        await browser.close();
    });

    const testLogger = createLogger({
        transports: [
            new transports.Console({
                level: 'error'
            })
        ]
    });

    it('one keyword one page', function(){
        const duckduckgoScraper = new DuckduckgoScraper({
            config: {
                search_engine_name: 'duckduckgo',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
            }
        });
        duckduckgoScraper.STANDARD_TIMEOUT = 1000;
        return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 1, 'Must do one request');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
        });
    });

    it('one keyword 3 pages', function () {
        this.timeout(4000);
        const duckduckgoScraper = new DuckduckgoScraper({
            config: {
                search_engine_name: 'duckduckgo',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
                num_pages: 3,
            }
        });
        duckduckgoScraper.STANDARD_TIMEOUT = 1000;
        return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 3, 'Must do three requests');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
            debug('results page 1 %O', results['test keyword']['1'].results);
            debug('results page 2 %O', results['test keyword']['2'].results);
            assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
            assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
            assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
        });
    });

});
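The `GET /` handler above depends on Express middleware fall-through: when the request carries no `?q=` parameter, `next()` hands it down to `express.static`, which serves the saved `index.html` as the directory index. A minimal sketch of that routing idea:

```js
// Fall-through routing as used by the fake DuckDuckGo engine above.
const express = require('express');
const app = express();

app.get('/', (req, res, next) => {
    if (!req.query.q) return next(); // no query -> fall through to static
    res.send('mock SERP for ' + req.query.q);
});

// Serves index.html for "/" once next() was called; extensions: ['html']
// also lets a request for "/foo" resolve to "foo.html" on disk.
app.use(express.static('test/mocks/duckduckgo', { extensions: ['html'] }));

app.listen(3000);
```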
123 test/modules/google.js Normal file
@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', { extensions: ['html'] }));

describe('Module Google', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
            return callback();
        });

        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
        await Promise.promisify(httpServer.listen, { context: httpServer })(httpPort);
        await Promise.promisify(httpsServer.listen, { context: httpsServer })(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        proxy.close();
        httpsServer.close();
        httpServer.close();
    });

    let browser;
    let page;
    beforeEach(async function(){
        debug('Start a new browser');
        browser = await puppeteer.launch({
            //dumpio: true,
            //headless: false,
            ignoreHTTPSErrors: true,
            args: [ '--proxy-server=http://localhost:' + proxyPort ]
        });
        debug('Open a fresh page');
        page = await browser.newPage();
    });

    afterEach(async function(){
        await browser.close();
    });

    const testLogger = createLogger({
        transports: [
            new transports.Console({
                level: 'error'
            })
        ]
    });

    it('one keyword one page', function(){
        const googleScraper = new GoogleScraper({
            config: {
                search_engine_name: 'google',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
            }
        });
        googleScraper.STANDARD_TIMEOUT = 500;
        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 1, 'Must do one request');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
        });
    });

    it('one keyword 3 pages', function () {
        const googleScraper = new GoogleScraper({
            config: {
                search_engine_name: 'google',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger: testLogger,
                scrape_from_file: '',
                num_pages: 3,
            }
        });
        googleScraper.STANDARD_TIMEOUT = 500;
        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 3, 'Must do three requests');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
            assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
        });
    });

});
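Both fake engines derive the mock page number from the engine's result offset (`start` for Google, `first` for Bing). Query parameters arrive as strings, so the arithmetic leans on JavaScript's numeric coercion; a quick sketch of the Google mapping used above:

```js
// Maps Google's `start` offset (a string, or undefined) to a mock page number.
const pageFromStart = (start) => ((start / 10) || 0) + 1;

pageFromStart(undefined); // 1  (NaN || 0 -> 0, then +1)
pageFromStart('10');      // 2  -> serves 'test keyword_page2.html'
pageFromStart('20');      // 3  -> serves 'test keyword_page3.html'
```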
161 test/proxy.js Normal file
@ -0,0 +1,161 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
    debug('fake-search-engine req.hostname=%s', req.hostname);
    //debug('req to', req.socket.localAddress, req.socket.localPort);
    res.send(req.hostname);
});

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
        await Promise.promisify(httpServer.listen, { context: httpServer })(httpPort);
        await Promise.promisify(httpsServer.listen, { context: httpsServer })(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('proxies', function(){

        class MockScraperTestProxy extends Scraper {

            async load_start_page(){
                return true;
            }

            async search_keyword(){
                await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
            }

            async parse_async(){
                const bodyHandle = await this.page.$('body');
                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
            }
        }

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Jobs will be executed 2 by 2, alternating between the proxy and the direct connection.
         * THIS TEST NEEDS test.local mapped to 127.0.0.1 in /etc/hosts, because Chrome bypasses
         * the proxy for localhost even when one is set.
         */
        it('one proxy given, use_proxies_only=false', async function () {

            const scrape_job = {
                search_engine: MockScraperTestProxy,
                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                proxies: ['http://localhost:' + proxyPort],
                // default is use_proxies_only: false,
                logger: testLogger,
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['news']['1'], 'test.local');
            assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['i work too much']['1'], 'test.local');
            assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['javascript is hard']['1'], 'test.local');

            await scraper.quit();
        });

        /**
         * Jobs will be executed 1 by 1 through the proxy.
         */
        it('one proxy given, use_proxies_only=true', async function () {

            const scrape_job = {
                search_engine: MockScraperTestProxy,
                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
                logger: testLogger,
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
            assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');

            await scraper.quit();
        });

        it('zero proxy given, use_proxies_only=true', async function () {

            const scrape_job = {
                search_engine: MockScraperTestProxy,
                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
            };

            await assert.rejects(async () => {
                var scraper = new se_scraper.ScrapeManager({
                    throw_on_detection: true,
                    use_proxies_only: true,
                    logger: testLogger,
                });
                await scraper.start();
                const { results } = await scraper.scrape(scrape_job);
                await scraper.quit();
            }, /Must provide at least one proxy in proxies if you enable use_proxies_only/);

        });

    });

});
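The `/etc/hosts` requirement mentioned in the comment above comes from Chromium's implicit proxy-bypass rules for loopback hosts. As an aside, and not what the test above does: Chromium also accepts a `--proxy-bypass-list=<-loopback>` argument that removes those implicit rules, which should let loopback requests go through the proxy without a hosts entry. A hedged sketch with hypothetical ports:

```js
// Hypothetical alternative to the test.local /etc/hosts entry:
// '<-loopback>' strips Chromium's implicit bypass rules, so even
// localhost/127.0.0.1 requests are sent through the proxy.
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({
        args: [
            '--proxy-server=http://localhost:3014',
            '--proxy-bypass-list=<-loopback>',
        ],
    });
    const page = await browser.newPage();
    await page.goto('http://localhost:3012/test-proxy'); // now proxied
    await browser.close();
})();
```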
@ -1,15 +0,0 @@
## Test with static HTML

Dynamic testing of se-scraper takes too much time.

Save some HTML and initialize se-scraper by loading the search results from disk (see the sketch below).

### Disadvantage

Static HTML gets outdated after some time.

### Advantages

1. Lets us test corner cases that are otherwise easily missed.
2. Dynamic testing is not reliable, since search engines do not always return the same results for the same query.
3. As said, much faster.
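For reference, the static approach described in this removed README boils down to pointing `scrape_from_file` at a saved SERP, exactly as the removed static tests below do. A minimal sketch built from those tests:

```js
// Minimal sketch of static-HTML testing: load a saved SERP from disk
// instead of hitting the live engine (paths as in the removed tests below).
const path = require('path');
const se_scraper = require('./../../index.js');

(async () => {
    const scraper = new se_scraper.ScrapeManager({ headless: true });
    await scraper.start();
    const response = await scraper.scrape({
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    });
    console.log(response.metadata.num_requests); // 1
    await scraper.quit();
})();
```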
@ -1,222 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function bing_ads() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    bing_search_with_ads( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['best cloud services'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');

    bing_search_with_ads2( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['car tires cheap'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');

    bing_search_with_ads3( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['service auto garage'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');

    bing_search_with_ads4( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function bing_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '1’100’000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 7, 'there are at least 7 ads');

            assert.isAtLeast(obj.right_side_ads.length, 5, 'there are at least 5 right side ads');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '44’300’000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 7, 'there are at least 7 ads');

            assert.isAtLeast(obj.right_side_ads.length, 5, 'there are at least 5 right side ads');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads3(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there are at least 3 ads');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function bing_search_with_ads4(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there are at least 3 ads');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be integer');
    }

    for (let res of obj.ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }

    for (let res of obj.right_side_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }
}

describe('Bing', function(){
    this.timeout(15000);
    it('static bing searches with ads', bing_ads);
});
@ -1,173 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
const cheerio = require('cheerio');

async function test_html_output() {
    let config = {
        debug_level: 1,
        headless: true,
        html_output: true,
        // whether to strip JS and CSS from the html_output
        // has only an effect if `html_output` is true
        clean_html_output: true,
        // remove all data images from the html
        clean_data_images: true,
        // test compression
        compress: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    var response = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleaned = await scraper.scrape(scrape_config);

    test(response, response_no_cleaned, 'bing');

    scrape_config.search_engine = 'google';
    scrape_config.keywords = ['rückspiegel schwarz'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    var responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');

    scrape_config.keywords = ['cloud services'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    var responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');

    await scraper.quit();
}

function test(response, response_no_cleaned, se='google') {
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            let obj = response.results[query][page_number];
            let obj_no_cleaned = response_no_cleaned.results[query][page_number];

            console.log('html length of non-cleaned SERP: ' + obj_no_cleaned.html.length);
            console.log('html length of cleaned SERP: ' + obj.html.length);

            assert.isOk(obj.html, 'Html must be ok!');
            assert.isAtLeast(obj.html.length, 100, 'html must be a reasonably long string');

            assert.isOk(obj_no_cleaned.html, 'Html must be ok!');
            assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a reasonably long string');

            assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');

            // test that we can parse the html of both the cleaned and non-cleaned versions
            // with cheerio and that serp results are roughly the same

            const cleaned$ = cheerio.load(obj.html);
            const no_cleaned$ = cheerio.load(obj_no_cleaned.html);

            var resCleaned = parseResults(cleaned$, se);
            var resNoCleaned = parseResults(no_cleaned$, se);

            assert.equal(resCleaned.length, resNoCleaned.length);
            assert.equal(resCleaned.length, obj.results.length);
            assert.equal(resNoCleaned.length, obj.results.length);

            // unset the rank on every result before comparing; the callbacks
            // must return the object itself, not the assignment value
            resCleaned = resCleaned.map((el) => { el.rank = undefined; return el; });
            resNoCleaned = resNoCleaned.map((el) => { el.rank = undefined; return el; });
            obj.results = obj.results.map((el) => { el.rank = undefined; return el; });

            assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
            assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results');
            assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results');
        }
    }
}

function parseResults(s$, se) {

    var results = [];

    if (se === 'google') {
        s$('#center_col .g').each((i, link) => {
            results.push({
                link: s$(link).find('.r a').attr('href'),
                title: s$(link).find('.r a').text(),
                snippet: s$(link).find('span.st').text(),
                visible_link: s$(link).find('.r cite').text(),
                date: s$(link).find('span.f').text() || '',
            })
        });

    } else if (se === 'bing') {
        s$('#b_content #b_results .b_algo').each((i, link) => {
            results.push({
                link: s$(link).find('h2 a').attr('href'),
                title: s$(link).find('h2').text(),
                snippet: s$(link).find('.b_caption p').text(),
                visible_link: s$(link).find('cite').text(),
            })
        });
    } else {
        throw new Error('no such search engine');
    }

    results = clean_results(results, ['title', 'link', 'snippet']);
    return results;
}

function clean_results(results, attributes) {
    const cleaned = [];
    var rank = 1;
    for (var res of results) {
        let goodboy = true;
        for (var attr of attributes) {
            if (!res[attr] || !res[attr].trim()) {
                goodboy = false;
                break;
            }
        }
        if (goodboy) {
            res.rank = rank++;
            cleaned.push(res);
        }
    }
    return cleaned;
}

describe('html output', function(){
    this.timeout(15000);
    it('static html output test', test_html_output);
});
@ -1,24 +0,0 @@
'use strict';
const zlib = require('zlib');
const fs = require('fs');
const path = require('path');

var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];

for (var file of files) {
    var html = fs.readFileSync(path.resolve(__dirname, './html/' + file));

    var compressed = zlib.gzipSync(html);
    var deflated = zlib.deflateSync(html);

    var compressed_encoded = compressed.toString('base64');
    var deflated_encoded = deflated.toString('base64');

    console.log(file);
    console.log('Normal length: ' + html.length / 1000);
    console.log('GZIP Compressed length: ' + compressed.length / 1000);
    console.log('Deflate Compressed length: ' + deflated.length / 1000);
    console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length / 1000);
    console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length / 1000);
    console.log('------\n');
}
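The "Encoded" lengths printed by this script are larger than the raw compressed lengths because base64 maps every 3 bytes to 4 ASCII characters, roughly a 33% inflation. A quick check:

```js
// Base64 inflates a buffer by ~4/3 (3 bytes -> 4 chars, plus padding).
const buf = Buffer.alloc(3000);
console.log(buf.toString('base64').length / buf.length); // 1.333...
```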
@ -1,99 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function duckduckgo() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    duckduckgo_normal( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

function duckduckgo_normal(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be integer');
    }

    for (let res of obj.ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }
}

describe('Duckduckgo', function(){
    this.timeout(10000);
    it('static duckduckgo search', duckduckgo);
});
@ -1,410 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ['rückspiegel schwarz'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
google_search_with_products( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
|
||||
scrape_config.keywords = ['autoreifen mercedes c-klasse'];
|
||||
|
||||
google_search_with_products2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
|
||||
scrape_config.keywords = ['kaffeemaschine kaufen'];
|
||||
|
||||
google_places( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
|
||||
scrape_config.keywords = ['MODEL MARKET SW18 4ES'];
|
||||
|
||||
right_side_info_text( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
|
||||
scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];
|
||||
|
||||
right_side_info_text2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
|
||||
scrape_config.keywords = ['car tires for sale'];
|
||||
|
||||
google_places_and_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
|
||||
scrape_config.keywords = ['bmw felgen'];
|
||||
|
||||
google_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function google_search_with_products(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '1’780’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function google_search_with_products2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 4, 'there are 4 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function google_places(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 9 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.equal(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
assert.equal(obj.places.length, 3, 'there are 3 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function right_side_info_text(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');

            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function right_side_info_text2(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_places_and_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 13, 'there are at least 13 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.places.length, 2, 'there are 2 places');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function google_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 3, 'there are 3 top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are at least 0 top products');
            assert.equal(obj.right_products.length, 9, 'there are 9 right products');
            assert.equal(obj.places.length, 0, 'there are 0 places');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.top_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.bottom_ads) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.top_products) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
    }

    for (let res of obj.right_products) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
    }

    for (let res of obj.places) {
        assert.isOk(res.heading, 'heading must be ok');
        assert.typeOf(res.heading, 'string', 'heading must be string');
        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');

        assert.isOk(res.rating, 'rating must be ok');
        assert.typeOf(res.rating, 'string', 'rating must be string');
        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');

        assert.isOk(res.contact, 'contact must be ok');
        assert.typeOf(res.contact, 'string', 'contact must be string');
        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');

        assert.typeOf(res.hours, 'string', 'hours must be string');
        if (res.hours) {
            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}
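
// The isOk/typeOf/isAtLeast triplet above repeats for every string field; a
// minimal helper sketch that would collapse it (the name assert_string_field
// is illustrative, not part of the original file):
// function assert_string_field(res, field, min_len) {
//     assert.isOk(res[field], `${field} must be ok`);
//     assert.typeOf(res[field], 'string', `${field} must be string`);
//     assert.isAtLeast(res[field].length, min_len, `${field} must have at least ${min_len} chars`);
// }
// Usage inside the loops above, e.g.: assert_string_field(res, 'link', 5);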

describe('Google', function() {
    this.timeout(25000);
    it('static google searches with products, ads and places', normal_search_test);
});
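
// These suites are mocha specs over static HTML fixtures; a hedged sketch of
// running just this file (the path is an assumption, not shown in this diff):
//   npx mocha test/static_tests/google.js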
@ -1,213 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: ['in.linkedin.com/in/altanai'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    google_test_title( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function google_test_title(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '7.600', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.top_ads.length, 0, 'there are at least 0 top ads');
            assert.isAtLeast(obj.bottom_ads.length, 0, 'there are at least 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are at least 0 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);

            assert.equal(obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn');
            assert.equal(obj.results[1].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn');
            assert.equal(obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn');
            assert.equal(obj.results[4].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');

            assert.equal(obj.results[0].date, '27.07.2016');
            assert.equal(obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');

            assert.equal(obj.results[2].date, '27.07.2016');
        }
    }
}

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }

    for (let res of obj.top_ads) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.bottom_ads) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.typeOf(res.links, 'array', 'links must be array');
    }

    for (let res of obj.top_products) {

        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
    }

    for (let res of obj.right_products) {
        assert.isOk(res.tracking_link, 'tracking_link must be ok');
        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.price, 'price must be ok');
        assert.typeOf(res.price, 'string', 'price must be string');
        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.vendor_link, 'vendor_link must be ok');
        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
    }

    for (let res of obj.places) {
        assert.isOk(res.heading, 'heading must be ok');
        assert.typeOf(res.heading, 'string', 'heading must be string');
        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');

        assert.isOk(res.rating, 'rating must be ok');
        assert.typeOf(res.rating, 'string', 'rating must be string');
        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');

        assert.isOk(res.contact, 'contact must be ok');
        assert.typeOf(res.contact, 'string', 'contact must be string');
        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');

        assert.typeOf(res.hours, 'string', 'hours must be string');
        if (res.hours) {
            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}

describe('Google2', function(){
    this.timeout(10000);
    it('static google searches testing various details', normal_search_test);
});
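
// Both Google suites above assert against saved SERP HTML loaded through
// scrape_from_file, which keeps the expectations deterministic. A hedged
// sketch of extending the suite with one more fixture case (the file name
// and expectations are assumptions, not part of this diff):
// scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google8.html');
// google_test_title( await scraper.scrape(scrape_config) );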
@ -1,152 +0,0 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');

async function yandex_ads() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    let scrape_config = {
        search_engine: 'yandex',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    yandex_search_with_ads( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['car tires cheap'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');

    yandex_search_with_ads2( await scraper.scrape(scrape_config) );

    scrape_config.keywords = ['купить деревянные окна'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');

    yandex_search_with_ads3( await scraper.scrape(scrape_config) );

    await scraper.quit();
}

// we test with a callback function to our handler
function yandex_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '2 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function yandex_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '5 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}

function yandex_search_with_ads3(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            // console.dir(obj.results, {depth: null, colors: true});

            assert.containsAllKeys(obj, ['results', 'time', 'num_results'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            // at least 4 ads
            let cnt = 0;
            obj.results.forEach((res) => {
                if (res.is_ad) {
                    cnt++;
                }
            });

            assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');

            confirm_results_ok(obj);
        }
    }
}
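
// The ad-counting loop in yandex_search_with_ads3 could equivalently be
// written with Array.prototype.filter; a minimal sketch of the same check,
// not part of the original file:
// const cnt = obj.results.filter((res) => res.is_ad).length;
// assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');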

function confirm_results_ok(obj) {

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

        assert.isOk(res.visible_link, 'visible_link must be ok');
        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

        assert.isOk(res.title, 'title must be ok');
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

        assert.isOk(res.snippet, 'snippet must be ok');
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

        assert.isNumber(res.rank, 'rank must be a number');
    }
}

describe('Yandex', function(){
    this.timeout(10000);
    it('static yandex searches with ads', yandex_ads);
});
@ -1,141 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['iphone', 'clock'];

async function normal_search_test() {
    let config = {
        compress: false,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'amazon',
        num_pages: 1,
        keywords: normal_search_keywords,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {
        let total_rank = 1;
        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.seller, 'seller must be ok');
                assert.typeOf(res.seller, 'string', 'seller must be string');
                assert.isAtLeast(res.seller.length, 5, 'seller must have at least 5 chars');

                assert.isOk(res.stars, 'stars must be ok');
                assert.typeOf(res.stars, 'string', 'stars must be string');
                assert.isAtLeast(res.stars.length, 5, 'stars must have at least 5 chars');
                assert.include(res.stars, ' out of ', 'stars must include " out of "');

                assert.isOk(res.num_reviews, 'num_reviews must be ok');
                assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
                assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 char');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'amazon',
        num_pages: 1,
        keywords: keywords_no_results,
    };

    console.log('no_results_test()');
    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

describe('Amazon', function(){
    this.timeout(30000);
    it('normal search test', normal_search_test);
    it('no results test', no_results_test);
});
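
// The stars assertions above require the ' out of ' substring; a hedged
// sketch of pulling a numeric rating out of such a string (the exact Amazon
// format is an assumption, not confirmed by this diff):
// function parse_stars(stars) {
//     return parseFloat(stars.split(' out of ')[0]); // e.g. '4.5 out of 5 stars' -> 4.5
// }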
@ -1,87 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['mouse', 'cat'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    };

    let scrape_config = {
        search_engine: 'baidu',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 4);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');

            assert.equal(obj.no_results, false, 'no results should be false');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

describe('Baidu', function(){
    this.timeout(30000);
    it('normal search test', normal_search_test);
});
@ -1,271 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug_level: 1,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: normal_search_keywords,
        num_pages: 3,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 6);

    for (let query in response.results) {
        let total_rank = 1;
        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                if (res.snippet) {
                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
                }

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug_level: 1,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: keywords_no_results,
        num_pages: 1,
    };

    console.log('no_results_test()');
    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function test_case_no_results(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no results should be true');
            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('effective_query_test()');
    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            // the effective query must be different from the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}


const ads_keywords = ['cloud services', 'buy shoes'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'bing',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            for (let res of obj.ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                // assert.isOk(res.link, 'link must be ok');
                // assert.typeOf(res.link, 'string', 'link must be string');
                // assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }
        }
    }
}

describe('Bing', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('no results', no_results_test);
    it('effective query', effective_query_test);
    it('finds ads', ads_test);
});
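
// Across these suites, the asserted num_requests equals
// keywords.length * num_pages (e.g. 2 keywords * 3 pages = 6 in
// normal_search_test_case above); a hedged helper sketch, not part of the
// original files:
// function expected_requests(scrape_config) {
//     return scrape_config.keywords.length * scrape_config.num_pages;
// }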
@ -1,192 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: true,
        random_user_agent: true,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: effective_query_keywords,
        num_pages: 1,
    };

    console.log('effective_query_test()');
    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function test_case_effective_query(response) {
    assert.equal(response.metadata.num_requests, 1);

    let results = response.results;
    for (let query in response.results) {

        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

            // the effective query must be different from the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
        }
    }
}

const ads_keywords = ['cloud services', 'buy shoes'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'duckduckgo',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');

            for (let res of obj.ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }
        }
    }
}


describe('Duckduckgo', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('effective query', effective_query_test);
    it('finds ads', ads_test);
});
@ -1,424 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        compress: false,
        debug_level: 1,
        keyword_file: '',
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: normal_search_keywords,
        num_pages: 3,
    };

    console.log('normal_search_test()');
    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// we test with a callback function to our handler
function normal_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 6);

    for (let query in response.results) {
        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be a number');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
|
||||
|
||||
async function no_results_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: keywords_no_results,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('no_results_test()');
|
||||
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_no_results(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
|
||||
assert.equal(obj.no_results, true, 'no results should be true');
|
||||
assert.isEmpty(obj.num_results, 'num_results should be a empty string');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const effective_query_keywords = ['mount evverrest'];
|
||||
|
||||
async function effective_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: effective_query_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('effective_query_test()');
|
||||
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_effective_query(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
// effective query must be different to the original keyword
|
||||
assert.isOk(obj.effective_query, 'effective query must be ok');
|
||||
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
|
||||
assert(obj.effective_query !== query, 'effective query must be different from keyword');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 8 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function html_output_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 3,
|
||||
html_output: true,
|
||||
};
|
||||
|
||||
let output = await se_scraper.scrape(config, scrape_config);
|
||||
normal_search_test_case( output );
|
||||
check_html_output_test_case( output );
|
||||
}
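
// For illustration only (an assumption drawn from the checks below, not spelled
// out in the original test): with html_output enabled, response.html_output
// mirrors response.results, keyed by keyword and then by page number, with each
// leaf holding the raw page HTML, roughly:
//
// response.html_output = {
//     'apple tree': { '1': '<!DOCTYPE html><html...', /* ... */ },
//     'weather tomorrow': { '1': '<!DOCTYPE html><html...', /* ... */ },
// };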
function check_html_output_test_case( response ) {
    for (let query in response.html_output) {

        assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.html_output[query]) {
            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
            assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
        }
    }
}

const ads_keywords = ['cloud services', 'auto kaufen'];

async function ads_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false, // don't try to trick Google when scraping ads
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');

            for (let res of obj.top_ads) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.typeOf(res.links, 'array', 'links must be array');
            }

            for (let res of obj.bottom_ads) {
                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.typeOf(res.links, 'array', 'links must be array');
            }
        }
    }
}


const product_keywords = ['autoreifen bmw'];

async function products_test() {
    let config = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: false, // don't try to trick Google when scraping ads
    };

    let scrape_config = {
        search_engine: 'google',
        keywords: product_keywords,
        num_pages: 1,
    };

    console.log('products_test()');
    test_case_products_test( await se_scraper.scrape(config, scrape_config) );
}

function test_case_products_test(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        assert.containsAllKeys(response.results, product_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');

            for (let res of obj.top_products) {

                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.vendor_link, 'vendor_link must be ok');
                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
            }

            for (let res of obj.right_products) {
                assert.isOk(res.tracking_link, 'tracking_link must be ok');
                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.price, 'price must be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.vendor_link, 'vendor_link must be ok');
                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
            }
        }
    }
}

describe('Google', function(){
    this.timeout(30000);
    it('normal search', normal_search_test);
    it('no results', no_results_test);
    it('effective query', effective_query_test);
    it('html output query', html_output_query_test);
    it('ads', ads_test);
    it('products test', products_test);
});
@ -1,80 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple', 'rain'];

async function normal_image_search_test() {
    let config = {
        compress: false,
        debug_level: 0,
        headless: true,
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    let scrape_config = {
        search_engine: 'google_image',
        keywords: normal_search_keywords,
        num_pages: 2,
    };

    console.log('normal_image_search_test()');
    normal_image_search_test_case( await se_scraper.scrape(config, scrape_config) );
}

// verify the response object returned by the scraper
function normal_image_search_test_case(response) {
    assert.equal(response.metadata.num_requests, 2);

    for (let query in response.results) {

        let total_rank = 1;

        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

        for (let page_number in response.results[query]) {

            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
            assert.equal(obj.no_results, false, 'no_results should be false');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.clean_link, 'clean_link must be ok');
                assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
                assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}

describe('Google Image', function(){
    this.timeout(30000);
    it('normal image search test', normal_image_search_test);
});
@ -1,91 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

const normal_search_keywords = ['apple juice'];

async function queryargs_search_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: true,
        verbose: true,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: true,
        output_file: '',
        block_assets: true,
        // use engine-specific search parameters for the various search engines
        google_settings: {
            google_domain: 'google.com',
            gl: 'fr', // The gl parameter determines the Google country to use for the query.
            hl: 'fr', // The hl parameter determines the Google UI language of the results.
            start: 30, // Determines the results offset to use, defaults to 0.
            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
    };
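
    // For illustration only (an assumption, not verified against the Google
    // scraper module): these google_settings should translate into URL query
    // parameters roughly like
    //   https://www.google.com/search?q=apple+juice&gl=fr&hl=fr&start=30&num=100
    // which is why the test case below expects ~100 results per page.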

    console.log('queryargs_search_test()');
    await se_scraper.scrape(config, queryargs_search_test_case);
}

// we test with a callback function to our handler
function queryargs_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.metadata.num_requests, 2);

        for (let query in response.results) {
            let total_rank = 1;

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
                assert.equal(obj.no_results, false, 'no_results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.visible_link, 'visible_link must be ok');
                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

describe('Google with query arguments', function(){
    this.timeout(30000);
    it('query args search test', queryargs_search_test);
});
@ -1,217 +0,0 @@
'use strict';
const se_scraper = require('./../index.js');
const assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const quote_search_keywords = ['MSFT', 'AAPL'];

async function reuters_search_test() {
    let config = {
        search_engine: 'reuters',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('reuters_search_test()');
    await se_scraper.scrape(config, reuters_search_test_case);
}

// we test with a callback function to our handler
function reuters_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

async function cnbc_search_test() {
    let config = {
        search_engine: 'cnbc',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('cnbc_search_test()');
    await se_scraper.scrape(config, cnbc_search_test_case);
}

// we test with a callback function to our handler
function cnbc_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

const marketwatch_search_keywords = ['MSFT'];

async function marketwatch_search_test() {
    let config = {
        search_engine: 'marketwatch',
        compress: false,
        debug: false,
        verbose: false,
        keywords: marketwatch_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('marketwatch_search_test()');
    await se_scraper.scrape(config, marketwatch_search_test_case);
}

// we test with a callback function to our handler
function marketwatch_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.author, 'author must be ok');
                    assert.typeOf(res.author, 'string', 'author must be string');
                    assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}


describe('Ticker', function(){
    this.timeout(30000);
    it('Reuters search test', reuters_search_test);
    it('CNBC search test', cnbc_search_test);
    it('Marketwatch search test', marketwatch_search_test);
});
144
test/user_agent.js
Normal file
@ -0,0 +1,144 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-user_agent', (req, res) => {
    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
    res.send(req.headers['user-agent']);
});
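
// The fake engine simply echoes the request's User-Agent header back as the
// response body, so the tests below can assert on the UA string the browser
// actually sent.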

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // mount our fake search engine on both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('user_agent', function(){

        class MockScraperTestUserAgent extends Scraper {

            async load_start_page(){
                return true;
            }

            async search_keyword(){
                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
            }

            async parse_async(){
                const bodyHandle = await this.page.$('body');
                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
            }
        }
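
        // A minimal sketch of the pattern used here: overriding load_start_page,
        // search_keyword and parse_async is all that is needed to plug a custom
        // engine into ScrapeManager (it is passed below via the search_engine
        // option), so no real search engine is ever contacted and each "result"
        // is just the User-Agent string echoed by the fake server.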

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the user_agent option
         */
        it('fixed user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['javascript is hard'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                user_agent: 'THIS IS A USERAGENT 42.0'
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');

            await scraper.quit();
        });

        /**
         * Test the random_user_agent option
         * TODO the generated user_agent should be different for each keyword
         * TODO this test will sometimes fail because the user_agent is not very random :-(
         */
        it('random_user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['news'],
            };

            const NUMBER_OF_EXEC = 10;

            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
                const scraper = new se_scraper.ScrapeManager({
                    throw_on_detection: true,
                    logger: testLogger,
                    random_user_agent: true,
                });
                await scraper.start();
                const { results: { news } } = await scraper.scrape(scrape_job);
                await scraper.quit();
                return news['1'];
            });

            uaList.forEach((userAgent) => {
                const uaParsed = UAParser(userAgent);
                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
                assert(uaParsed.os.name, 'UserAgent should have an os name detected');
            });

            // the most frequent user agent must account for less than 40% of all runs
            assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );

        });

    });

});
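
// For illustration (an assumption about the project's tooling, not stated in
// this commit): with mocha available as a dev dependency, this suite could be
// run on its own with:
//
//   npx mocha test/user_agent.js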