diff --git a/.gitignore b/.gitignore index 0103c8d..7763e13 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,5 @@ typings/ .idea/ GoogleScraperPup.iml + +.http-mitm-proxy diff --git a/.gitmodules b/.gitmodules index caa6108..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "src/puppeteer-cluster"] - path = src/puppeteer-cluster - url = https://github.com/NikolaiT/puppeteer-cluster diff --git a/package-lock.json b/package-lock.json index 671d279..c5e9ae8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,16 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-12.7.5.tgz", "integrity": "sha512-9fq4jZVhPNW8r+UYKnxF1e2HkDWOWKM5bC2/7c9wPV835I0aOrVbS/Hw/pWPk2uKrNXQqg9Z959Kz+IYDd5p3w==" }, + "accepts": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", + "integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==", + "dev": true, + "requires": { + "mime-types": "~2.1.24", + "negotiator": "0.6.2" + } + }, "agent-base": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz", @@ -65,6 +75,12 @@ "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", "integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=" }, + "array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=", + "dev": true + }, "assertion-error": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", @@ -89,6 +105,47 @@ "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" }, + "bluebird": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", + "integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==", + "dev": true + }, + "body-parser": { + "version": "1.19.0", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz", + "integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==", + "dev": true, + "requires": { + "bytes": "3.1.0", + "content-type": "~1.0.4", + "debug": "2.6.9", + "depd": "~1.1.2", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "on-finished": "~2.3.0", + "qs": "6.7.0", + "raw-body": "2.4.0", + "type-is": "~1.6.17" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + } + } + }, "boolbase": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", @@ -114,6 +171,12 @@ "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" }, + "bytes": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", + "integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==", 
+ "dev": true + }, "cacheable-request": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz", @@ -176,6 +239,12 @@ } } }, + "charenc": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz", + "integrity": "sha1-wKHS86cJLgN3S/qD8UwPxXkKhmc=", + "dev": true + }, "check-error": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz", @@ -322,6 +391,33 @@ } } }, + "content-disposition": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz", + "integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==", + "dev": true, + "requires": { + "safe-buffer": "5.1.2" + } + }, + "content-type": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", + "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "dev": true + }, + "cookie": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz", + "integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg==", + "dev": true + }, + "cookie-signature": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", + "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=", + "dev": true + }, "core-util-is": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", @@ -340,6 +436,12 @@ "which": "^1.2.9" } }, + "crypt": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz", + "integrity": "sha1-iNf/fsDfuG9xPch7u0LQRNPmxBs=", + "dev": true + }, "css-select": { "version": "1.2.0", "resolved": "http://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", @@ -406,6 +508,18 @@ "object-keys": "^1.0.12" } }, + "depd": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", + "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=", + "dev": true + }, + "destroy": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", + "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=", + "dev": true + }, "diagnostics": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/diagnostics/-/diagnostics-1.1.1.tgz", @@ -472,6 +586,12 @@ "resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz", "integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI=" }, + "ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=", + "dev": true + }, "emoji-regex": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", @@ -486,6 +606,12 @@ "env-variable": "0.0.x" } }, + "encodeurl": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", + "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=", + "dev": true + }, "end-of-stream": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", @@ -542,6 +668,12 @@ "es6-promise": "^4.0.3" } }, + "escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=", + "dev": true + }, 
"escape-string-regexp": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", @@ -554,6 +686,12 @@ "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", "dev": true }, + "etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=", + "dev": true + }, "execa": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", @@ -569,6 +707,61 @@ "strip-eof": "^1.0.0" } }, + "express": { + "version": "4.17.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz", + "integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==", + "dev": true, + "requires": { + "accepts": "~1.3.7", + "array-flatten": "1.1.1", + "body-parser": "1.19.0", + "content-disposition": "0.5.3", + "content-type": "~1.0.4", + "cookie": "0.4.0", + "cookie-signature": "1.0.6", + "debug": "2.6.9", + "depd": "~1.1.2", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "~1.1.2", + "fresh": "0.5.2", + "merge-descriptors": "1.0.1", + "methods": "~1.1.2", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "path-to-regexp": "0.1.7", + "proxy-addr": "~2.0.5", + "qs": "6.7.0", + "range-parser": "~1.2.1", + "safe-buffer": "5.1.2", + "send": "0.17.1", + "serve-static": "1.14.1", + "setprototypeof": "1.1.1", + "statuses": "~1.5.0", + "type-is": "~1.6.18", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + } + } + }, "extract-zip": { "version": "1.6.7", "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz", @@ -613,6 +806,38 @@ "resolved": "https://registry.npmjs.org/fecha/-/fecha-2.3.3.tgz", "integrity": "sha512-lUGBnIamTAwk4znq5BcqsDaxSmZ9nDVJaij6NvRt/Tg4R69gERA+otPKbS86ROw9nxVMw2/mp1fnaiWqbs6Sdg==" }, + "finalhandler": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz", + "integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==", + "dev": true, + "requires": { + "debug": "2.6.9", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "statuses": "~1.5.0", + "unpipe": "~1.0.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + } + } + }, "find-up": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", @@ -644,6 +869,18 @@ "for-in": "^1.0.1" } }, + "forwarded": { + "version": "0.1.2", + "resolved": 
"https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", + "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=", + "dev": true + }, + "fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=", + "dev": true + }, "fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -757,6 +994,47 @@ "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz", "integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew==" }, + "http-errors": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", + "integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==", + "dev": true, + "requires": { + "depd": "~1.1.2", + "inherits": "2.0.3", + "setprototypeof": "1.1.1", + "statuses": ">= 1.5.0 < 2", + "toidentifier": "1.0.0" + } + }, + "http-mitm-proxy": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/http-mitm-proxy/-/http-mitm-proxy-0.8.2.tgz", + "integrity": "sha512-QqaqHWssz4acqu2aIPJqJWt/gDa4SzQ9kj/rs16ONA2nBWNh/mfOW0Ez1Wxa5IivHHZSTciQ7wG0Dxzogurngw==", + "dev": true, + "requires": { + "async": "^2.6.2", + "debug": "^4.1.0", + "mkdirp": "^0.5.1", + "node-forge": "^0.8.4", + "optimist": "^0.6.1", + "semaphore": "^1.1.0", + "ws": "^3.2.0" + }, + "dependencies": { + "ws": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz", + "integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==", + "dev": true, + "requires": { + "async-limiter": "~1.0.0", + "safe-buffer": "~5.1.0", + "ultron": "~1.1.0" + } + } + } + }, "https-proxy-agent": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz", @@ -776,6 +1054,15 @@ } } }, + "iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dev": true, + "requires": { + "safer-buffer": ">= 2.1.2 < 3" + } + }, "inflight": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", @@ -796,6 +1083,12 @@ "integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==", "dev": true }, + "ipaddr.js": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.0.tgz", + "integrity": "sha512-M4Sjn6N/+O6/IXSJseKqHoFc+5FdGJ22sXqnjTpdZweHK64MzEPAyQZyEU3R/KRv2GLoa7nNtg/C2Ev6m7z+eA==", + "dev": true + }, "is-arrayish": { "version": "0.3.2", "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", @@ -892,6 +1185,15 @@ "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz", "integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg=" }, + "key-cert": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/key-cert/-/key-cert-1.0.1.tgz", + "integrity": "sha512-WiaPESfEzsztL9KIxbX6mNAU34NcEOyLVrpajrTkXeVc2tAZDx3lcLQlIE+bUqEoaIl0InBoiIy6C5ToLJ7i0g==", + "dev": true, + "requires": { + "pem": "^1.12.5" + } + }, "keyv": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz", @@ -992,6 +1294,31 @@ "p-defer": "^1.0.0" } }, + "md5": { + "version": "2.2.1", + 
"resolved": "https://registry.npmjs.org/md5/-/md5-2.2.1.tgz", + "integrity": "sha1-U6s41f48iJG6RlMp6iP6wFQBJvk=", + "dev": true, + "requires": { + "charenc": "~0.0.1", + "crypt": "~0.0.1", + "is-buffer": "~1.1.1" + }, + "dependencies": { + "is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", + "dev": true + } + } + }, + "media-typer": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=", + "dev": true + }, "mem": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/mem/-/mem-4.3.0.tgz", @@ -1013,11 +1340,38 @@ "kind-of": "^3.0.2" } }, + "merge-descriptors": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", + "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=", + "dev": true + }, + "methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=", + "dev": true + }, "mime": { "version": "2.4.4", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz", "integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA==" }, + "mime-db": { + "version": "1.42.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.42.0.tgz", + "integrity": "sha512-UbfJCR4UAVRNgMpfImz05smAXK7+c+ZntjaA26ANtkXLlOe947Aag5zdIcKQULAiF9Cq4WxBi9jUs5zkA84bYQ==", + "dev": true + }, + "mime-types": { + "version": "2.1.25", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.25.tgz", + "integrity": "sha512-5KhStqB5xpTAeGqKBAMgwaYMnQik7teQN4IAzC7npDv6kzeU6prfkR67bc87J1kWMPGkoaZSq1npmexMgkmEVg==", + "dev": true, + "requires": { + "mime-db": "1.42.0" + } + }, "mimic-fn": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", @@ -1127,6 +1481,12 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" }, + "negotiator": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", + "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==", + "dev": true + }, "nice-try": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz", @@ -1143,6 +1503,12 @@ "semver": "^5.7.0" } }, + "node-forge": { + "version": "0.8.5", + "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz", + "integrity": "sha512-vFMQIWt+J/7FLNyKouZ9TazT74PRV3wgv9UT4cRjC8BffxFbKXkgIWR42URCPSnHm/QDz6BOlb2Q0U4+VQT67Q==", + "dev": true + }, "normalize-url": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz", @@ -1199,6 +1565,15 @@ "es-abstract": "^1.5.1" } }, + "on-finished": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", + "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", + "dev": true, + "requires": { + "ee-first": "1.1.1" + } + }, "once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -1212,6 +1587,16 @@ "resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz", "integrity": 
"sha1-+M33eISCb+Tf+T46nMN7HkSAdC4=" }, + "optimist": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", + "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", + "dev": true, + "requires": { + "minimist": "~0.0.1", + "wordwrap": "~0.0.2" + } + }, "os-locale": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", @@ -1223,6 +1608,12 @@ "mem": "^4.0.0" } }, + "os-tmpdir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", + "dev": true + }, "p-cancelable": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz", @@ -1278,6 +1669,12 @@ "@types/node": "*" } }, + "parseurl": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", + "dev": true + }, "path-exists": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", @@ -1295,12 +1692,38 @@ "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=", "dev": true }, + "path-to-regexp": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", + "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=", + "dev": true + }, "pathval": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz", "integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA=", "dev": true }, + "pem": { + "version": "1.14.3", + "resolved": "https://registry.npmjs.org/pem/-/pem-1.14.3.tgz", + "integrity": "sha512-Q+AMVMD3fzeVvZs5PHeI+pVt0hgZY2fjhkliBW43qyONLgCXPVk1ryim43F9eupHlNGLJNT5T/NNrzhUdiC5Zg==", + "dev": true, + "requires": { + "es6-promisify": "^6.0.0", + "md5": "^2.2.1", + "os-tmpdir": "^1.0.1", + "which": "^1.3.1" + }, + "dependencies": { + "es6-promisify": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-6.0.2.tgz", + "integrity": "sha512-eO6vFm0JvqGzjWIQA6QVKjxpmELfhWbDUWHm1rPfIbn55mhKPiAa5xpLmQWJrNa629ZIeQ8ZvMAi13kvrjK6Mg==", + "dev": true + } + } + }, "pend": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", @@ -1321,6 +1744,16 @@ "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" }, + "proxy-addr": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.5.tgz", + "integrity": "sha512-t/7RxHXPH6cJtP0pRG6smSr9QJidhB+3kXu0KgXnbGYMgzEnUxRQ4/LDdfOwZEMyIh3/xHb8PX3t+lfL9z+YVQ==", + "dev": true, + "requires": { + "forwarded": "~0.1.2", + "ipaddr.js": "1.9.0" + } + }, "proxy-from-env": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", @@ -1350,6 +1783,14 @@ "ws": "^6.1.0" } }, + "puppeteer-cluster": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz", + "integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==", + "requires": { + "debug": "^4.1.1" + } + }, "puppeteer-extra": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz", @@ -1427,6 +1868,30 @@ } } }, + "qs": { + "version": "6.7.0", + "resolved": 
"https://registry.npmjs.org/qs/-/qs-6.7.0.tgz", + "integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ==", + "dev": true + }, + "range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", + "dev": true + }, + "raw-body": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz", + "integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==", + "dev": true, + "requires": { + "bytes": "3.1.0", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "unpipe": "1.0.0" + } + }, "readable-stream": { "version": "3.4.0", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.4.0.tgz", @@ -1470,18 +1935,94 @@ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "dev": true + }, + "semaphore": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/semaphore/-/semaphore-1.1.0.tgz", + "integrity": "sha512-O4OZEaNtkMd/K0i6js9SL+gqy0ZCBMgUvlSqHKi4IBdjhe7wB8pwztUk1BbZ1fmrvpwFrPbHzqd2w5pTcJH6LA==", + "dev": true + }, "semver": { "version": "5.7.0", "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz", "integrity": "sha512-Ya52jSX2u7QKghxeoFGpLwCtGlt7j0oY9DYb5apt9nPlJ42ID+ulTXESnt/qAQcoSERyZ5sl3LDIOw0nAn/5DA==", "dev": true }, + "send": { + "version": "0.17.1", + "resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz", + "integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==", + "dev": true, + "requires": { + "debug": "2.6.9", + "depd": "~1.1.2", + "destroy": "~1.0.4", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "~1.7.2", + "mime": "1.6.0", + "ms": "2.1.1", + "on-finished": "~2.3.0", + "range-parser": "~1.2.1", + "statuses": "~1.5.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + }, + "dependencies": { + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + } + } + }, + "mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "dev": true + } + } + }, + "serve-static": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz", + "integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==", + "dev": true, + "requires": { + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "parseurl": "~1.3.3", + "send": "0.17.1" + } + }, "set-blocking": { "version": "2.0.0", 
"resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", "dev": true }, + "setprototypeof": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz", + "integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==", + "dev": true + }, "shallow-clone": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", @@ -1553,6 +2094,12 @@ "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", "integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA=" }, + "statuses": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", + "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=", + "dev": true + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -1618,6 +2165,12 @@ "resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz", "integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q==" }, + "toidentifier": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", + "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==", + "dev": true + }, "triple-beam": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.3.0.tgz", @@ -1629,11 +2182,33 @@ "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==", "dev": true }, + "type-is": { + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "dev": true, + "requires": { + "media-typer": "0.3.0", + "mime-types": "~2.1.24" + } + }, "typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" }, + "ua-parser-js": { + "version": "0.7.21", + "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.21.tgz", + "integrity": "sha512-+O8/qh/Qj8CgC6eYBVBykMrNtp5Gebn4dlGD/kKXVkJNDwyrAwSIqwz8CDf+tsAIWVycKcku6gIXJ0qwx/ZXaQ==", + "dev": true + }, + "ultron": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz", + "integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==", + "dev": true + }, "underscore": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz", @@ -1647,6 +2222,12 @@ "underscore": "*" } }, + "unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=", + "dev": true + }, "url-parse-lax": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz", @@ -1669,6 +2250,18 @@ "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" }, + "utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=", + "dev": true + }, + "vary": { + "version": 
"1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=", + "dev": true + }, "which": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", @@ -1742,6 +2335,12 @@ } } }, + "wordwrap": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", + "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=", + "dev": true + }, "wrap-ansi": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", diff --git a/package.json b/package.json index e4c4bad..8533f13 100644 --- a/package.json +++ b/package.json @@ -5,8 +5,7 @@ "homepage": "https://scrapeulous.com/", "main": "index.js", "scripts": { - "postinstall": "cd src/puppeteer-cluster && npm install && npm run build", - "test": "mocha test/static_tests/" + "test": "mocha test test/modules" }, "keywords": [ "scraping", @@ -27,14 +26,20 @@ "got": "^9.6.0", "lodash": "^4.17.14", "puppeteer": "^2.0.0", + "puppeteer-cluster": "^0.18.0", "puppeteer-extra": "^2.1.3", "puppeteer-extra-plugin-stealth": "^2.2.2", "user-agents": "^1.0.378", "winston": "^3.2.1" }, "devDependencies": { + "bluebird": "^3.7.2", "chai": "^4.2.0", "chai-string": "^1.5.0", - "mocha": "^6.1.4" + "express": "^4.17.1", + "http-mitm-proxy": "^0.8.2", + "key-cert": "^1.0.1", + "mocha": "^6.1.4", + "ua-parser-js": "^0.7.21" } } diff --git a/server/server.js b/server/server.js deleted file mode 100644 index 0512196..0000000 --- a/server/server.js +++ /dev/null @@ -1,66 +0,0 @@ -/** - - Test server with: - -curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ --d '{ - "browser_config": { - "random_user_agent": true - }, - "scrape_config": { - "search_engine": "google", - "keywords": ["test"], - "num_pages": 1 - } -}' - -*/ - -const se_scraper = require('../index.js'); -'use strict'; -const express = require('express'); - -// Constants -const PORT = process.env.PORT || 3000; -const HOST = process.env.HOST || '0.0.0.0'; - -// App -const app = express(); -app.use(express.json()); - -let browser_config = { - random_user_agent: true, - headless : true, - debug_level: 1, - sleep_range: '', - puppeteer_cluster_config: { - timeout: 30 * 60 * 1000, // max timeout set to 30 minutes - monitor: false, - concurrency: 1, // one scraper per tab - maxConcurrency: 1, // scrape with 5 tabs - } -}; - -app.post('/', async (req, res) => { - if (!req.body.browser_config || !req.body.scrape_config) { - res.json({ - 'status': 400, - 'msg': 'please specify browser_config and scrape_config' - }); - } else { - // overwrite standard browser config - Object.assign(browser_config, req.body.browser_config); - - var scraper = new se_scraper.ScrapeManager(browser_config); - await scraper.start(); - var results = await scraper.scrape(req.body.scrape_config); - // console.dir(results, {depth: null, colors: true}); - await scraper.quit(); - - res.send(results); - } -}); - -app.listen(PORT, HOST); - -console.log(`Running on http://${HOST}:${PORT}`); diff --git a/src/concurrency-implementation.js b/src/concurrency-implementation.js new file mode 100644 index 0000000..bace89c --- /dev/null +++ b/src/concurrency-implementation.js @@ -0,0 +1,55 @@ +const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency'); +const debug = require('debug')('se-scraper:CustomConcurrency'); +const { timeoutExecute } = require('puppeteer-cluster/dist/util'); + +const BROWSER_TIMEOUT = 5000; + +class CustomConcurrency extends Browser { + + 
async init() {} + async close() {} + + async workerInstance() { + const options = this.options.perBrowserOptions.shift(); + debug('Launch puppeteer instance with options=%o', options); + let chrome = await this.puppeteer.launch(options); + let page; + let context; + + return { + jobInstance: async () => { + await timeoutExecute(BROWSER_TIMEOUT, (async () => { + context = await chrome.createIncognitoBrowserContext(); + page = await context.newPage(); + })()); + + return { + resources: { + page, + }, + + close: async () => { + await timeoutExecute(BROWSER_TIMEOUT, context.close()); + }, + }; + }, + + close: async () => { + await chrome.close(); + }, + + repair: async () => { + debug('Starting repair'); + try { + // will probably fail, but just in case the repair was not necessary + await chrome.close(); + } catch (e) {} + + // just relaunch as there is only one page per browser + chrome = await this.puppeteer.launch(options); + }, + }; + } +}; + +module.exports = CustomConcurrency; \ No newline at end of file diff --git a/src/modules/bing.js b/src/modules/bing.js index 0cda19a..78f2d2a 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -123,12 +123,9 @@ class BingScraper extends Scraper { } } - try { - await this.page.goto(startUrl); - await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - } catch (e) { - return false; - } + await this.page.goto(startUrl); + await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); + return true; } diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index 8fbf96e..2a3a536 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -1,15 +1,18 @@ const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); +const debug = require('debug')('se-scraper:DuckduckgoScraper'); class DuckduckgoScraper extends Scraper { parse(html) { + debug('parse'); // load the page source into cheerio const $ = cheerio.load(html); // perform queries const results = []; - $('#links .result__body').each((i, link) => { + const organicSelector = ($('#links .result--sep').length > 0) ? 
`#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body'; + $(organicSelector).each((i, link) => { results.push({ link: $(link).find('.result__title .result__a').attr('href'), title: $(link).find('.result__title .result__a').text(), @@ -42,19 +45,17 @@ class DuckduckgoScraper extends Scraper { } async load_start_page() { + debug('load_start_page'); + let startUrl = 'https://duckduckgo.com/'; - let startUrl = 'https://duckduckgo.com/?q=test'; - - try { - this.last_response = await this.page.goto(startUrl); - await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - } catch (e) { - return false; - } + this.last_response = await this.page.goto(startUrl); + await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); + return true; } async search_keyword(keyword) { + debug('search_keyword'); const input = await this.page.$('input[name="q"]'); await this.set_input_value(`input[name="q"]`, keyword); await this.sleep(50); @@ -63,21 +64,19 @@ } async next_page() { - let next_page_link = await this.page.$('.result.result--more', {timeout: this.STANDARD_TIMEOUT}); + debug('next_page'); + let next_page_link = await this.page.$('.result.result--more a'); if (!next_page_link) { return false; } await next_page_link.click(); - try { - await this.page.waitForNavigation({timeout: this.STANDARD_TIMEOUT}); - } catch(e) { - return false; - } + await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT }); return true; } async wait_for_results() { + debug('wait_for_results'); await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT }); } diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index d4f64c9..7de0a3d 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -10,6 +10,7 @@ const debug = require('debug')('se-scraper:Scraper'); module.exports = class Scraper { constructor(options = {}) { + debug('constructor'); const { config = {}, context = {}, @@ -49,7 +50,9 @@ } } - async run({page, data}) { + async run({page, data, worker}) { + + debug('worker=%o keywords=%o', worker, this.config.keywords); if (page) { this.page = page; diff --git a/src/node_scraper.js b/src/node_scraper.js index 2f51169..2dec432 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -6,6 +6,7 @@ const _ = require('lodash'); const { createLogger, format, transports } = require('winston'); const { combine, timestamp, printf } = format; const debug = require('debug')('se-scraper:ScrapeManager'); +const { Cluster } = require('puppeteer-cluster'); const UserAgent = require('user-agents'); const google = require('./modules/google.js'); @@ -13,7 +14,7 @@ const bing = require('./modules/bing.js'); const yandex = require('./modules/yandex.js'); const infospace = require('./modules/infospace.js'); const duckduckgo = require('./modules/duckduckgo.js'); -const { Cluster } = require('./puppeteer-cluster/dist/index.js'); +const CustomConcurrencyImpl = require('./concurrency-implementation'); const MAX_ALLOWED_BROWSERS = 6; @@ -185,6 +186,10 @@ this.logger.info(`${this.config.proxies.length} proxies read from file.`); } + if (!this.config.proxies && this.config.use_proxies_only) { + throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only'); + } + debug('this.config=%O', this.config); } @@ -215,92 +220,70 @@ const
chrome_flags = _.clone(this.config.chrome_flags); - if (this.config.random_user_agent) { - const userAgent = new UserAgent({ deviceCategory: 'desktop' }); - this.config.user_agent = userAgent.toString(); - } - - if (this.config.user_agent) { - chrome_flags.push( - `--user-agent=${this.config.user_agent}` - ) - } - - var launch_args = { - args: chrome_flags, - headless: this.config.headless, - ignoreHTTPSErrors: true, - }; - - debug('Using the following puppeteer configuration launch_args=%O', launch_args); - if (this.pluggable && this.pluggable.start_browser) { launch_args.config = this.config; - this.browser = await this.pluggable.start_browser(launch_args); + this.browser = await this.pluggable.start_browser({ + config: this.config, + }); this.page = await this.browser.newPage(); } else { // if no custom start_browser functionality was given // use puppeteer-cluster for scraping - this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; - var perBrowserOptions = []; - - // the first browser this.config with home IP - if (!this.config.use_proxies_only) { - perBrowserOptions.push(launch_args); - } - + let proxies; // if we have at least one proxy, always use CONCURRENCY_BROWSER // and set maxConcurrency to this.config.proxies.length + 1 // else use whatever this.configuration was passed if (this.config.proxies && this.config.proxies.length > 0) { - this.config.puppeteer_cluster_config.concurrency = Cluster.CONCURRENCY_BROWSER; // because we use real browsers, we ran out of memory on normal laptops // when using more than maybe 5 or 6 browsers. // therefore hardcode a limit here + // TODO: not sure this is what we want this.numClusters = Math.min( this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1), MAX_ALLOWED_BROWSERS ); + proxies = _.clone(this.config.proxies); - this.logger.info(`Using ${this.numClusters} clusters.`); - - this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters; - - for (var proxy of this.config.proxies) { - perBrowserOptions.push({ - headless: this.config.headless, - ignoreHTTPSErrors: true, - args: chrome_flags.concat(`--proxy-server=${proxy}`) - }) + // Insert a first config without a proxy if use_proxies_only is false + if (this.config.use_proxies_only === false) { + proxies.unshift(null); } + + } else { + this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; + // one null entry per cluster; _.times(n, null) would return indexes, not nulls + proxies = _.times(this.numClusters, () => null); } - // Give the per browser options each a random user agent when random user agent is set - while (perBrowserOptions.length < this.numClusters) { - const userAgent = new UserAgent(); - perBrowserOptions.push({ + this.logger.info(`Using ${this.numClusters} clusters.`); + + // Build the launch options for each browser + const perBrowserOptions = _.map(proxies, (proxy) => { + const userAgent = (this.config.random_user_agent) ?
(new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent; + let args = chrome_flags.concat([`--user-agent=${userAgent}`]); + + if (proxy) { + args = args.concat([`--proxy-server=${proxy}`]); + } + + return { headless: this.config.headless, ignoreHTTPSErrors: true, - args: default_chrome_flags.slice().concat(`--user-agent=${userAgent.toString()}`) - }) - } + args + }; + }); debug('perBrowserOptions=%O', perBrowserOptions) this.cluster = await Cluster.launch({ monitor: this.config.puppeteer_cluster_config.monitor, timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes - concurrency: this.config.puppeteer_cluster_config.concurrency, - maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency, - puppeteerOptions: launch_args, - perBrowserOptions: perBrowserOptions, - }); - - this.cluster.on('taskerror', (err, data) => { - this.logger.error(`Error while scraping ${data}: ${err.message}`); - debug('Error during cluster task', err); + concurrency: CustomConcurrencyImpl, + maxConcurrency: this.numClusters, + puppeteerOptions: { + perBrowserOptions: perBrowserOptions + } }); } } @@ -352,26 +335,21 @@ class ScrapeManager { chunks[k % this.numClusters].push(this.config.keywords[k]); } - let execPromises = []; - let scraperInstances = []; - for (var c = 0; c < chunks.length; c++) { - this.config.keywords = chunks[c]; + debug('chunks=%o', chunks); - if (this.config.use_proxies_only) { - this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy - } else if(c > 0) { - this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address - } + let execPromises = []; + for (var c = 0; c < chunks.length; c++) { + const config = _.clone(this.config); + config.keywords = chunks[c]; var obj = getScraper(this.config.search_engine, { - config: this.config, + config: config, context: {}, pluggable: this.pluggable, }); var boundMethod = obj.run.bind(obj); execPromises.push(this.cluster.execute({}, boundMethod)); - scraperInstances.push(obj); } let promiseReturns = await Promise.all(execPromises); diff --git a/src/puppeteer-cluster b/src/puppeteer-cluster deleted file mode 160000 index f333cd0..0000000 --- a/src/puppeteer-cluster +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8 diff --git a/test/html_output.js b/test/html_output.js new file mode 100644 index 0000000..5afdae0 --- /dev/null +++ b/test/html_output.js @@ -0,0 +1,101 @@ +'use strict'; +const express = require('express'); +const { createLogger, transports } = require('winston'); +const http = require('http'); +const https = require('https'); +const assert = require('assert'); +const path = require('path'); +const keyCert = require('key-cert'); +const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); + +const debug = require('debug')('se-scraper:test'); +const se_scraper = require('../'); + +const httpPort = 3012; +const httpsPort = httpPort + 1; +const proxyPort = httpPort + 2; + +const fakeSearchEngine = express(); +fakeSearchEngine.get('/search', (req, res) => { + debug('q=%s', req.query.q); + const pageNumber = ((req.query.start/10) || 0) + 1; + res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); +}); +fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); + +describe('Config', function(){ + + let httpServer, httpsServer, proxy; + before(async function(){ + // Mount our fake engine on both an http and an https
server + httpServer = http.createServer(fakeSearchEngine); + httpsServer = https.createServer(await keyCert(), fakeSearchEngine); + + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); + return callback(); + }); + + await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); + await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); + debug('Fake http search engine servers started'); + }); + + after(function(){ + httpsServer.close(); + httpServer.close(); + proxy.close(); + }); + + describe('html_output', function(){ + + const testLogger = createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + + /** + * Test html_output option + */ + it('html_output single page single keyword', async function () { + + const scrape_job = { + search_engine: 'google', + /* TODO refactor start_url + google_settings: { + start_url: 'http://localhost:' + httpPort + }, + */ + keywords: ['test keyword'], + }; + + var scraper = new se_scraper.ScrapeManager({ + throw_on_detection: true, + logger: testLogger, + html_output: true, + //clean_html_output: false, + //clean_data_images: false, + // TODO: refactor start_url so we can use it instead of depending on the proxy for this test + proxies: ['http://localhost:' + proxyPort], + use_proxies_only: true, + }); + await scraper.start(); + const { results } = await scraper.scrape(scrape_job); + await scraper.quit(); + + assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided'); + + }); + + }); + +}); \ No newline at end of file diff --git a/test/mocks/bing/index.html b/test/mocks/bing/index.html new file mode 100644 index 0000000..02b3071 --- /dev/null +++ b/test/mocks/bing/index.html @@ -0,0 +1,23 @@ +Bing

Image of the day

janv. 8, 2020
Avoir le souffle coupé, ça se mérite
© Bogdan Dyiakonovych/Shutterstock
S’il vous prenait l’envie de gravir un petit millier de marches, comme ça, juste pour le plaisir, on vous conseille de vous rendre au mont Tianmen, en Chine (littéralement « la porte du Paradis », rien que ça). Situé 1 500 mètres au-dessus du niveau de la mer, ce trou dans la montagne, que vous voyez sur l’image d’aujourd’hui, est la plus haute arche naturelle au monde. A l’origine il y avait une grotte mais cette dernière se transforma en arche en 263 avant J.C quand l’arrière de la montagne s’effondra, créant alors cette cavité béante. Pour atteindre le sommet, il vous faudra escalader les 999 marches qui y mènent. Mais la vue en vaut la peine, c’est promis.
Learn more
\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page1.html b/test/mocks/bing/test keyword_page1.html new file mode 100644 index 0000000..445adab --- /dev/null +++ b/test/mocks/bing/test keyword_page1.html @@ -0,0 +1,42 @@ +test keyword - Bing

3 600 000 results
  1. Keyword Tests | TestComplete Documentation

    https://support.smartbear.com/testcomplete/docs/keyword-testing/index.html

    In keyword-driven tests, every action to be performed (a mouse click, keystroke, and so on) is described by a keyword. TestComplete supports this kind of tests. This section contains topics that describe how to create and use keyword tests with TestComplete.

  2. About Keyword Testing | TestComplete Documentation

    https://support.smartbear.com/testcomplete/docs/keyword-testing/overview.html

    Some keyword test operations (for instance, the Log Message, Delay and Comment operations) do not have nodes. If you add such operations to a test, they may break the current group node. To avoid this, TestComplete allows you to specify keyword test operations that do not break operation groups when they are added to a test.

  3. https://keywordtool.io/fr

    Keyword Tool vous permet d'extraire des mots-clés de 192 domaines Google et d'utiliser 83 langues pour générer des suggestions de mots clés. De cette façon, nous nous assurons que les mots clés générés seront pertinents pour le pays et / ou la langue pour laquelle vous créez votre contenu.

  4. Keyword-driven testing - Wikipedia

    https://en.wikipedia.org/wiki/Keyword-driven_testing

    A Keyword or Action Word is a defined combination of actions on a test object which describes how test lines must be executed. An action word contains arguments and is defined by a test analyst. An action word contains arguments and is defined by a test analyst.

  5. https://ads.google.com/intl/fr_fr/home/tools/keyword-planner

    Découvrez de nouveaux mots clés Cherchez des termes ou des expressions en rapport avec vos produits ou services. Notre outil de recherche de mots clés vous aidera à trouver les mots clés les plus pertinents pour votre activité.

  6. https://www.woorank.com/fr

    Un Keyword Tool performant . Vous souhaitez connaître la position exacte de vos mots-clés ? Suivez en détail la position de vos mots-clés, analysez l'historique de vos performances, la popularité des mots-clés que vous avez choisi et comparez vos résultats avec ceux de vos concurrents avec l'outil Keyword Tool. Essayez-le maintenant!

\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page2.html b/test/mocks/bing/test keyword_page2.html new file mode 100644 index 0000000..8154635 --- /dev/null +++ b/test/mocks/bing/test keyword_page2.html @@ -0,0 +1,42 @@ +test keyword - Bing

7-16 of 3 600 000 results
  1. Keywords - TestLink

    testlink.sourceforge.net/docs/docs/toc.php?page=16

    Keywords were created to gives users another level of depth when categorizing test cases. Keyword Creation. At this time keywords can only be created by users with the mgt_modify_key rights. These rights are currently held only by leads. Once a keyword or grouping of keywords have been created users may assign them to test cases.

  2. https://docs.microsoft.com/fr-fr/dotnet/csharp/language-reference/keywords/is

    Type pattern, which tests whether an expression can be converted to a specified type and, if it can be, casts it to a variable of that type. Modèle de constante : teste si une expression correspond à une valeur de constante spécifiée. Constant pattern, which tests whether an expression evaluates to …

  3. Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative

    https://keywordtool.io

    Using Keyword Tool, you can choose a specific Google domain out of 192 supported domains and one out of 83 languages that will be used to produce keyword suggestions. The free version of Keyword Tool can generate up to 750+ keywords from Google autocomplete in seconds.

  4. Parameterizing Keyword Tests with TestComplete - YouTube

    https://www.youtube.com/watch?v=3Ed8T7XcpH8

    22/06/2011 · This video demonstrates how you can create and use parameters in automated keyword-driven tests. Parameters allow you to pass data into keyword-driven tests or between tests. By replacing hard ...

    • Author: SmartBear
    • Views: 5K
  5. https://www.seopageoptimizer.fr/fr/default/2008302/SEO/Test.aspx

    Faites le test dès maintenant. Cet outil permet de tester si votre page est conforme aux exigences de Google et la compare avec les pages obtenant les meilleurs scores. SEO Page Optimizer vous donne toutes les clés pour vous approcher au maximum des chiffres indiqués. Vous pouvez ainsi obtenir un meilleur positionnement dans Google.

  6. Keyword Rank Checker - A Free online Google keyword ...

    https://smallseotools.com/keyword-position

    Keyword Position Checker is a tool used to detect the position of a website or URL in the search engine (particularly, Google) for a given keyword as per competing with other websites for the same keyword.

  7. Choose the Right Keywords with Our Research Tools - Google Ads

    https://ads.google.com/home/tools/keyword-planner

    Our keyword research tool gives you insight into how often certain words are searched and how those searches have changed over time. This can help you narrow your …

  8. Search Results for - Tests.com

    https://tests.com/wordsearch?keyword=+

    Test Product/Service Organization Description Cost; Save Checked Listings | Compare Checked Listings. About Us | Contact | Terms of Sale / Refunds

  9. https://www.rtbf.be/tendance/mot-cle_test?keyword=1158186

  10. Keyword Density Checker | Online Keyword Density Tool Free

    https://smallseotools.com/keyword-density-checker

    Keyword Density Checker is a tool built solely for the purpose of calculating the keyword density of any web page. The dev team at Small SEO Tools created the tool after finding out that some marketers were still stuffing their content with loads of keywords even without realizing it.

\ No newline at end of file diff --git a/test/mocks/bing/test keyword_page3.html b/test/mocks/bing/test keyword_page3.html new file mode 100644 index 0000000..7f1d6fd --- /dev/null +++ b/test/mocks/bing/test keyword_page3.html @@ -0,0 +1,40 @@ +test keyword - Bing

17-26 of 3 600 000 results
  1. Keyword Driven Testing | TestComplete

    https://smartbear.com/product/testcomplete/features/keyword-driven-testing

    Keyword-driven testing is an approach to software testing that separates test case design from execution. Each automated UI test is built on a series of operations, specified by keywords, that simulates a user action, such as a mouse click or keystroke.

  2. Keywords Usage Test | SEO Site Checkup

    https://seositecheckup.com/tools/keywords-usage-test

    Keywords Usage Test What is it? This will check if your most common keywords are used in the webpage's title and description. Check your URL: × Would you like to analyze more competitors simultaneously? Sign up for our free trial. Checkup! How do I fix it? First of all, you must make sure that your page is using the title and meta-description tags. Second, you must adjust these tags content ...

  3. https://www.rtbf.be/tendance/mot-cle_test?keyword=1158186

  4. https://fr.sputniknews.com/tags/keyword_test

    Les tests de la fusée Starhopper de SpaceX ont été une nouvelle fois interrompus après que sa partie supérieure a pris feu dès les premières secondes.

  5. Keywords Cloud Test | SEO Site Checkup

    https://seositecheckup.com/tools/keywords-cloud-test

    The Keyword Cloud is a visual representation of keywords used on your website. This will show you which words are frequently used in the content of your webpage. Keywords having higher density are presented in larger fonts and displayed in alphabetic order.

  6. MTest keywords - Generation

    tfel.sourceforge.net/MTest-keywords.html

    The keyword @ImposedOpeningDisplacement is not documented yet. The @ImposedStrain keyword. The @ImposedStrain keyword allows the user to impose the evolution of a component of the strains. This keyword may have one option, which is the way the evolution will be defined. Two values are accepted: evolution and function.

  7. https://deusyss.developpez.com/tutoriels/Python/Robotframework

    La version complète convient aux développeurs en leur offrant la possibilité de coder des tests complets et/ou des keywords directement en Python. Dans tous les cas, il permet l'écriture rapide de jeux de tests, et ses rapports, à la fois complets et explicites ne pourront …

  8. YouTube Keyword Tool: Generate YouTube Tags & Keywords for ...

    https://kparser.com/youtube-keyword-tool

    Get 7x more the best YouTube keywords with YouTube Keyword Tool alternative! Explore the step-by-step algorithm for using Kparser for blogging: find new ideas for own videos, generate thousands of long-tail suggestions for the most profitable keywords in YouTube title, description, tags.

  9. KWFinder: Keyword Research and Analysis Tool

    https://kwfinder.com

    Keywords with exact search volumes Get search volumes with historical data. Timing is the key! Be ready and create content based on historical search volumes and long-term trends. Identify seasonal keywords and hot topics 🌶️ that will boost the organic traffic of your website.

  10. Take the SEO Expert Quiz - Moz

    https://moz.com/seo-expert-quiz

    The SEO Expert Quiz has 50 action-packed questions and takes 15 minutes to complete. You have nothing to lose and a lot of prestige to gain. Let the games …

\ No newline at end of file
diff --git a/test/mocks/duckduckgo/index.html b/test/mocks/duckduckgo/index.html
new file mode 100644
index 0000000..274e284
--- /dev/null
+++ b/test/mocks/duckduckgo/index.html
@@ -0,0 +1,148 @@
[148 added lines of saved DuckDuckGo homepage markup; only the visible title survived extraction: "DuckDuckGo — Privacy, simplified."]
diff --git a/test/mocks/duckduckgo/test keyword_page1.html b/test/mocks/duckduckgo/test keyword_page1.html
new file mode 100644
index 0000000..7d522e4
--- /dev/null
+++ b/test/mocks/duckduckgo/test keyword_page1.html
@@ -0,0 +1,3 @@
+test keyword at DuckDuckGoIgnore this box please.
diff --git a/test/mocks/duckduckgo/test keyword_page2.html b/test/mocks/duckduckgo/test keyword_page2.html
new file mode 100644
index 0000000..aa3e61f
--- /dev/null
+++ b/test/mocks/duckduckgo/test keyword_page2.html
@@ -0,0 +1,3 @@
+test keyword at DuckDuckGoIgnore this box please.
diff --git a/test/mocks/duckduckgo/test keyword_page3.html b/test/mocks/duckduckgo/test keyword_page3.html
new file mode 100644
index 0000000..6d235e3
--- /dev/null
+++ b/test/mocks/duckduckgo/test keyword_page3.html
@@ -0,0 +1,3 @@
+test keyword at DuckDuckGoIgnore this box please.
diff --git a/test/mocks/google/index.html b/test/mocks/google/index.html
new file mode 100644
index 0000000..488d185
--- /dev/null
+++ b/test/mocks/google/index.html
@@ -0,0 +1,358 @@
[358 added lines of saved Google homepage markup; only the visible text survived extraction: "Google" and "Google offered in: Français"]
\ No newline at end of file
diff --git a/test/mocks/google/test keyword_page1.html b/test/mocks/google/test keyword_page1.html
new file mode 100644
index 0000000..975f547
--- /dev/null
+++ b/test/mocks/google/test keyword_page1.html
@@ -0,0 +1,209 @@
+test keyword - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
About 298,000,000 results (0.77 seconds) 

Ad

  1. Outil de mots clés SEMrush : Surpassez vos concurrents - Essai gratuit. Best SEO Suite 2018. 150 bases de données. 3M+ d’utilisateurs. 8 milliards de mots clés. Services: Analyse de mots clés, Audit de site, Analyse du trafic, Suivi de position, Recherche organique.
    TRIAL -
    $0.00/mo
    -
    S'ABONNER
    · More
    PRO -
    $99.95/mo
    S'ABONNER
    BUSINESS -
    $399.95/mo
    S'ABONNER
    GURU -
    $199.95/mo
    S'ABONNER

    People also search for

Google apps
\ No newline at end of file
diff --git a/test/mocks/google/test keyword_page2.html b/test/mocks/google/test keyword_page2.html
new file mode 100644
index 0000000..ab36724
--- /dev/null
+++ b/test/mocks/google/test keyword_page2.html
@@ -0,0 +1,206 @@
+test keyword - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Page 2 of about 234,000,000 results (0.38 seconds) 

Ad

  1. Improve Your Quality Score. Boost Your CTR. Reduce Your CPC. 5X Faster Than The Editor. Automatic Negative Keyword Generation, DKI, 24/7 Chat Support & More. Sign Up...

    People also search for

Google apps
\ No newline at end of file
diff --git a/test/mocks/google/test keyword_page3.html b/test/mocks/google/test keyword_page3.html
new file mode 100644
index 0000000..b27b638
--- /dev/null
+++ b/test/mocks/google/test keyword_page3.html
@@ -0,0 +1,191 @@
+test keyword - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Page 3 of about 234,000,000 results (0.54 seconds) 
Google apps
\ No newline at end of file
diff --git a/test/modules/bing.js b/test/modules/bing.js
new file mode 100644
index 0000000..bbc9dd0
--- /dev/null
+++ b/test/modules/bing.js
@@ -0,0 +1,123 @@
+'use strict';
+const express = require('express');
+const puppeteer = require('puppeteer');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const { BingScraper } = require('../../src/modules/bing');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res, next) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = Math.round((req.query.first || 0) / 10) + 1;
+    res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
+
+describe('Module Bing', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here we mount our fake engine on both an HTTP and an HTTPS server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        proxy.close();
+        httpsServer.close();
+        httpServer.close();
+    });
+
+    let browser;
+    let page;
+    beforeEach(async function(){
+        debug('Start a new browser');
+        browser = await puppeteer.launch({
+            //dumpio: true,
+            //headless: false,
+            ignoreHTTPSErrors: true,
+            args: [ '--proxy-server=http://localhost:' + proxyPort ]
+        });
+        debug('Open a fresh page');
+        page = await browser.newPage();
+    });
+
+    afterEach(async function(){
+        await browser.close();
+    });
+
+    const testLogger = createLogger({
+        transports: [
+            new transports.Console({
+                level: 'error'
+            })
+        ]
+    });
+
+    it('one keyword one page', function(){
+        const bingScraper = new BingScraper({
+            config: {
+                search_engine_name: 'bing',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+            }
+        });
+        bingScraper.STANDARD_TIMEOUT = 500;
+        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 1, 'Must do one request');
+            assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
+        });
+    });
+
+    it('one keyword 3 pages', function () {
+        const bingScraper = new BingScraper({
+            config: {
+                search_engine_name: 'bing',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+                num_pages: 3,
+            }
+        });
+        bingScraper.STANDARD_TIMEOUT = 500;
+        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 3, 'Must do three requests');
+            assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
+            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
+            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
+            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
+            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
+            assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
+        });
+    });
+
+});
\ No newline at end of file
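Note on the fake Bing route above: Bing paginates with a "first" offset parameter (1, 11, 21, ...), and the route converts it back into the 1-based page number used to pick a mock file. A standalone sketch of that mapping, with the values it produces; the helper name is ours, not part of the diff:

'use strict';
const assert = require('assert');

// Same arithmetic as the fake /search route: offset -> 1-based page number.
function bingPageNumber(first) {
    return Math.round((first || 0) / 10) + 1;
}

assert.strictEqual(bingPageNumber(undefined), 1); // no param -> page 1
assert.strictEqual(bingPageNumber(1), 1);         // Math.round(0.1) === 0
assert.strictEqual(bingPageNumber(11), 2);        // Math.round(1.1) === 1
assert.strictEqual(bingPageNumber(21), 3);
console.log('offset-to-page mapping behaves as the mock route expects');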
diff --git a/test/modules/duckduckgo.js b/test/modules/duckduckgo.js
new file mode 100644
index 0000000..0997200
--- /dev/null
+++ b/test/modules/duckduckgo.js
@@ -0,0 +1,140 @@
+'use strict';
+const express = require('express');
+const puppeteer = require('puppeteer');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.use(express.urlencoded({ extended: true }));
+fakeSearchEngine.get('/', (req, res, next) => {
+    if(!req.query.q){
+        return next();
+    }
+    debug('q=%s page=%d', req.query.q, req.query.page);
+    const pageNumber = req.query.page;
+    res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.post('/html', (req, res) => {
+    debug('body=%o', req.body);
+    const pageNumber = 1;
+    res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']}));
+
+describe('Module DuckDuckGo', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here we mount our fake engine on both an HTTP and an HTTPS server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('proxy askedHost=%s method=%s url=%s toPort=%s',
+                ctx.clientToProxyRequest.headers.host,
+                ctx.clientToProxyRequest.method,
+                ctx.clientToProxyRequest.url,
+                ctx.proxyToServerRequestOptions.port
+            );
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        proxy.close();
+        httpsServer.close();
+        httpServer.close();
+    });
+
+    let browser;
+    let page;
+    beforeEach(async function(){
+        debug('Start a new browser');
+        browser = await puppeteer.launch({
+            //dumpio: true,
+            //headless: false,
+            ignoreHTTPSErrors: true,
+            args: [ '--proxy-server=http://localhost:' + proxyPort ]
+        });
+        debug('Open a fresh page');
+        page = await browser.newPage();
+    });
+
+    afterEach(async function(){
+        await browser.close();
+    });
+
+    const testLogger = createLogger({
+        transports: [
+            new transports.Console({
+                level: 'error'
+            })
+        ]
+    });
+
+    it('one keyword one page', function(){
+        const duckduckgoScraper = new DuckduckgoScraper({
+            config: {
+                search_engine_name: 'duckduckgo',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+            }
+        });
+        duckduckgoScraper.STANDARD_TIMEOUT = 1000;
+        return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 1, 'Must do one request');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
+        });
+    });
+
+    it('one keyword 3 pages', function () {
+        this.timeout(4000);
+        const duckduckgoScraper = new DuckduckgoScraper({
+            config: {
+                search_engine_name: 'duckduckgo',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+                num_pages: 3,
+            }
+        });
+        duckduckgoScraper.STANDARD_TIMEOUT = 1000;
+        return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 3, 'Must do three requests');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
+            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
+            debug('results page 1 %O', results['test keyword']['1'].results);
+            debug('results page 2 %O', results['test keyword']['2'].results);
+            assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
+            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
+            assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
+            assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
+        });
+    });
+
+});
\ No newline at end of file
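All of these suites await server start-up by promisifying the Node callback-style listen methods with Bluebird before any test runs. A minimal standalone sketch of that same pattern, under the assumption of a throwaway HTTP server (the server and handler here are illustrative, not from the diff):

'use strict';
const http = require('http');
const Promise = require('bluebird');

async function startServer(port) {
    const server = http.createServer((req, res) => res.end('ok'));
    // listen(port, cb) follows the Node callback convention, so Bluebird can
    // promisify it; `context` keeps `this` bound to the server instance.
    await Promise.promisify(server.listen, { context: server })(port);
    return server;
}

// Port 0 asks the OS for any free port.
startServer(0).then(server => {
    console.log('listening on port', server.address().port);
    server.close();
});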
diff --git a/test/modules/google.js b/test/modules/google.js
new file mode 100644
index 0000000..83c2ae3
--- /dev/null
+++ b/test/modules/google.js
@@ -0,0 +1,123 @@
+'use strict';
+const express = require('express');
+const puppeteer = require('puppeteer');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const { GoogleScraper } = require('../../src/modules/google');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Module Google', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here we mount our fake engine on both an HTTP and an HTTPS server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        proxy.close();
+        httpsServer.close();
+        httpServer.close();
+    });
+
+    let browser;
+    let page;
+    beforeEach(async function(){
+        debug('Start a new browser');
+        browser = await puppeteer.launch({
+            //dumpio: true,
+            //headless: false,
+            ignoreHTTPSErrors: true,
+            args: [ '--proxy-server=http://localhost:' + proxyPort ]
+        });
+        debug('Open a fresh page');
+        page = await browser.newPage();
+    });
+
+    afterEach(async function(){
+        await browser.close();
+    });
+
+    const testLogger = createLogger({
+        transports: [
+            new transports.Console({
+                level: 'error'
+            })
+        ]
+    });
+
+    it('one keyword one page', function(){
+        const googleScraper = new GoogleScraper({
+            config: {
+                search_engine_name: 'google',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+            }
+        });
+        googleScraper.STANDARD_TIMEOUT = 500;
+        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 1, 'Must do one request');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
+        });
+    });
+
+    it('one keyword 3 pages', function () {
+        const googleScraper = new GoogleScraper({
+            config: {
+                search_engine_name: 'google',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger: testLogger,
+                scrape_from_file: '',
+                num_pages: 3,
+            }
+        });
+        googleScraper.STANDARD_TIMEOUT = 500;
+        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 3, 'Must do three requests');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
+            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
+            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
+            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
+            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
+            assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
+        });
+    });
+
+});
\ No newline at end of file
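As the assertions in these three suites show, every scraper module resolves with results keyed first by keyword and then by page number, each page holding a results array of parsed SERP entries. A hypothetical helper that walks that structure; the field names are taken from the assertions above, the helper itself is not part of the diff:

'use strict';

// Flatten the { keyword -> { pageNumber -> { results: [...] } } } shape
// into a list of "keyword pN: title" strings.
function listTitles(results) {
    const titles = [];
    for (const keyword of Object.keys(results)) {
        for (const pageNumber of Object.keys(results[keyword])) {
            for (const entry of results[keyword][pageNumber].results) {
                titles.push(keyword + ' p' + pageNumber + ': ' + entry.title);
            }
        }
    }
    return titles;
}

// Sample mirroring the shape exercised by the tests.
const sample = {
    'test keyword': {
        '1': { results: [{ title: 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative' }] },
    },
};
console.log(listTitles(sample));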
diff --git a/test/proxy.js b/test/proxy.js
new file mode 100644
index 0000000..c1092ea
--- /dev/null
+++ b/test/proxy.js
@@ -0,0 +1,161 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('../');
+const Scraper = require('../src/modules/se_scraper');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.set('trust proxy', 'loopback');
+fakeSearchEngine.get('/test-proxy', (req, res) => {
+    debug('fake-search-engine req.hostname=%s', req.hostname);
+    //debug('req to', req.socket.localAddress, req.socket.localPort);
+    res.send(req.hostname);
+});
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here we mount our fake engine on both an HTTP and an HTTPS server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('proxies', function(){
+
+        class MockScraperTestProxy extends Scraper {
+
+            async load_start_page(){
+                return true;
+            }
+
+            async search_keyword(){
+                await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
+            }
+
+            async parse_async(){
+                const bodyHandle = await this.page.$('body');
+                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
+            }
+        }
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Jobs will be executed two at a time: one through the proxy and one over the direct connection.
+         * THIS TEST NEEDS test.local mapped to 127.0.0.1 in /etc/hosts, because Chrome bypasses the proxy for localhost even when a proxy is set.
+         */
+        it('one proxy given, use_proxies_only=false', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestProxy,
+                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                proxies: ['http://localhost:' + proxyPort],
+                // default is use_proxies_only: false,
+                logger: testLogger,
+            });
+            await scraper.start();
+
+            const { results } = await scraper.scrape(scrape_job);
+            assert.strictEqual(results['news']['1'], 'test.local');
+            assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['i work too much']['1'], 'test.local');
+            assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['javascript is hard']['1'], 'test.local');
+
+            await scraper.quit();
+        });
+
+        /**
+         * Jobs will be executed one by one through the proxy
+         */
+        it('one proxy given, use_proxies_only=true', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestProxy,
+                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                logger: testLogger,
+            });
+            await scraper.start();
+
+            const { results } = await scraper.scrape(scrape_job);
+            assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
+            assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
+
+            await scraper.quit();
+        });
+
+        it('zero proxy given, use_proxies_only=true', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestProxy,
+                keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
+            };
+
+            await assert.rejects(async () => {
+                var scraper = new se_scraper.ScrapeManager({
+                    throw_on_detection: true,
+                    use_proxies_only: true,
+                    logger: testLogger,
+                });
+                await scraper.start();
+                const { results } = await scraper.scrape(scrape_job);
+                await scraper.quit();
+            }, /Must provide at least one proxy in proxies if you enable use_proxies_only/);
+
+        });
+
+    });
+
+});
\ No newline at end of file
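The proxy suite above only passes when test.local resolves to 127.0.0.1 (see the comment in the first test), since Chrome never routes plain localhost traffic through a configured proxy. A small preflight check one could run before mocha, using only the Node dns core module; the script itself is an assumption, not part of this change:

'use strict';
const dns = require('dns');

// Verify the /etc/hosts entry the proxy tests depend on.
dns.lookup('test.local', (err, address) => {
    if (err || address !== '127.0.0.1') {
        console.error('Add "127.0.0.1 test.local" to /etc/hosts before running test/proxy.js');
        process.exit(1);
    }
    console.log('test.local resolves to', address);
});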
diff --git a/test/static_tests/README.md b/test/static_tests/README.md deleted file mode 100644 index 140c6d2..0000000 --- a/test/static_tests/README.md +++ /dev/null @@ -1,15 +0,0 @@ -## Test with static HTML - -Dynamic testing of se-scraper takes too much time. - -Save some html and initialize se-scraper by loading the search from disk. - -### Disadvantage - -static html gets outdated after some time - -### Advantages - -1. Let's us test corner cases that are missed easily -2. Testing is not reliable, since search engines do not always return the same results for the same query -3. As said, much faster \ No newline at end of file diff --git a/test/static_tests/bing.js b/test/static_tests/bing.js deleted file mode 100644 index ae0b127..0000000 --- a/test/static_tests/bing.js +++ /dev/null @@ -1,222 +0,0 @@ -const se_scraper = require('./../../index.js'); -const chai = require('chai'); -chai.use(require('chai-string')); -const assert = chai.assert; -const path = require('path'); - -async function bing_ads() { - let config = { - compress: false, - debug_level: 1, - headless: true, - }; - - let scrape_config = { - search_engine: 'bing', - keywords: ['kaffeemaschine kaufen'], - num_pages: 1, - scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'), - }; - - var scraper = new se_scraper.ScrapeManager(config); - - await scraper.start(); - - bing_search_with_ads( await scraper.scrape(scrape_config) ); - - scrape_config.keywords = ['best cloud services']; - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html'); - - bing_search_with_ads2( await scraper.scrape(scrape_config) ); - - scrape_config.keywords = ['car tires cheap']; - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html'); - - bing_search_with_ads3( await scraper.scrape(scrape_config) ); - - scrape_config.keywords = ['service auto garage']; - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html'); - - bing_search_with_ads4( await scraper.scrape(scrape_config) ); - - await scraper.quit(); -} - -// we test with a callback function to our handler -function bing_search_with_ads(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '1’100’000', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); - assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads'); - - assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - - -function
bing_search_with_ads2(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '44’300’000', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); - assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads'); - - assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - -function bing_search_with_ads3(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '65.500.000 Results', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); - assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - -function bing_search_with_ads4(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects'); - assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - - -function confirm_results_ok(obj) { - - for (let res of obj.results) { - assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); - - assert.isOk(res.link, 'link must be ok'); - assert.typeOf(res.link, 'string', 'link must be string'); - 
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.visible_link, 'visible_link must be ok'); - assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); - assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); - - assert.isOk(res.title, 'title must be ok'); - assert.typeOf(res.title, 'string', 'title must be string'); - assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); - - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); - - assert.isNumber(res.rank, 'rank must be integer'); - } - - for (let res of obj.ads) { - - assert.isOk(res.tracking_link, 'link must be ok'); - assert.typeOf(res.tracking_link, 'string', 'link must be string'); - assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.visible_link, 'link must be ok'); - assert.typeOf(res.visible_link, 'string', 'link must be string'); - assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.title, 'title must be ok'); - assert.typeOf(res.title, 'string', 'title must be string'); - assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); - - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); - } - - for (let res of obj.right_side_ads) { - - assert.isOk(res.tracking_link, 'link must be ok'); - assert.typeOf(res.tracking_link, 'string', 'link must be string'); - assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.visible_link, 'link must be ok'); - assert.typeOf(res.visible_link, 'string', 'link must be string'); - assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.title, 'title must be ok'); - assert.typeOf(res.title, 'string', 'title must be string'); - assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); - - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); - } -} - -describe('Bing', function(){ - this.timeout(15000); - it('static bing searches with ads', bing_ads); -}); \ No newline at end of file diff --git a/test/static_tests/clean_html_test.js b/test/static_tests/clean_html_test.js deleted file mode 100644 index 6bbe4dc..0000000 --- a/test/static_tests/clean_html_test.js +++ /dev/null @@ -1,173 +0,0 @@ -const se_scraper = require('./../../index.js'); -const chai = require('chai'); -chai.use(require('chai-string')); -const assert = chai.assert; -const path = require('path'); -const cheerio = require('cheerio'); - - -async function test_html_output() { - let config = { - debug_level: 1, - headless: true, - html_output: true, - // whether to strip JS and CSS from the html_output - // has only an effect if `html_output` is true - clean_html_output: true, - // remove all data images from the html - clean_data_images: true, - // test compression - compress: false, - }; - - let scrape_config = { - search_engine: 'bing', - keywords: ['kaffeemaschine kaufen'], - num_pages: 1, - scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'), - }; - - var 
scraper = new se_scraper.ScrapeManager(config); - - await scraper.start(); - - var response = await scraper.scrape(scrape_config); - - scrape_config.clean_html_output = false; - scrape_config.clean_data_images = false; - - var response_no_cleaned = await scraper.scrape(scrape_config); - - test(response, response_no_cleaned, 'bing'); - - scrape_config.search_engine = 'google'; - scrape_config.keywords = ['rückspiegel schwarz']; - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html'); - scrape_config.clean_html_output = true; - scrape_config.clean_data_images = true; - - var responseGoogle = await scraper.scrape(scrape_config); - - scrape_config.clean_html_output = false; - scrape_config.clean_data_images = false; - - var response_no_cleanedGoogle = await scraper.scrape(scrape_config); - - test(responseGoogle, response_no_cleanedGoogle, 'google'); - - - scrape_config.keywords = ['cloud services']; - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html'); - scrape_config.clean_html_output = true; - scrape_config.clean_data_images = true; - - var responseGoogle = await scraper.scrape(scrape_config); - - scrape_config.clean_html_output = false; - scrape_config.clean_data_images = false; - - var response_no_cleanedGoogle = await scraper.scrape(scrape_config); - - test(responseGoogle, response_no_cleanedGoogle, 'google'); - - await scraper.quit(); -} - -function test(response, response_no_cleaned, se='google') { - for (let query in response.results) { - for (let page_number in response.results[query]) { - let obj = response.results[query][page_number]; - let obj_no_cleaned = response_no_cleaned.results[query][page_number]; - - console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length); - console.log('html length of cleaned SERP: ' + obj.html.length); - - assert.isOk(obj.html, 'Html must be ok!'); - assert.isAtLeast(obj.html.length, 100, 'html must be a length string'); - - assert.isOk(obj_no_cleaned.html, 'Html must be ok!'); - assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a length string'); - - assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller'); - - // test that we can parse the html of both the cleaned and no cleaned versions - // with cheerio and that serp results are roughly the same - - const cleaned$ = cheerio.load(obj.html); - const no_cleaned$ = cheerio.load(obj_no_cleaned.html); - - var resCleaned = parseResults(cleaned$, se); - var resNoCleaned = parseResults(no_cleaned$, se); - - assert.equal(resCleaned.length, resNoCleaned.length); - assert.equal(resCleaned.length, obj.results.length); - assert.equal(resNoCleaned.length, obj.results.length); - - // unset the rank - resCleaned = resCleaned.map((el) => el.rank = undefined); - resNoCleaned = resNoCleaned.map((el) => el.rank = undefined); - obj.results = obj.results.map((el) => el.rank = undefined); - - assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned'); - assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results'); - assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results'); - } - } -} - - -function parseResults(s$, se) { - - var results = []; - - if (se === 'google') { - s$('#center_col .g').each((i, link) => { - results.push({ - link: s$(link).find('.r a').attr('href'), - title: s$(link).find('.r a').text(), - snippet: 
s$(link).find('span.st').text(), - visible_link: s$(link).find('.r cite').text(), - date: s$(link).find('span.f').text() || '', - }) - }); - - } else if (se === 'bing') { - s$('#b_content #b_results .b_algo').each((i, link) => { - results.push({ - link: s$(link).find('h2 a').attr('href'), - title: s$(link).find('h2').text(), - snippet: s$(link).find('.b_caption p').text(), - visible_link: s$(link).find('cite').text(), - }) - }); - } else { - throw "no such search engine"; - } - - results = clean_results(results, ['title', 'link', 'snippet']); - return results; -} - -function clean_results(results, attributes) { - const cleaned = []; - var rank = 1; - for (var res of results) { - let goodboy = true; - for (var attr of attributes) { - if (!res[attr] || !res[attr].trim()) { - goodboy = false; - break; - } - } - if (goodboy) { - res.rank = rank++; - cleaned.push(res); - } - } - return cleaned; -} - -describe('html output', function(){ - this.timeout(15000); - it('static html output test', test_html_output); -}); \ No newline at end of file diff --git a/test/static_tests/compression.js b/test/static_tests/compression.js deleted file mode 100644 index a41dba8..0000000 --- a/test/static_tests/compression.js +++ /dev/null @@ -1,24 +0,0 @@ -'use strict'; -const zlib = require('zlib'); -const fs = require('fs'); -const path = require('path'); - -var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html']; - -for (var file of files) { - var html = fs.readFileSync(path.resolve(__dirname, './html/' + file)); - - var compressed = zlib.gzipSync(html); - var deflated = zlib.deflateSync(html); - - var compressed_encoded = compressed.toString('base64'); - var deflated_encoded = deflated.toString('base64'); - - console.log(file) - console.log('Normal length: ' + html.length/1000); - console.log('GZIP Compressed length: ' + compressed.length/1000); - console.log('Deflate Compressed length: ' + deflated.length/1000); - console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length/1000); - console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length/1000); - console.log('------\n') -} diff --git a/test/static_tests/duckduckgo.js b/test/static_tests/duckduckgo.js deleted file mode 100644 index f0f0834..0000000 --- a/test/static_tests/duckduckgo.js +++ /dev/null @@ -1,99 +0,0 @@ -const se_scraper = require('./../../index.js'); -const chai = require('chai'); -chai.use(require('chai-string')); -const assert = chai.assert; -const path = require('path'); - -async function duckduckgo() { - let config = { - compress: false, - debug_level: 1, - headless: true, - }; - - let scrape_config = { - search_engine: 'duckduckgo', - keywords: ['cloud service'], - num_pages: 1, - scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'), - }; - - var scraper = new se_scraper.ScrapeManager(config); - - await scraper.start(); - - duckduckgo_normal( await scraper.scrape(scrape_config) ); - - await scraper.quit(); -} - -function duckduckgo_normal(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); - assert.isAtLeast(obj.ads.length, 2, 'ads must 
have at least 2 SERP objects'); - - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - -function confirm_results_ok(obj) { - - for (let res of obj.results) { - assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); - - assert.isOk(res.link, 'link must be ok'); - assert.typeOf(res.link, 'string', 'link must be string'); - assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.visible_link, 'visible_link must be ok'); - assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); - assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); - - assert.isOk(res.title, 'title must be ok'); - assert.typeOf(res.title, 'string', 'title must be string'); - assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); - - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); - - assert.isNumber(res.rank, 'rank must be integer'); - } - - for (let res of obj.ads) { - - assert.isOk(res.tracking_link, 'link must be ok'); - assert.typeOf(res.tracking_link, 'string', 'link must be string'); - assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.visible_link, 'link must be ok'); - assert.typeOf(res.visible_link, 'string', 'link must be string'); - assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); - - assert.isOk(res.title, 'title must be ok'); - assert.typeOf(res.title, 'string', 'title must be string'); - assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); - - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); - } -} - -describe('Duckduckgo', function(){ - this.timeout(10000); - it('static duckduckgo sarch', duckduckgo); -}); \ No newline at end of file diff --git a/test/static_tests/google.js b/test/static_tests/google.js deleted file mode 100644 index fd9e154..0000000 --- a/test/static_tests/google.js +++ /dev/null @@ -1,410 +0,0 @@ -const se_scraper = require('./../../index.js'); -const chai = require('chai'); -chai.use(require('chai-string')); -const assert = chai.assert; -const path = require('path'); - -async function normal_search_test() { - let config = { - compress: false, - debug_level: 1, - headless: true, - }; - - let scrape_config = { - search_engine: 'google', - keywords: ['rückspiegel schwarz'], - num_pages: 1, - scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'), - }; - - var scraper = new se_scraper.ScrapeManager(config); - - await scraper.start(); - - google_search_with_products( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html'); - scrape_config.keywords = ['autoreifen mercedes c-klasse']; - - google_search_with_products2( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html'); - scrape_config.keywords = ['kaffeemaschine kaufen']; - - google_places( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html'); - 
scrape_config.keywords = ['MODEL MARKET SW18 4ES']; - - right_side_info_text( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html'); - scrape_config.keywords = ['BRANDON MOTORS HP13 6NR']; - - right_side_info_text2( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html'); - scrape_config.keywords = ['car tires for sale']; - - google_places_and_ads( await scraper.scrape(scrape_config) ); - - scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html'); - scrape_config.keywords = ['bmw felgen']; - - google_ads2( await scraper.scrape(scrape_config) ); - - await scraper.quit(); -} - -// we test with a callback function to our handler -function google_search_with_products(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '1’780’000', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects'); - assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads'); - assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads'); - assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products'); - assert.equal(obj.right_products.length, 0, 'there are 0 right products'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - - } - } -} - - -function google_search_with_products2(response) { - assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects'); - assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads'); - assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads'); - assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products'); - assert.equal(obj.right_products.length, 4, 'there are 4 right products'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - - } - } -} - -function google_places(response) { - 
assert.equal(response.metadata.num_requests, 1); - - for (let query in response.results) { - - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); - assert.isAtLeast(obj.results.length, 10, 'results must have at least 9 SERP objects'); - assert.equal(obj.top_ads.length, 0, 'there are no top ads'); - assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads'); - assert.equal(obj.top_products.length, 0, 'there are 0 top products'); - assert.equal(obj.right_products.length, 0, 'there are 0 right products'); - assert.equal(obj.places.length, 3, 'there are 3 places'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - -function right_side_info_text(response) { - assert.equal(response.metadata.num_requests, 1); - for (let query in response.results) { - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '6 Ergebnisse', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', - 'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object'); - - assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects'); - - assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data'); - assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data'); - - assert.equal(obj.no_results, false, 'no results should be false'); - assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); - assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); - assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); - - confirm_results_ok(obj); - } - } -} - -function right_side_info_text2(response) { - assert.equal(response.metadata.num_requests, 1); - for (let query in response.results) { - for (let page_number in response.results[query]) { - - assert.isNumber(parseInt(page_number), 'page_number must be numeric'); - - let obj = response.results[query][page_number]; - - assert.include(obj.num_results, '5 Ergebnisse', 'num results not included'); - assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', - 'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object'); - - assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects'); - assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data'); - assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data'); - - assert.equal(obj.no_results, false, 'no results should be false'); - 
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-function google_places_and_ads(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
-            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
-            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
-            assert.isAtLeast(obj.top_products.length, 13, 'there are at least 13 top products');
-            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
-            assert.equal(obj.places.length, 2, 'there are 2 places');
-
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-
-function google_ads2(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
-            assert.equal(obj.top_ads.length, 3, 'there are 3 top ads');
-            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
-            assert.equal(obj.top_products.length, 0, 'there are 0 top products');
-            assert.equal(obj.right_products.length, 9, 'there are 9 right products');
-            assert.equal(obj.places.length, 0, 'there are 0 places');
-
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-
-function confirm_results_ok(obj) {
-
-    for (let res of obj.results) {
-        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.isNumber(res.rank, 'rank must be integer');
-    }
-
-    for (let res of obj.top_ads) {
-
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.typeOf(res.links, 'array', 'links must be array');
-    }
-
-    for (let res of obj.bottom_ads) {
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.typeOf(res.links, 'array', 'links must be array');
-    }
-
-    for (let res of obj.top_products) {
-
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.price, 'price must be ok');
-        assert.typeOf(res.price, 'string', 'price must be string');
-        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.vendor_link, 'vendor_link must be ok');
-        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
-    }
-
-    for (let res of obj.right_products) {
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.price, 'price must be ok');
-        assert.typeOf(res.price, 'string', 'price must be string');
-        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.vendor_link, 'vendor_link must be ok');
-        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-        assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 8 chars');
-    }
-
-    for (let res of obj.places) {
-        assert.isOk(res.heading, 'heading must be ok');
-        assert.typeOf(res.heading, 'string', 'heading must be string');
-        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
-
-        assert.isOk(res.rating, 'rating must be ok');
-        assert.typeOf(res.rating, 'string', 'rating must be string');
-        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
-
-        assert.isOk(res.contact, 'contact must be ok');
-        assert.typeOf(res.contact, 'string', 'contact must be string');
-        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
-
-        assert.typeOf(res.hours, 'string', 'hours must be string');
-        if (res.hours) {
-            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
-        }
-    }
-}
-
-describe('Google', function() {
-    this.timeout(25000);
-    it('static google searches with products, ads and places', normal_search_test);
-});
\ No newline at end of file
diff --git a/test/static_tests/second_google.js b/test/static_tests/second_google.js
deleted file mode 100644
index 9fd95b8..0000000
--- a/test/static_tests/second_google.js
+++ /dev/null
@@ -1,213 +0,0 @@
-const se_scraper = require('./../../index.js');
-const chai = require('chai');
-chai.use(require('chai-string'));
-const assert = chai.assert;
-const path = require('path');
-
-async function normal_search_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: ['in.linkedin.com/in/altanai'],
-        num_pages: 1,
-        scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
-    };
-
-    var scraper = new se_scraper.ScrapeManager(config);
-
-    await scraper.start();
-
-    google_test_title( await scraper.scrape(scrape_config) );
-
-    await scraper.quit();
-}
-
-// validate the response object returned by the scraper
-function google_test_title(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.include(obj.num_results, '7.600', 'num results not included');
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
-            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
-            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
-            assert.equal(obj.top_products.length, 0, 'there are 0 top products');
-            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
-
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-
-            assert.equal( obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn' );
-            assert.equal( obj.results[1].title, 'ALTANAI BISHT | LinkedIn' );
-            assert.equal( obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn' );
-            assert.equal( obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn' );
-            assert.equal( obj.results[4].title, 'ALTANAI BISHT | LinkedIn' );
-            assert.equal( obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');
-
-            assert.equal(obj.results[0].date, '27.07.2016');
-            assert.equal( obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');
-
-            assert.equal(obj.results[2].date, '27.07.2016');
-        }
-    }
-}
-
-function confirm_results_ok(obj) {
-
-    for (let res of obj.results) {
-        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.isNumber(res.rank, 'rank must be integer');
-    }
-
-    for (let res of obj.top_ads) {
-
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.typeOf(res.links, 'array', 'links must be array');
-    }
-
-    for (let res of obj.bottom_ads) {
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.typeOf(res.links, 'array', 'links must be array');
-    }
-
-    for (let res of obj.top_products) {
-
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.price, 'price must be ok');
-        assert.typeOf(res.price, 'string', 'price must be string');
-        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.vendor_link, 'vendor_link must be ok');
-        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
-    }
-
-    for (let res of obj.right_products) {
-        assert.isOk(res.tracking_link, 'tracking_link must be ok');
-        assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-        assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.price, 'price must be ok');
-        assert.typeOf(res.price, 'string', 'price must be string');
-        assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.vendor_link, 'vendor_link must be ok');
-        assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-        assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
-    }
-
-    for (let res of obj.places) {
-        assert.isOk(res.heading, 'heading must be ok');
-        assert.typeOf(res.heading, 'string', 'heading must be string');
-        assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
-
-        assert.isOk(res.rating, 'rating must be ok');
-        assert.typeOf(res.rating, 'string', 'rating must be string');
-        assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
-
-        assert.isOk(res.contact, 'contact must be ok');
-        assert.typeOf(res.contact, 'string', 'contact must be string');
-        assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
-
-        assert.typeOf(res.hours, 'string', 'hours must be string');
-        if (res.hours) {
-            assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
-        }
-    }
-}
-
-describe('Google2', function(){
-    this.timeout(10000);
-    it('static google searches testing various details', normal_search_test);
-});
\ No newline at end of file
diff --git a/test/static_tests/yandex.js b/test/static_tests/yandex.js
deleted file mode 100644
index 9cb806c..0000000
--- a/test/static_tests/yandex.js
+++ /dev/null
@@ -1,152 +0,0 @@
-const se_scraper = require('./../../index.js');
-const chai = require('chai');
-chai.use(require('chai-string'));
-const assert = chai.assert;
-const path = require('path');
-
-async function yandex_ads() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-    };
-
-    let scrape_config = {
-        search_engine: 'yandex',
-        keywords: ['cloud service'],
-        num_pages: 1,
-        scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
-    };
-
-    var scraper = new se_scraper.ScrapeManager(config);
-
-    await scraper.start();
-
-    yandex_search_with_ads( await scraper.scrape(scrape_config) );
-
-    scrape_config.keywords = ['car tires cheap'];
-    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');
-
-    yandex_search_with_ads2( await scraper.scrape(scrape_config) );
-
-    scrape_config.keywords = ['купить деревянные окна'];
-    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');
-
-    yandex_search_with_ads3( await scraper.scrape(scrape_config) );
-
-    await scraper.quit();
-}
-
-// validate the response object returned by the scraper
-function yandex_search_with_ads(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.include(obj.num_results, '2 million results', 'num results not included');
-            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');
-
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-
-function yandex_search_with_ads2(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
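-            // response.results is keyed by keyword and then by page number, e.g.
-            // response.results['car tires cheap']['1'] is the page object checked below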
-            let obj = response.results[query][page_number];
-
-            assert.include(obj.num_results, '5 million results', 'num results not included');
-            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');
-
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-
-function yandex_search_with_ads3(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            // console.dir(obj.results, {depth: null, colors: true});
-
-            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
-            assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            // at least 4 ads
-            let cnt = 0;
-            obj.results.forEach((res) => {
-                if (res.is_ad) {
-                    cnt++;
-                }
-            });
-
-            assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');
-
-            confirm_results_ok(obj);
-        }
-    }
-}
-
-
-function confirm_results_ok(obj) {
-
-    for (let res of obj.results) {
-        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
-
-        assert.isOk(res.link, 'link must be ok');
-        assert.typeOf(res.link, 'string', 'link must be string');
-        assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-        assert.isOk(res.visible_link, 'visible_link must be ok');
-        assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-        assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-        assert.isOk(res.title, 'title must be ok');
-        assert.typeOf(res.title, 'string', 'title must be string');
-        assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-        assert.isOk(res.snippet, 'snippet must be ok');
-        assert.typeOf(res.snippet, 'string', 'snippet must be string');
-        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-        assert.isNumber(res.rank, 'rank must be integer');
-    }
-}
-
-describe('Yandex', function(){
-    this.timeout(10000);
-    it('static yandex searches with ads', yandex_ads);
-});
\ No newline at end of file
diff --git a/test/test_amazon.js b/test/test_amazon.js
deleted file mode 100644
index aa38e82..0000000
--- a/test/test_amazon.js
+++ /dev/null
@@ -1,141 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const assert = require('chai').assert;
-
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const normal_search_keywords = ['iphone', 'clock'];
-
-async function normal_search_test() {
-    let config = {
-        compress: false,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'amazon',
-        num_pages: 1,
-        keywords: normal_search_keywords,
-    };
-
-    console.log('normal_search_test()');
-    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-        let total_rank = 1;
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                assert.isOk(res.seller, 'seller must be ok');
-                assert.typeOf(res.seller, 'string', 'seller must be string');
-                assert.isAtLeast(res.seller.length, 5, 'seller must have at least 5 chars');
-
-                assert.isOk(res.stars, 'stars must be ok');
-                assert.typeOf(res.stars, 'string', 'stars must be string');
-                assert.isAtLeast(res.stars.length, 5, 'stars must have at least 5 chars');
-                assert.include(res.stars, ' out of ', 'stars must include " out of "');
-
-                assert.isOk(res.num_reviews, 'num_reviews must be ok');
-                assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
-                assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 char');
-
-                assert.isOk(res.price, 'price must be ok');
-                assert.typeOf(res.price, 'string', 'price must be string');
-                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];
-
-async function no_results_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'amazon',
-        num_pages: 1,
-        keywords: keywords_no_results,
-    };
-
-    console.log('no_results_test()');
-    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_no_results(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert(obj.results.length === 0, 'results must have 0 SERP objects');
-            assert.equal(obj.no_results, true, 'no results should be true');
-            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-describe('Amazon', function(){
-    this.timeout(30000);
-    it('normal search test', normal_search_test);
-    it('no results test', no_results_test);
});
\ No newline at end of file
diff --git a/test/test_baidu.js b/test/test_baidu.js
deleted file mode 100644
index 887d7d1..0000000
--- a/test/test_baidu.js
+++ /dev/null
@@ -1,87 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const assert = require('chai').assert;
-
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const normal_search_keywords = ['mouse', 'cat'];
-
-async function normal_search_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-    };
-
-    let scrape_config = {
-        search_engine: 'baidu',
-        keywords: normal_search_keywords,
-        num_pages: 2,
-    };
-
-    console.log('normal_search_test()');
-    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 4);
-
-    for (let query in response.results) {
-        let total_rank = 1;
-
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-
-            assert.equal(obj.no_results, false, 'no results should be false');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-describe('Baidu', function(){
-    this.timeout(30000);
-    it('normal search test', normal_search_test);
-});
\ No newline at end of file
diff --git a/test/test_bing.js b/test/test_bing.js
deleted file mode 100644
index f62c56a..0000000
--- a/test/test_bing.js
+++ /dev/null
@@ -1,271 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const chai = require('chai');
-chai.use(require('chai-string'));
-const assert = chai.assert;
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const normal_search_keywords = ['apple tree', 'weather tomorrow'];
-
-async function normal_search_test() {
-    let config = {
-        search_engine: 'bing',
-        compress: false,
-        debug_level: 1,
-        keywords: normal_search_keywords,
-        keyword_file: '',
-        num_pages: 3,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'bing',
-        keywords: normal_search_keywords,
-        num_pages: 3,
-    };
-
-    console.log('normal_search_test()');
-    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 6);
-
-    for (let query in response.results) {
-        let total_rank = 1;
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                if (res.snippet) {
-                    assert.isOk(res.snippet, 'snippet must be ok');
-                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-                }
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];
-
-async function no_results_test() {
-    let config = {
-        search_engine: 'bing',
-        compress: false,
-        debug_level: 1,
-        keywords: keywords_no_results,
-        keyword_file: '',
-        num_pages: 1,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'bing',
-        keywords: keywords_no_results,
-        num_pages: 1,
-    };
-
-    console.log('no_results_test()');
-    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_no_results(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert(obj.results.length === 0, 'results must have 0 SERP objects');
-            assert.equal(obj.no_results, true, 'no results should be true');
-            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-const effective_query_keywords = ['mount everrest'];
-
-async function effective_query_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'bing',
-        keywords: effective_query_keywords,
-        num_pages: 1,
-    };
-
-    console.log('effective_query_test()');
-    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_effective_query(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            // effective query must be different to the original keyword
-            assert.isOk(obj.effective_query, 'effective query must be ok');
-            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
-            assert(obj.effective_query !== query, 'effective query must be different from keyword');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-
-const ads_keywords = ['cloud services', 'buy shoes'];
-
-async function ads_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: false,
-        random_user_agent: true,
-    };
-
-    let scrape_config = {
-        search_engine: 'bing',
-        keywords: ads_keywords,
-        num_pages: 1,
-    };
-
-    console.log('ads_test()');
-    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
-}
-
-function test_case_ads_test(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
-
-            for (let res of obj.ads) {
-
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                // assert.isOk(res.link, 'link must be ok');
-                // assert.typeOf(res.link, 'string', 'link must be string');
-                // assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-            }
-        }
-    }
-}
-
-describe('Bing', function(){
-    this.timeout(30000);
-    it('normal search', normal_search_test);
-    it('no results', no_results_test);
-    it('effective query', effective_query_test);
-    it('finds ads', ads_test);
-});
diff --git a/test/test_duckduckgo.js b/test/test_duckduckgo.js
deleted file mode 100644
index 39efe6e..0000000
--- a/test/test_duckduckgo.js
+++ /dev/null
@@ -1,192 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const chai = require('chai');
-chai.use(require('chai-string'));
-const assert = chai.assert;
-
-const normal_search_keywords = ['apple tree', 'weather tomorrow'];
-
-async function normal_search_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: false,
-        random_user_agent: true,
-    };
-
-    let scrape_config = {
-        search_engine: 'duckduckgo',
-        keywords: normal_search_keywords,
-        num_pages: 2,
-    };
-
-    console.log('normal_search_test()');
-    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-        let total_rank = 1;
-
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-const effective_query_keywords = ['mount everrest'];
-
-async function effective_query_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: true,
-        random_user_agent: true,
-    };
-
-    let scrape_config = {
-        search_engine: 'duckduckgo',
-        keywords: effective_query_keywords,
-        num_pages: 1,
-    };
-
-    console.log('test_case_effective_query()');
-    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_effective_query(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
-
-            // effective query must be different to the original keyword
-            assert.isOk(obj.effective_query, 'effective query must be ok');
-            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
-            assert(obj.effective_query !== query, 'effective query must be different from keyword');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-const ads_keywords = ['cloud services', 'buy shoes'];
-
-async function ads_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: false,
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'duckduckgo',
-        keywords: ads_keywords,
-        num_pages: 1,
-    };
-
-    console.log('ads_test()');
-    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
-}
-
-function test_case_ads_test(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
-
-            for (let res of obj.ads) {
-
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-            }
-        }
-    }
-}
-
-
-describe('Duckduckgo', function(){
-    this.timeout(30000);
-    it('normal search', normal_search_test);
-    it('effective query', effective_query_test);
-    it('finds ads', ads_test);
-});
\ No newline at end of file
diff --git a/test/test_google.js b/test/test_google.js
deleted file mode 100644
index c21ef4b..0000000
--- a/test/test_google.js
+++ /dev/null
@@ -1,424 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const chai = require('chai');
-chai.use(require('chai-string'));
-const assert = chai.assert;
-
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const normal_search_keywords = ['apple tree', 'weather tomorrow'];
-
-async function normal_search_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: normal_search_keywords,
-        num_pages: 3,
-    };
-
-    console.log('normal_search_test()');
-    normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 6);
-
-    for (let query in response.results) {
-        let total_rank = 1;
-
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
-
-async function no_results_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: keywords_no_results,
-        num_pages: 1,
-    };
-
-    console.log('no_results_test()');
-    test_case_no_results( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_no_results(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
-            assert.equal(obj.no_results, true, 'no results should be true');
-            assert.isEmpty(obj.num_results, 'num_results should be an empty string');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-const effective_query_keywords = ['mount evverrest'];
-
-async function effective_query_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: effective_query_keywords,
-        num_pages: 1,
-    };
-
-    console.log('effective_query_test()');
-    test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function test_case_effective_query(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-            // effective query must be different to the original keyword
-            assert.isOk(obj.effective_query, 'effective query must be ok');
-            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
-            assert(obj.effective_query !== query, 'effective query must be different from keyword');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-        }
-    }
-}
-
-async function html_output_query_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        keyword_file: '',
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: normal_search_keywords,
-        num_pages: 3,
-        html_output: true,
-    };
-
-    let output = await se_scraper.scrape(config, scrape_config);
-    normal_search_test_case( output );
-    check_html_output_test_case( output );
-}
-
-function check_html_output_test_case( response ) {
-    for (let query in response.html_output) {
-
-        assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.html_output[query]) {
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-            assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html>');
-        }
-    }
-}
-
-const ads_keywords = ['cloud services', 'buy shoes'];
-
-async function ads_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: false,
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: ads_keywords,
-        num_pages: 1,
-    };
-
-    console.log('ads_test()');
-    test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
-}
-
-function test_case_ads_test(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
-
-            assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');
-
-            for (let res of obj.top_ads) {
-
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.typeOf(res.links, 'array', 'links must be array');
-            }
-
-            for (let res of obj.bottom_ads) {
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                assert.isOk(res.visible_link, 'visible_link must be ok');
-                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.typeOf(res.links, 'array', 'links must be array');
-            }
-
-        }
-    }
-}
-
-
-
-const product_keywords = ['autoreifen bmw'];
-
-async function products_test() {
-    let config = {
-        compress: false,
-        debug_level: 1,
-        headless: true,
-        block_assets: false,
-        random_user_agent: false, // don't try to trick google with ads
-    };
-
-    let scrape_config = {
-        search_engine: 'google',
-        keywords: product_keywords,
-        num_pages: 1,
-    };
-
-    console.log('products_test()');
-    test_case_products_test( await se_scraper.scrape(config, scrape_config) );
-}
-
-function test_case_products_test(response) {
-    assert.equal(response.metadata.num_requests, 1);
-
-    for (let query in response.results) {
-
-        assert.containsAllKeys(response.results, product_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');
-
-            for (let res of obj.top_products) {
-
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.price, 'price must be ok');
-                assert.typeOf(res.price, 'string', 'price must be string');
-                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.vendor_link, 'vendor_link must be ok');
-                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
-            }
-
-            for (let res of obj.right_products) {
-                assert.isOk(res.tracking_link, 'tracking_link must be ok');
-                assert.typeOf(res.tracking_link, 'string', 'tracking_link must be string');
-                assert.isAtLeast(res.tracking_link.length, 5, 'tracking_link must have at least 5 chars');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.price, 'price must be ok');
-                assert.typeOf(res.price, 'string', 'price must be string');
-                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
-
-                assert.isOk(res.title, 'title must be ok');
-                assert.typeOf(res.title, 'string', 'title must be string');
-                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                assert.isOk(res.vendor_link, 'vendor_link must be ok');
-                assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
-                assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
-            }
-
-        }
-    }
-}
-
-describe('Google', function(){
-    this.timeout(30000);
-    it('normal search', normal_search_test);
-    it('no results', no_results_test);
-    it('effective query', effective_query_test);
-    it('html output query', html_output_query_test);
-    it('ads', ads_test);
-    it('products test', products_test);
-});
diff --git a/test/test_googleimage.js b/test/test_googleimage.js
deleted file mode 100644
index 31f00e9..0000000
--- a/test/test_googleimage.js
+++ /dev/null
@@ -1,80 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const assert = require('chai').assert;
-
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const normal_search_keywords = ['apple', 'rain'];
-
-async function normal_image_search_test() {
-    let config = {
-        compress: false,
-        debug_level: 0,
-        headless: true,
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    let scrape_config = {
-        search_engine: 'google_image',
-        keywords: normal_search_keywords,
-        num_pages: 2,
-    };
-
-    console.log('normal_image_search_test()');
-    normal_image_search_test_case( await se_scraper.scrape(config, scrape_config) );
-}
-
-// validate the response object returned by the scraper
-function normal_image_search_test_case(response) {
-    assert.equal(response.metadata.num_requests, 2);
-
-    for (let query in response.results) {
-
-        let total_rank = 1;
-
-        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-        for (let page_number in response.results[query]) {
-
-            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-            let obj = response.results[query][page_number];
-
-            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');
-
-            assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
-            assert.equal(obj.no_results, false, 'no results should be false');
-            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-            for (let res of obj.results) {
-
-                assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');
-
-                assert.isOk(res.link, 'link must be ok');
-                assert.typeOf(res.link, 'string', 'link must be string');
-                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                assert.isOk(res.clean_link, 'clean_link must be ok');
-                assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
-                assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');
-
-                assert.isOk(res.snippet, 'snippet must be ok');
-                assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                assert.isNumber(res.rank, 'rank must be integer');
-                assert.equal(res.rank, total_rank++, 'rank is wrong');
-            }
-        }
-    }
-}
-
-describe('Google Image', function(){
-    this.timeout(30000);
-    it('normal image search test', normal_image_search_test);
-});
\ No newline at end of file
diff --git a/test/test_queryargs_google.js b/test/test_queryargs_google.js
deleted file mode 100644
index d4b51db..0000000
--- a/test/test_queryargs_google.js
+++ /dev/null
@@ -1,91 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const assert = require('chai').assert;
-
-const normal_search_keywords = ['apple juice'];
-
-async function queryargs_search_test() {
-    let config = {
-        search_engine: 'google',
-        compress: false,
-        debug: true,
-        verbose: true,
-        keywords: normal_search_keywords,
-        keyword_file: '',
-        num_pages: 2,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        // use specific search engine parameters for various search engines
-        google_settings: {
-            google_domain: 'google.com',
-            gl: 'fr', // The gl parameter determines the Google country to use for the query.
-            hl: 'fr', // The hl parameter determines the Google UI language to return results.
-            start: 30, // Determines the results offset to use, defaults to 0.
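-            // taken together, these settings should translate into Google query args
-            // roughly like ?gl=fr&hl=fr&start=30&num=100 (assumption: se-scraper passes
-            // them through as URL parameters; not verified against the implementation)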
-            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
-        },
-    };
-
-    console.log('queryargs_search_test()');
-    await se_scraper.scrape(config, queryargs_search_test_case);
-}
-
-// we test with a callback function to our handler
-function queryargs_search_test_case(err, response) {
-
-    if (err) {
-        console.error(err);
-    } else {
-        assert.equal(response.metadata.num_requests, 2);
-
-        for (let query in response.results) {
-            let total_rank = 1;
-
-            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
-
-            for (let page_number in response.results[query]) {
-
-                assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-                let obj = response.results[query][page_number];
-
-                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
-
-                assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
-                assert.equal(obj.no_results, false, 'no results should be false');
-                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
-                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
-                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-                for (let res of obj.results) {
-
-                    assert.containsAllKeys(res, ['link', 'title', 'snippet', 'rank', 'visible_link'], 'not all keys are in the SERP object');
-
-                    assert.isOk(res.link, 'link must be ok');
-                    assert.typeOf(res.link, 'string', 'link must be string');
-                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                    assert.isOk(res.visible_link, 'visible_link must be ok');
-                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
-                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
-
-                    assert.isOk(res.title, 'title must be ok');
-                    assert.typeOf(res.title, 'string', 'title must be string');
-                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
-
-                    assert.isOk(res.snippet, 'snippet must be ok');
-                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                    assert.isNumber(res.rank, 'rank must be a number');
-                    assert.equal(res.rank, total_rank++, 'rank is wrong');
-                }
-            }
-        }
-    }
-}
-
-describe('Google with query arguments', function(){
-    this.timeout(30000);
-    it('query args search test', queryargs_search_test);
-});
diff --git a/test/test_ticker_search.js b/test/test_ticker_search.js
deleted file mode 100644
index f8a379f..0000000
--- a/test/test_ticker_search.js
+++ /dev/null
@@ -1,217 +0,0 @@
-'use strict';
-const se_scraper = require('./../index.js');
-const assert = require('chai').assert;
-
-/*
- * Use chai and mocha for tests.
- * https://mochajs.org/#installation
- */
-
-const quote_search_keywords = ['MSFT', 'AAPL'];
-
-async function reuters_search_test() {
-    let config = {
-        search_engine: 'reuters',
-        compress: false,
-        debug: false,
-        verbose: false,
-        keywords: quote_search_keywords,
-        keyword_file: '',
-        num_pages: 1,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    console.log('reuters_search_test()');
-    await se_scraper.scrape(config, reuters_search_test_case);
-}
-
-// we test with a callback function to our handler
-function reuters_search_test_case(err, response) {
-
-    if (err) {
-        console.error(err);
-    } else {
-
-        for (let query in response.results) {
-            let total_rank = 1;
-            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
-
-            for (let page_number in response.results[query]) {
-
-                assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-                let obj = response.results[query][page_number];
-
-                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
-
-                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-                for (let res of obj.results) {
-
-                    assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');
-
-                    assert.isOk(res.link, 'link must be ok');
-                    assert.typeOf(res.link, 'string', 'link must be string');
-                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                    assert.isOk(res.title, 'title must be ok');
-                    assert.typeOf(res.title, 'string', 'title must be string');
-                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                    assert.isOk(res.snippet, 'snippet must be ok');
-                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
-                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
-
-                    assert.isOk(res.date, 'date must be ok');
-                    assert.typeOf(res.date, 'string', 'date must be string');
-                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
-                }
-            }
-        }
-    }
-}
-
-async function cnbc_search_test() {
-    let config = {
-        search_engine: 'cnbc',
-        compress: false,
-        debug: false,
-        verbose: false,
-        keywords: quote_search_keywords,
-        keyword_file: '',
-        num_pages: 1,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    console.log('cnbc_search_test()');
-    await se_scraper.scrape(config, cnbc_search_test_case);
-}
-
-// we test with a callback function to our handler
-function cnbc_search_test_case(err, response) {
-
-    if (err) {
-        console.error(err);
-    } else {
-
-        for (let query in response.results) {
-            let total_rank = 1;
-            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
-
-            for (let page_number in response.results[query]) {
-
-                assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-                let obj = response.results[query][page_number];
-
-                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
-
-                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-                for (let res of obj.results) {
-
-                    assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');
-
-                    assert.isOk(res.link, 'link must be ok');
-                    assert.typeOf(res.link, 'string', 'link must be string');
-                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                    assert.isOk(res.title, 'title must be ok');
-                    assert.typeOf(res.title, 'string', 'title must be string');
-                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                    assert.isOk(res.date, 'date must be ok');
-                    assert.typeOf(res.date, 'string', 'date must be string');
-                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
-                }
-            }
-        }
-    }
-}
-
-const marketwatch_search_keywords = ['MSFT'];
-
-async function marketwatch_search_test() {
-    let config = {
-        search_engine: 'marketwatch',
-        compress: false,
-        debug: false,
-        verbose: false,
-        keywords: marketwatch_search_keywords,
-        keyword_file: '',
-        num_pages: 1,
-        headless: true,
-        output_file: '',
-        block_assets: true,
-        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
-        random_user_agent: false,
-    };
-
-    console.log('marketwatch_search_test()');
-    await se_scraper.scrape(config, marketwatch_search_test_case);
-}
-
-// we test with a callback function to our handler
-function marketwatch_search_test_case(err, response) {
-
-    if (err) {
-        console.error(err);
-    } else {
-
-        for (let query in response.results) {
-            let total_rank = 1;
-            assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');
-
-            for (let page_number in response.results[query]) {
-
-                assert.isNumber(parseInt(page_number), 'page_number must be numeric');
-
-                let obj = response.results[query][page_number];
-
-                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
-
-                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
-                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
-
-                for (let res of obj.results) {
-
-                    assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');
-
-                    assert.isOk(res.link, 'link must be ok');
-                    assert.typeOf(res.link, 'string', 'link must be string');
-                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
-
-                    assert.isOk(res.title, 'title must be ok');
-                    assert.typeOf(res.title, 'string', 'title must be string');
-                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
-
-                    assert.isOk(res.author, 'author must be ok');
-                    assert.typeOf(res.author, 'string', 'author must be string');
-                    assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');
-
-                    assert.isOk(res.date, 'date must be ok');
-                    assert.typeOf(res.date, 'string', 'date must be string');
-                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
-                }
-            }
-        }
-    }
-}
-
-
-describe('Ticker', function(){
-    this.timeout(30000);
-    it('Reuters search test', reuters_search_test);
-    it('CNBC search test', cnbc_search_test);
-    it('Marketwatch search test', marketwatch_search_test);
-});
\ No newline at end of file
diff --git a/test/user_agent.js b/test/user_agent.js
new file mode 100644
index 0000000..b4ddc68
--- /dev/null
+++ b/test/user_agent.js
@@ -0,0 +1,144 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+const UAParser = require('ua-parser-js');
+const _ = require('lodash');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('../');
+const Scraper = require('../src/modules/se_scraper');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+// Fake search engine that simply echoes the User-Agent header it receives
+const fakeSearchEngine = express();
+fakeSearchEngine.set('trust proxy', 'loopback');
+fakeSearchEngine.get('/test-user_agent', (req, res) => {
+    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
+    res.send(req.headers['user-agent']);
+});
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Mount our fake search engine on both an HTTP and an HTTPS server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        // Rewrite every proxied request so it lands on the local fake engine
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        // Promisify the callback-style listen() calls so startup can be awaited
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('user_agent', function(){
+
+        class MockScraperTestUserAgent extends Scraper {
+
+            async load_start_page(){
+                return true;
+            }
+
+            async search_keyword(){
+                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
+            }
+
+            async parse_async(){
+                // The fake engine echoes the User-Agent header back as the page body
+                const bodyHandle = await this.page.$('body');
+                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
+            }
+        }
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test user_agent option
+         */
+        it('fixed user_agent', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestUserAgent,
+                keywords: ['javascript is hard'],
+            };
+
+            const scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                logger: testLogger,
+                user_agent: 'THIS IS A USERAGENT 42.0'
+            });
+            await scraper.start();
+
+            const { results } = await scraper.scrape(scrape_job);
+            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');
+
+            await scraper.quit();
+        });
+
+        /**
+         * Test random_user_agent option
+         * TODO the generated user_agent should be different for each keyword
+         * TODO this test will sometimes fail because the user_agent is not very random :-(
+         */
+        it('random_user_agent', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestUserAgent,
+                keywords: ['news'],
+            };
+
+            const NUMBER_OF_EXEC = 10;
+
+            // Run the scraper several times and collect the user agent seen by the fake engine
+            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
+                const scraper = new se_scraper.ScrapeManager({
+                    throw_on_detection: true,
+                    logger: testLogger,
+                    random_user_agent: true,
+                });
+                await scraper.start();
+                const { results: { news } } = await scraper.scrape(scrape_job);
+                await scraper.quit();
+                return news['1'];
+            });
+
+            uaList.forEach((userAgent) => {
+                const uaParsed = UAParser(userAgent);
+                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
+                assert(uaParsed.os.name, 'UserAgent should have an OS name detected');
+            });
+
+            // The most frequent user agent must account for less than 40% of all runs
+            assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );
+
+        });
+
+    });
+
+});
\ No newline at end of file
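
A minimal standalone sketch of the diversity check at the end of random_user_agent above; the uaList samples here are hypothetical, only the lodash chain comes from the test in the diff:

'use strict';
const _ = require('lodash');

const NUMBER_OF_EXEC = 10;
// Hypothetical user agents collected over ten runs; 'UA-1' and 'UA-2' repeat once each.
const uaList = [
    'UA-1', 'UA-2', 'UA-1', 'UA-3', 'UA-4',
    'UA-5', 'UA-6', 'UA-2', 'UA-7', 'UA-8',
];

// countBy() -> { 'UA-1': 2, 'UA-2': 2, 'UA-3': 1, ... }
// toPairs() + sortBy(count) orders pairs ascending, so last() is the most frequent UA.
const mostFrequent = _.chain(uaList)
    .countBy()
    .toPairs()
    .sortBy(e => e[1])
    .last()
    .value();

console.log(mostFrequent);                                // e.g. [ 'UA-1', 2 ]
console.log(mostFrequent[1] < NUMBER_OF_EXEC * 0.4);      // 2 < 4 -> true, the assertion passes

The 40% threshold is deliberately loose: it tolerates repeated user agents from a small randomization pool while still failing if one agent dominates the runs.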