diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e836084 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,69 @@ +FROM node:10-slim + +# Application parameters and variables +# ENV NODE_ENV=production +ENV HOST=0.0.0.0 +ENV PORT=3000 +ENV application_directory=/se-scraper +ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster + +# Create app directory +WORKDIR $application_directory + +RUN apt-get update && \ +apt-get install -y \ +gconf-service \ +libasound2 \ +libatk1.0-0 \ +libc6 \ +libcairo2 \ +libcups2 \ +libdbus-1-3 \ +libexpat1 \ +libfontconfig1 \ +libgcc1 \ +libgconf-2-4 \ +libgdk-pixbuf2.0-0 \ +libglib2.0-0 \ +libgtk-3-0 \ +libnspr4 \ +libpango-1.0-0 \ +libpangocairo-1.0-0 \ +libstdc++6 \ +libx11-6 \ +libx11-xcb1 \ +libxcb1 \ +libxcomposite1 \ +libxcursor1 \ +libxdamage1 \ +libxext6 \ +libxfixes3 \ +libxi6 \ +libxrandr2 \ +libxrender1 \ +libxss1 \ +libxtst6 \ +ca-certificates \ +fonts-liberation \ +libappindicator1 \ +libnss3 \ +lsb-release \ +xdg-utils \ +wget + +# Bundle app source +COPY . . +WORKDIR $puppeteer_cluster_directory +RUN npm install \ + && npm run build + +WORKDIR $application_directory +# skip installing scripts for puppeteer dependencies +# we've already installed puppeteer above. +RUN npm install --ignore-scripts + +# Cleanup +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +EXPOSE $PORT +CMD [ "node", "server/server.js" ] \ No newline at end of file diff --git a/README.md b/README.md index 67d05c9..5764130 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,33 @@ If you **don't** want puppeteer to download a complete chromium browser, add thi export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 ``` +### Docker Image + +I will maintain a public docker image of se-scraper. Pull the docker image with the command: + +```bash +docker pull tschachn/se-scraper +``` + +When the image is running, you may start scrape jobs via an HTTP API: + +```bash +curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ +-d '{ + "browser_config": { + "random_user_agent": true + }, + "scrape_config": { + "search_engine": "google", + "keywords": ["test"], + "num_pages": 1 + } +}' +``` + +Many thanks goes to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image. + + ## Minimal Example Create a file named `minimal.js` with the following contents diff --git a/package-lock.json b/package-lock.json index 04c38af..f70beac 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.4.0", + "version": "1.4.5", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -22,6 +22,15 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz", "integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ==" }, + "accepts": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", + "integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==", + "requires": { + "mime-types": "~2.1.24", + "negotiator": "0.6.2" + } + }, "agent-base": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz", @@ -65,6 +74,11 @@ "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", "integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=" }, + "array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" + }, "assertion-error": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", @@ -86,6 +100,38 @@ "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==" }, + "body-parser": { + "version": "1.19.0", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz", + "integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==", + "requires": { + "bytes": "3.1.0", + "content-type": "~1.0.4", + "debug": "2.6.9", + "depd": "~1.1.2", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "on-finished": "~2.3.0", + "qs": "6.7.0", + "raw-body": "2.4.0", + "type-is": "~1.6.17" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, "boolbase": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", @@ -111,6 +157,11 @@ "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" }, + "bytes": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", + "integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==" + }, "cacheable-request": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz", @@ -256,7 +307,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "requires": { "buffer-from": "^1.0.0", @@ -267,7 +318,7 @@ "dependencies": { "readable-stream": { "version": "2.3.6", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", "requires": { "core-util-is": "~1.0.0", @@ -281,7 +332,7 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "requires": { "safe-buffer": "~5.1.0" @@ -289,6 +340,29 @@ } } }, + "content-disposition": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz", + "integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==", + "requires": { + "safe-buffer": "5.1.2" + } + }, + "content-type": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", + "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==" + }, + "cookie": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz", + "integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg==" + }, + "cookie-signature": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", + "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=" + }, "core-util-is": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", @@ -373,6 +447,16 @@ "object-keys": "^1.0.12" } }, + "depd": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", + "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=" + }, + "destroy": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", + "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" + }, "diff": { "version": "3.5.0", "resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz", @@ -436,12 +520,22 @@ "resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz", "integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI=" }, + "ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=" + }, "emoji-regex": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", "dev": true }, + "encodeurl": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", + "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=" + }, "end-of-stream": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", @@ -487,12 +581,17 @@ }, "es6-promisify": { "version": "5.0.0", - "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "requires": { "es6-promise": "^4.0.3" } }, + "escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=" + }, "escape-string-regexp": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", @@ -505,6 +604,11 @@ "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", "dev": true }, + "etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=" + }, "execa": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", @@ -520,6 +624,58 @@ "strip-eof": "^1.0.0" } }, + "express": { + "version": "4.17.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz", + "integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==", + "requires": { + "accepts": "~1.3.7", + "array-flatten": "1.1.1", + "body-parser": "1.19.0", + "content-disposition": "0.5.3", + "content-type": "~1.0.4", + "cookie": "0.4.0", + "cookie-signature": "1.0.6", + "debug": "2.6.9", + "depd": "~1.1.2", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "~1.1.2", + "fresh": "0.5.2", + "merge-descriptors": "1.0.1", + "methods": "~1.1.2", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "path-to-regexp": "0.1.7", + "proxy-addr": "~2.0.5", + "qs": "6.7.0", + "range-parser": "~1.2.1", + "safe-buffer": "5.1.2", + "send": "0.17.1", + "serve-static": "1.14.1", + "setprototypeof": "1.1.1", + "statuses": "~1.5.0", + "type-is": "~1.6.18", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, "extract-zip": { "version": "1.6.7", "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz", @@ -554,6 +710,35 @@ "pend": "~1.2.0" } }, + "finalhandler": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz", + "integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==", + "requires": { + "debug": "2.6.9", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "statuses": "~1.5.0", + "unpipe": "~1.0.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, "find-up": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", @@ -585,6 +770,16 @@ "for-in": "^1.0.1" } }, + "forwarded": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", + "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=" + }, + "fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=" + }, "fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -698,6 +893,18 @@ "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz", "integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew==" }, + "http-errors": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", + "integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==", + "requires": { + "depd": "~1.1.2", + "inherits": "2.0.3", + "setprototypeof": "1.1.1", + "statuses": ">= 1.5.0 < 2", + "toidentifier": "1.0.0" + } + }, "https-proxy-agent": { "version": "2.2.2", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz", @@ -717,6 +924,14 @@ } } }, + "iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "requires": { + "safer-buffer": ">= 2.1.2 < 3" + } + }, "inflight": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", @@ -737,6 +952,11 @@ "integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==", "dev": true }, + "ipaddr.js": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.0.tgz", + "integrity": "sha512-M4Sjn6N/+O6/IXSJseKqHoFc+5FdGJ22sXqnjTpdZweHK64MzEPAyQZyEU3R/KRv2GLoa7nNtg/C2Ev6m7z+eA==" + }, "is-buffer": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", @@ -909,6 +1129,11 @@ "p-defer": "^1.0.0" } }, + "media-typer": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" + }, "mem": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/mem/-/mem-4.3.0.tgz", @@ -930,11 +1155,34 @@ "kind-of": "^3.0.2" } }, + "merge-descriptors": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", + "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=" + }, + "methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=" + }, "mime": { "version": "2.4.4", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz", "integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA==" }, + "mime-db": { + "version": "1.40.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz", + "integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA==" + }, + "mime-types": { + "version": "2.1.24", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz", + "integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==", + "requires": { + "mime-db": "1.40.0" + } + }, "mimic-fn": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", @@ -1044,6 +1292,11 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" }, + "negotiator": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", + "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==" + }, "nice-try": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz", @@ -1116,6 +1369,14 @@ "es-abstract": "^1.5.1" } }, + "on-finished": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", + "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", + "requires": { + "ee-first": "1.1.1" + } + }, "once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -1190,6 +1451,11 @@ "@types/node": "*" } }, + "parseurl": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==" + }, "path-exists": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", @@ -1207,6 +1473,11 @@ "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=", "dev": true }, + "path-to-regexp": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", + "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" + }, "pathval": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz", @@ -1263,6 +1534,15 @@ "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" }, + "proxy-addr": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.5.tgz", + "integrity": "sha512-t/7RxHXPH6cJtP0pRG6smSr9QJidhB+3kXu0KgXnbGYMgzEnUxRQ4/LDdfOwZEMyIh3/xHb8PX3t+lfL9z+YVQ==", + "requires": { + "forwarded": "~0.1.2", + "ipaddr.js": "1.9.0" + } + }, "proxy-chain": { "version": "0.2.7", "resolved": "https://registry.npmjs.org/proxy-chain/-/proxy-chain-0.2.7.tgz", @@ -1387,6 +1667,27 @@ } } }, + "qs": { + "version": "6.7.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz", + "integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ==" + }, + "range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==" + }, + "raw-body": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz", + "integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==", + "requires": { + "bytes": "3.1.0", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "unpipe": "1.0.0" + } + }, "readable-stream": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz", @@ -1430,18 +1731,81 @@ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, "semver": { "version": "5.7.0", "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz", "integrity": "sha512-Ya52jSX2u7QKghxeoFGpLwCtGlt7j0oY9DYb5apt9nPlJ42ID+ulTXESnt/qAQcoSERyZ5sl3LDIOw0nAn/5DA==", "dev": true }, + "send": { + "version": "0.17.1", + "resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz", + "integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==", + "requires": { + "debug": "2.6.9", + "depd": "~1.1.2", + "destroy": "~1.0.4", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "~1.7.2", + "mime": "1.6.0", + "ms": "2.1.1", + "on-finished": "~2.3.0", + "range-parser": "~1.2.1", + "statuses": "~1.5.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + }, + "dependencies": { + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, + "mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==" + } + } + }, + "serve-static": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz", + "integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==", + "requires": { + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "parseurl": "~1.3.3", + "send": "0.17.1" + } + }, "set-blocking": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", "dev": true }, + "setprototypeof": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz", + "integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==" + }, "shallow-clone": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", @@ -1500,6 +1864,11 @@ "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", "dev": true }, + "statuses": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", + "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=" + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -1553,12 +1922,26 @@ "resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz", "integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q==" }, + "toidentifier": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", + "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==" + }, "type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==", "dev": true }, + "type-is": { + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "requires": { + "media-typer": "0.3.0", + "mime-types": "~2.1.24" + } + }, "typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", @@ -1577,6 +1960,11 @@ "underscore": "*" } }, + "unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=" + }, "url-parse-lax": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz", @@ -1599,6 +1987,16 @@ "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" }, + "utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=" + }, + "vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=" + }, "which": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", diff --git a/package.json b/package.json index 8833e1f..21eba41 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "dependencies": { "cheerio": "^1.0.0-rc.2", "debug": "^4.1.1", + "express": "^4.17.1", "got": "^9.6.0", "lodash": "^4.17.14", "proxy-chain": "^0.2.7", diff --git a/server/server.js b/server/server.js new file mode 100644 index 0000000..aece6e6 --- /dev/null +++ b/server/server.js @@ -0,0 +1,69 @@ +/** + + Test server with: + +curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ +-d '{ + "browser_config": { + "random_user_agent": true + }, + "scrape_config": { + "search_engine": "google", + "keywords": ["test"], + "num_pages": 1 + } +}' + +*/ + +const se_scraper = require('../index.js'); +'use strict'; +const express = require('express'); + +// Constants +const PORT = process.env.PORT || 3000; +const HOST = process.env.HOST || '0.0.0.0'; + +// App +const app = express(); +app.use(express.json()); + +let browser_config = { + random_user_agent: true, + headless : true, + debug_level: 1, + sleep_range: '', + puppeteer_cluster_config: { + timeout: 30 * 60 * 1000, // max timeout set to 30 minutes + monitor: false, + concurrency: 1, // one scraper per tab + maxConcurrency: 1, // scrape with 5 tabs + } +}; + +app.post('/', async (req, res) => { + if (!req.body.browser_config || !req.body.scrape_config) { + res.json({ + 'status': 400, + 'msg': 'please specify browser_config and scrape_config' + }); + } else { + // overwrite standard browser config + Object.assign(browser_config, req.body.browser_config); + + var scraper = new se_scraper.ScrapeManager(browser_config); + await scraper.start(); + + var results = await scraper.scrape(req.body.scrape_config); + + // console.dir(results, {depth: null, colors: true}); + + res.send(results); + + await scraper.quit(); + } +}); + +app.listen(PORT, HOST); + +console.log(`Running on http://${HOST}:${PORT}`); diff --git a/src/node_scraper.js b/src/node_scraper.js index 27f0a68..20ed8c4 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -88,7 +88,7 @@ class ScrapeManager { log_http_headers: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. - sleep_range: '', + sleep_range: undefined, // which search engine to scrape search_engine: 'google', search_engine_name: 'google', @@ -126,7 +126,7 @@ class ScrapeManager { // this module should export the functions: // get_browser, handle_metadata, close_browser //custom_func: resolve('examples/pluggable.js'), - custom_func: '', + custom_func: undefined, throw_on_detection: false, // use a proxy for all connections // example: 'socks5://78.94.172.42:1080' @@ -236,7 +236,7 @@ class ScrapeManager { } if (this.config.random_user_agent) { - const userAgent = new UserAgent(); + const userAgent = new UserAgent({ deviceCategory: 'desktop' }); user_agent = userAgent.toString(); }