docker support added

This commit is contained in:
Nikolai Tschacher 2019-08-13 17:35:06 +02:00
parent 19a172c654
commit 98414259fe
6 changed files with 572 additions and 8 deletions

69
Dockerfile Normal file
View File

@ -0,0 +1,69 @@
FROM node:10-slim
# Application parameters and variables
# ENV NODE_ENV=production
ENV HOST=0.0.0.0
ENV PORT=3000
ENV application_directory=/se-scraper
ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster
# Create app directory
WORKDIR $application_directory
RUN apt-get update && \
apt-get install -y \
gconf-service \
libasound2 \
libatk1.0-0 \
libc6 \
libcairo2 \
libcups2 \
libdbus-1-3 \
libexpat1 \
libfontconfig1 \
libgcc1 \
libgconf-2-4 \
libgdk-pixbuf2.0-0 \
libglib2.0-0 \
libgtk-3-0 \
libnspr4 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
libstdc++6 \
libx11-6 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxi6 \
libxrandr2 \
libxrender1 \
libxss1 \
libxtst6 \
ca-certificates \
fonts-liberation \
libappindicator1 \
libnss3 \
lsb-release \
xdg-utils \
wget
# Bundle app source
COPY . .
WORKDIR $puppeteer_cluster_directory
RUN npm install \
&& npm run build
WORKDIR $application_directory
# skip installing scripts for puppeteer dependencies
# we've already installed puppeteer above.
RUN npm install --ignore-scripts
# Cleanup
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
EXPOSE $PORT
CMD [ "node", "server/server.js" ]

View File

@ -80,6 +80,33 @@ If you **don't** want puppeteer to download a complete chromium browser, add thi
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
```
### Docker Image
I will maintain a public docker image of se-scraper. Pull the docker image with the command:
```bash
docker pull tschachn/se-scraper
```
When the image is running, you may start scrape jobs via an HTTP API:
```bash
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
-d '{
"browser_config": {
"random_user_agent": true
},
"scrape_config": {
"search_engine": "google",
"keywords": ["test"],
"num_pages": 1
}
}'
```
Many thanks goes to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image.
## Minimal Example
Create a file named `minimal.js` with the following contents

408
package-lock.json generated
View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.4.0",
"version": "1.4.5",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -22,6 +22,15 @@
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
"integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ=="
},
"accepts": {
"version": "1.3.7",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz",
"integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==",
"requires": {
"mime-types": "~2.1.24",
"negotiator": "0.6.2"
}
},
"agent-base": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz",
@ -65,6 +74,11 @@
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
"integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ="
},
"array-flatten": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
"integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
},
"assertion-error": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
@ -86,6 +100,38 @@
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz",
"integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw=="
},
"body-parser": {
"version": "1.19.0",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz",
"integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==",
"requires": {
"bytes": "3.1.0",
"content-type": "~1.0.4",
"debug": "2.6.9",
"depd": "~1.1.2",
"http-errors": "1.7.2",
"iconv-lite": "0.4.24",
"on-finished": "~2.3.0",
"qs": "6.7.0",
"raw-body": "2.4.0",
"type-is": "~1.6.17"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
@ -111,6 +157,11 @@
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
},
"bytes": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz",
"integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg=="
},
"cacheable-request": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz",
@ -256,7 +307,7 @@
},
"concat-stream": {
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "^1.0.0",
@ -267,7 +318,7 @@
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
@ -281,7 +332,7 @@
},
"string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
@ -289,6 +340,29 @@
}
}
},
"content-disposition": {
"version": "0.5.3",
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz",
"integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==",
"requires": {
"safe-buffer": "5.1.2"
}
},
"content-type": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
},
"cookie": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz",
"integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg=="
},
"cookie-signature": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
"integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
},
"core-util-is": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
@ -373,6 +447,16 @@
"object-keys": "^1.0.12"
}
},
"depd": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
"integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
},
"destroy": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
"integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
},
"diff": {
"version": "3.5.0",
"resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz",
@ -436,12 +520,22 @@
"resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz",
"integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI="
},
"ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
"integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
},
"emoji-regex": {
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz",
"integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
"dev": true
},
"encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
"integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
},
"end-of-stream": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
@ -487,12 +581,17 @@
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "^4.0.3"
}
},
"escape-html": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
"integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
},
"escape-string-regexp": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz",
@ -505,6 +604,11 @@
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
"dev": true
},
"etag": {
"version": "1.8.1",
"resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
"integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
},
"execa": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz",
@ -520,6 +624,58 @@
"strip-eof": "^1.0.0"
}
},
"express": {
"version": "4.17.1",
"resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz",
"integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==",
"requires": {
"accepts": "~1.3.7",
"array-flatten": "1.1.1",
"body-parser": "1.19.0",
"content-disposition": "0.5.3",
"content-type": "~1.0.4",
"cookie": "0.4.0",
"cookie-signature": "1.0.6",
"debug": "2.6.9",
"depd": "~1.1.2",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"etag": "~1.8.1",
"finalhandler": "~1.1.2",
"fresh": "0.5.2",
"merge-descriptors": "1.0.1",
"methods": "~1.1.2",
"on-finished": "~2.3.0",
"parseurl": "~1.3.3",
"path-to-regexp": "0.1.7",
"proxy-addr": "~2.0.5",
"qs": "6.7.0",
"range-parser": "~1.2.1",
"safe-buffer": "5.1.2",
"send": "0.17.1",
"serve-static": "1.14.1",
"setprototypeof": "1.1.1",
"statuses": "~1.5.0",
"type-is": "~1.6.18",
"utils-merge": "1.0.1",
"vary": "~1.1.2"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"extract-zip": {
"version": "1.6.7",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz",
@ -554,6 +710,35 @@
"pend": "~1.2.0"
}
},
"finalhandler": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz",
"integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==",
"requires": {
"debug": "2.6.9",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"on-finished": "~2.3.0",
"parseurl": "~1.3.3",
"statuses": "~1.5.0",
"unpipe": "~1.0.0"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"find-up": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz",
@ -585,6 +770,16 @@
"for-in": "^1.0.1"
}
},
"forwarded": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
"integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
},
"fresh": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
"integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
},
"fs.realpath": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
@ -698,6 +893,18 @@
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz",
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
},
"http-errors": {
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz",
"integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==",
"requires": {
"depd": "~1.1.2",
"inherits": "2.0.3",
"setprototypeof": "1.1.1",
"statuses": ">= 1.5.0 < 2",
"toidentifier": "1.0.0"
}
},
"https-proxy-agent": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
@ -717,6 +924,14 @@
}
}
},
"iconv-lite": {
"version": "0.4.24",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
"integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
},
"inflight": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
@ -737,6 +952,11 @@
"integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==",
"dev": true
},
"ipaddr.js": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.0.tgz",
"integrity": "sha512-M4Sjn6N/+O6/IXSJseKqHoFc+5FdGJ22sXqnjTpdZweHK64MzEPAyQZyEU3R/KRv2GLoa7nNtg/C2Ev6m7z+eA=="
},
"is-buffer": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz",
@ -909,6 +1129,11 @@
"p-defer": "^1.0.0"
}
},
"media-typer": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
"integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
},
"mem": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/mem/-/mem-4.3.0.tgz",
@ -930,11 +1155,34 @@
"kind-of": "^3.0.2"
}
},
"merge-descriptors": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
"integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
},
"methods": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
"integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
},
"mime": {
"version": "2.4.4",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
"integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA=="
},
"mime-db": {
"version": "1.40.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz",
"integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA=="
},
"mime-types": {
"version": "2.1.24",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz",
"integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==",
"requires": {
"mime-db": "1.40.0"
}
},
"mimic-fn": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz",
@ -1044,6 +1292,11 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
},
"negotiator": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
"integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw=="
},
"nice-try": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz",
@ -1116,6 +1369,14 @@
"es-abstract": "^1.5.1"
}
},
"on-finished": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
"integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
"requires": {
"ee-first": "1.1.1"
}
},
"once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
@ -1190,6 +1451,11 @@
"@types/node": "*"
}
},
"parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
"integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="
},
"path-exists": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz",
@ -1207,6 +1473,11 @@
"integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=",
"dev": true
},
"path-to-regexp": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
},
"pathval": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz",
@ -1263,6 +1534,15 @@
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="
},
"proxy-addr": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.5.tgz",
"integrity": "sha512-t/7RxHXPH6cJtP0pRG6smSr9QJidhB+3kXu0KgXnbGYMgzEnUxRQ4/LDdfOwZEMyIh3/xHb8PX3t+lfL9z+YVQ==",
"requires": {
"forwarded": "~0.1.2",
"ipaddr.js": "1.9.0"
}
},
"proxy-chain": {
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/proxy-chain/-/proxy-chain-0.2.7.tgz",
@ -1387,6 +1667,27 @@
}
}
},
"qs": {
"version": "6.7.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz",
"integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ=="
},
"range-parser": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
"integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="
},
"raw-body": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz",
"integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==",
"requires": {
"bytes": "3.1.0",
"http-errors": "1.7.2",
"iconv-lite": "0.4.24",
"unpipe": "1.0.0"
}
},
"readable-stream": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
@ -1430,18 +1731,81 @@
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
},
"semver": {
"version": "5.7.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz",
"integrity": "sha512-Ya52jSX2u7QKghxeoFGpLwCtGlt7j0oY9DYb5apt9nPlJ42ID+ulTXESnt/qAQcoSERyZ5sl3LDIOw0nAn/5DA==",
"dev": true
},
"send": {
"version": "0.17.1",
"resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz",
"integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==",
"requires": {
"debug": "2.6.9",
"depd": "~1.1.2",
"destroy": "~1.0.4",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"etag": "~1.8.1",
"fresh": "0.5.2",
"http-errors": "~1.7.2",
"mime": "1.6.0",
"ms": "2.1.1",
"on-finished": "~2.3.0",
"range-parser": "~1.2.1",
"statuses": "~1.5.0"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
},
"dependencies": {
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"mime": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg=="
}
}
},
"serve-static": {
"version": "1.14.1",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz",
"integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==",
"requires": {
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"parseurl": "~1.3.3",
"send": "0.17.1"
}
},
"set-blocking": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=",
"dev": true
},
"setprototypeof": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz",
"integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw=="
},
"shallow-clone": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
@ -1500,6 +1864,11 @@
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
"dev": true
},
"statuses": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
"integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow="
},
"string-width": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
@ -1553,12 +1922,26 @@
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
},
"toidentifier": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz",
"integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw=="
},
"type-detect": {
"version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
"integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==",
"dev": true
},
"type-is": {
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
"integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==",
"requires": {
"media-typer": "0.3.0",
"mime-types": "~2.1.24"
}
},
"typedarray": {
"version": "0.0.6",
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
@ -1577,6 +1960,11 @@
"underscore": "*"
}
},
"unpipe": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
},
"url-parse-lax": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",
@ -1599,6 +1987,16 @@
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
},
"utils-merge": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
"integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
},
"vary": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
"integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
},
"which": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz",

View File

@ -23,6 +23,7 @@
"dependencies": {
"cheerio": "^1.0.0-rc.2",
"debug": "^4.1.1",
"express": "^4.17.1",
"got": "^9.6.0",
"lodash": "^4.17.14",
"proxy-chain": "^0.2.7",

69
server/server.js Normal file
View File

@ -0,0 +1,69 @@
/**
Test server with:
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
-d '{
"browser_config": {
"random_user_agent": true
},
"scrape_config": {
"search_engine": "google",
"keywords": ["test"],
"num_pages": 1
}
}'
*/
const se_scraper = require('../index.js');
'use strict';
const express = require('express');
// Constants
const PORT = process.env.PORT || 3000;
const HOST = process.env.HOST || '0.0.0.0';
// App
const app = express();
app.use(express.json());
let browser_config = {
random_user_agent: true,
headless : true,
debug_level: 1,
sleep_range: '',
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: 1, // one scraper per tab
maxConcurrency: 1, // scrape with 5 tabs
}
};
app.post('/', async (req, res) => {
if (!req.body.browser_config || !req.body.scrape_config) {
res.json({
'status': 400,
'msg': 'please specify browser_config and scrape_config'
});
} else {
// overwrite standard browser config
Object.assign(browser_config, req.body.browser_config);
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(req.body.scrape_config);
// console.dir(results, {depth: null, colors: true});
res.send(results);
await scraper.quit();
}
});
app.listen(PORT, HOST);
console.log(`Running on http://${HOST}:${PORT}`);

View File

@ -88,7 +88,7 @@ class ScrapeManager {
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
sleep_range: undefined,
// which search engine to scrape
search_engine: 'google',
search_engine_name: 'google',
@ -126,7 +126,7 @@ class ScrapeManager {
// this module should export the functions:
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
custom_func: undefined,
throw_on_detection: false,
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
@ -236,7 +236,7 @@ class ScrapeManager {
}
if (this.config.random_user_agent) {
const userAgent = new UserAgent();
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
user_agent = userAgent.toString();
}