diff --git a/README.md b/README.md index faf7ed5..77bd4c1 100644 --- a/README.md +++ b/README.md @@ -317,25 +317,49 @@ const se_scraper = require('se-scraper'); // and cannot give to se-scraper on scrape() calls let browser_config = { // the user agent to scrape with - user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, - // whether to start the browser in headless mode - headless: true, + // whether to select manual settings in visible mode + set_manual_settings: false, + // log ip address data + log_ip_address: false, + // log http headers + log_http_headers: false, + // how long to sleep between requests. a random sleep interval within the range [a,b] + // is drawn before every request. empty string for no sleeping. + sleep_range: '', + // which search engine to scrape + search_engine: 'google', + compress: false, // compress // whether debug information should be printed // level 0: print nothing // level 1: print most important info // ... 
// level 4: print all shit nobody wants to know debug_level: 1, + keywords: ['nodejs rocks',], + // whether to start the browser in headless mode + headless: true, // specify flags passed to chrome here chrome_flags: [], + // the number of pages to scrape for each keyword + num_pages: 1, + // path to output file, data will be stored in JSON + output_file: '', + // whether to also passthru all the html output of the serp pages + html_output: false, + // whether to return a screenshot of serp pages as b64 data + screen_output: false, + // whether to prevent images, css, fonts and media from being loaded + // will speed up scraping a great deal + block_assets: true, // path to js module that extends functionality // this module should export the functions: // get_browser, handle_metadata, close_browser - // must be an absolute path to the module //custom_func: resolve('examples/pluggable.js'), custom_func: '', + throw_on_detection: false, // use a proxy for all connections // example: 'socks5://78.94.172.42:1080' // example: 'http://118.174.233.10:48400' @@ -344,11 +368,20 @@ let browser_config = { // socks5://78.94.172.42:1080 // http://118.174.233.10:48400 proxy_file: '', + // whether to use proxies only + // when this is set to true, se-scraper will not use + // your default IP address + use_proxies_only: false, + // check if headless chrome escapes common detection techniques + // this is a quick test and should be used for debugging + test_evasion: false, + apply_evasion_techniques: true, + // settings for puppeteer-cluster puppeteer_cluster_config: { - timeout: 10 * 60 * 1000, // max timeout set to 10 minutes + timeout: 30 * 60 * 1000, // max timeout set to 30 minutes monitor: false, - concurrency: 1, // one scraper per tab - maxConcurrency: 1, // scrape with 2 tabs + concurrency: Cluster.CONCURRENCY_BROWSER, + maxConcurrency: 1, } }; diff --git a/TODO.md b/TODO.md index 8484f81..2ad0c81 100644 --- a/TODO.md +++ b/TODO.md @@ -49,18 +49,19 @@ - remove unnecessary 
sleep() calls and replace with waitFor selectors - ### 16.7.2019 - resolve issues - fix this https://github.com/NikolaiT/se-scraper/issues/37 [done] -- use puppeteer stealth plugin +- use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth -- user random user agents plugin: https://github.com/intoli/user-agents + - we will need to look at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now] + +- use random user agents plugin: https://github.com/intoli/user-agents [done] - add screenshot capability (make the screen after parsing) - - store as b64 + - store as b64 [done] ### TODO: 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] diff --git a/examples/quickstart.js b/examples/quickstart.js index 6cb2015..4af0595 100644 --- a/examples/quickstart.js +++ b/examples/quickstart.js @@ -6,6 +6,8 @@ const se_scraper = require('./../src/node_scraper.js'); test_evasion: false, log_http_headers: true, random_user_agent: true, + apply_evasion_techniques: true, + screen_output: true, }; let scrape_job = { diff --git a/headless-evasion-result.png b/headless-evasion-result.png new file mode 100644 index 0000000..060099d Binary files /dev/null and b/headless-evasion-result.png differ diff --git a/package-lock.json b/package-lock.json index 4bd154f..04c38af 100644 --- a/package-lock.json +++ b/package-lock.json @@ -60,6 +60,11 @@ "sprintf-js": "~1.0.2" } }, + "arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=" + }, "assertion-error": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", @@ -198,6 +203,18 @@ "wrap-ansi": "^2.0.0" } }, + "clone-deep": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz", + "integrity": 
"sha1-TnPdCen7lxzDhnDF3O2cGJZIHMY=", + "requires": { + "for-own": "^0.1.3", + "is-plain-object": "^2.0.1", + "kind-of": "^3.0.2", + "lazy-cache": "^1.0.3", + "shallow-clone": "^0.1.2" + } + }, "clone-response": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz", @@ -239,7 +256,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "requires": { "buffer-from": "^1.0.0", @@ -250,7 +267,7 @@ "dependencies": { "readable-stream": { "version": "2.3.6", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", "requires": { "core-util-is": "~1.0.0", @@ -264,7 +281,7 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "requires": { "safe-buffer": "~5.1.0" @@ -337,6 +354,11 @@ "type-detect": "^4.0.0" } }, + "deepmerge": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-2.2.1.tgz", + "integrity": "sha512-R9hc1Xa/NOBi9WRVUWg19rl1UB7Tt4kuPd+thNJgFZoxXsTz7ncaPaeIm+40oSGuP33DfMb4sZt1QIGiJzC4EA==" + }, "defer-to-connect": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz", @@ -465,7 +487,7 @@ }, "es6-promisify": { "version": "5.0.0", - "resolved": 
"http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "requires": { "es6-promise": "^4.0.3" @@ -550,6 +572,19 @@ "is-buffer": "~2.0.3" } }, + "for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA=" + }, + "for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "requires": { + "for-in": "^1.0.1" + } + }, "fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -664,11 +699,11 @@ "integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew==" }, "https-proxy-agent": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz", - "integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==", + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz", + "integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==", "requires": { - "agent-base": "^4.1.0", + "agent-base": "^4.3.0", "debug": "^3.1.0" }, "dependencies": { @@ -720,12 +755,25 @@ "integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=", "dev": true }, + "is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik=" + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, + "is-plain-object": 
{ + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "requires": { + "isobject": "^3.0.1" + } + }, "is-regex": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz", @@ -761,6 +809,11 @@ "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=", "dev": true }, + "isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=" + }, "js-yaml": { "version": "3.13.1", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz", @@ -784,6 +837,26 @@ "json-buffer": "3.0.0" } }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "requires": { + "is-buffer": "^1.1.5" + }, + "dependencies": { + "is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + } + } + }, + "lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha1-odePw6UEdMuAhF07O24dpJpEbo4=" + }, "lcid": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz", @@ -847,6 +920,16 @@ "p-is-promise": "^2.0.0" } }, + "merge-deep": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.2.tgz", + "integrity": "sha512-T7qC8kg4Zoti1cFd8Cr0M+qaZfOwjlPDEdZIIPPB2JZctjaPM4fX+i7HOId69tAti2fvO6X5ldfYUONDODsrkA==", + "requires": { + "arr-union": "^3.1.0", + "clone-deep": "^0.2.4", + "kind-of": "^3.0.2" + } + }, "mime": { "version": "2.4.4", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz", 
@@ -876,6 +959,22 @@ "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" }, + "mixin-object": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz", + "integrity": "sha1-T7lJRB2rGCVA8f4DW6YOGUel5X4=", + "requires": { + "for-in": "^0.1.3", + "is-extendable": "^0.1.1" + }, + "dependencies": { + "for-in": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz", + "integrity": "sha1-2Hc5COMSVhCZUrH9ubP6hn0ndeE=" + } + } + }, "mkdirp": { "version": "0.5.1", "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", @@ -1155,9 +1254,9 @@ "integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc=" }, "process-nextick-args": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", - "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" }, "progress": { "version": "2.0.3", @@ -1189,9 +1288,9 @@ } }, "puppeteer": { - "version": "1.17.0", - "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.17.0.tgz", - "integrity": "sha512-3EXZSximCzxuVKpIHtyec8Wm2dWZn1fc5tQi34qWfiUgubEVYHjUvr0GOJojqf3mifI6oyKnCdrGxaOI+lWReA==", + "version": "1.18.1", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.18.1.tgz", + "integrity": "sha512-luUy0HPSuWPsPZ1wAp6NinE0zgetWtudf5zwZ6dHjMWfYpTQcmKveFRox7VBNhQ98OjNA9PQ9PzQyX8k/KrxTg==", "requires": { "debug": "^4.1.0", "extract-zip": "^1.6.6", @@ -1211,6 +1310,83 @@ "debug": "^4.1.1" } }, + "puppeteer-extra": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz", + 
"integrity": "sha512-RjQp3BkjzkY8JgTcHUsu4RdFcqE3AdTzUquRC9WBUZXRXbSgSXI1CtEmNG7OcxorScXNUTKEbY7Z13xtQVkHnQ==", + "requires": { + "debug": "^3.1.0", + "deepmerge": "^2.1.0" + }, + "dependencies": { + "debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, + "puppeteer-extra-plugin": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.0.4.tgz", + "integrity": "sha512-qgb5pmyNWI64PHtvRG1MjxEL7S3wAouDeqWklpg+/1jnxsbhRJsoi91SEg4U4Ji+rEOn28kTVM9O4KTXJ1PQ1Q==", + "requires": { + "debug": "^3.1.0", + "merge-deep": "^3.0.1" + }, + "dependencies": { + "debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, + "puppeteer-extra-plugin-anonymize-ua": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-anonymize-ua/-/puppeteer-extra-plugin-anonymize-ua-2.1.4.tgz", + "integrity": "sha512-AAxicMHX3AbsBvWgMQEqaIy7vqH+pzzh/HZ4+bdXiZExPm1dTUNq7Kmvog2xfY4bdQO+bQkk+Qtfr1dOQ0KRyg==", + "requires": { + "debug": "^3.1.0", + "puppeteer-extra-plugin": "^3.0.4" + }, + "dependencies": { + "debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, + "puppeteer-extra-plugin-stealth": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.2.2.tgz", + "integrity": 
"sha512-tVf+M0U4VAht/q2vNlf1eGGeCFzK9q0vTHnuCEXHf06mu7ukUe1J060s6kij5r7Fv51UsTcXnMa069oSXvYSOw==", + "requires": { + "debug": "^3.1.0", + "puppeteer-extra-plugin": "^3.0.4", + "puppeteer-extra-plugin-anonymize-ua": "^2.1.4" + }, + "dependencies": { + "debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, "readable-stream": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz", @@ -1266,6 +1442,37 @@ "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", "dev": true }, + "shallow-clone": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", + "integrity": "sha1-WQnodLp3EG1zrEFM/sH/yofZcGA=", + "requires": { + "is-extendable": "^0.1.1", + "kind-of": "^2.0.1", + "lazy-cache": "^0.2.3", + "mixin-object": "^2.0.1" + }, + "dependencies": { + "is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + }, + "kind-of": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz", + "integrity": "sha1-AY7HpM5+OobLkUG+UZ0kyPqpgbU=", + "requires": { + "is-buffer": "^1.0.2" + } + }, + "lazy-cache": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz", + "integrity": "sha1-f+3fLctu23fRHvHRF6tf/fCrG2U=" + } + } + }, "shebang-command": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", diff --git a/package.json b/package.json index 3d4e56d..4dd2bfd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.4.0", + "version": 
"1.4.1", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "homepage": "https://scrapeulous.com/", "main": "index.js", @@ -26,8 +26,10 @@ "got": "^9.6.0", "lodash": "^4.17.14", "proxy-chain": "^0.2.7", - "puppeteer": "^1.17.0", + "puppeteer": "^1.18.1", "puppeteer-cluster": "^0.13.0", + "puppeteer-extra": "^2.1.3", + "puppeteer-extra-plugin-stealth": "^2.2.2", "user-agents": "^1.0.321" }, "devDependencies": { diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index ea23a3d..0f57b66 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -29,11 +29,8 @@ module.exports = class Scraper { this.keywords = config.keywords; this.STANDARD_TIMEOUT = 10000; - // longer timeout when using proxies - this.PROXY_TIMEOUT = 15000; this.SOLVE_CAPTCHA_TIME = 45000; - this.html_output = {}; this.results = {}; this.result_rank = 1; // keep track of the requests done @@ -52,7 +49,6 @@ module.exports = class Scraper { } } } - } async run({page, data}) { @@ -103,12 +99,10 @@ module.exports = class Scraper { if (this.config.test_evasion === true) { // Navigate to the page that will perform the tests. - const testUrl = 'https://intoli.com/blog/' + - 'not-possible-to-block-chrome-headless/chrome-headless-test.html'; + const testUrl = 'https://bot.sannysoft.com'; await this.page.goto(testUrl); - // Save a screenshot of the results. 
- await this.page.screenshot({path: 'headless-test-result.png'}); + await this.page.screenshot({path: 'headless-evasion-result.png'}); } if (this.config.log_http_headers === true) { @@ -196,6 +190,13 @@ module.exports = class Scraper { this.results[keyword][this.page_num].html = html; } + if (this.config.screen_output) { + this.results[keyword][this.page_num].screenshot = await this.page.screenshot({ + encoding: 'base64', + fullPage: false, + }); + } + this.page_num += 1; // only load the next page when we will pass the next iteration diff --git a/src/node_scraper.js b/src/node_scraper.js index 334b33d..8ab5389 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -108,6 +108,8 @@ class ScrapeManager { output_file: '', // whether to also passthru all the html output of the serp pages html_output: false, + // whether to return a screenshot of serp pages as b64 data + screen_output: false, // whether to prevent images, css, fonts and media from being loaded // will speed up scraping a great deal block_assets: true, diff --git a/src/puppeteer-cluster b/src/puppeteer-cluster index 1fcf7f9..03c9a76 160000 --- a/src/puppeteer-cluster +++ b/src/puppeteer-cluster @@ -1 +1 @@ -Subproject commit 1fcf7f9a23c5a9abb491157e505b475f7aea72e5 +Subproject commit 03c9a764298f3f55b46bace810f4d3b2e1cb3266