better user agents now, added option to include screenshots as base64 in results

This commit is contained in:
Nikolai Tschacher 2019-07-18 20:19:15 +02:00
parent fcbe66b56b
commit 78fe12390b
9 changed files with 284 additions and 36 deletions

View File

@ -317,25 +317,49 @@ const se_scraper = require('se-scraper');
// and cannot give to se-scraper on scrape() calls // and cannot give to se-scraper on scrape() calls
let browser_config = { let browser_config = {
// the user agent to scrape with // the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to true, a random user agent is chosen // if random_user_agent is set to true, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// whether to start the browser in headless mode // whether to select manual settings in visible mode
headless: true, set_manual_settings: false,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
// which search engine to scrape
search_engine: 'google',
compress: false, // compress
// whether debug information should be printed // whether debug information should be printed
// level 0: print nothing // level 0: print nothing
// level 1: print most important info // level 1: print most important info
// ... // ...
// level 4: print all shit nobody wants to know // level 4: print all shit nobody wants to know
debug_level: 1, debug_level: 1,
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,
// specify flags passed to chrome here // specify flags passed to chrome here
chrome_flags: [], chrome_flags: [],
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to also passthru all the html output of the serp pages
html_output: false,
// whether to return a screenshot of serp pages as b64 data
screen_output: false,
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
// path to js module that extends functionality // path to js module that extends functionality
// this module should export the functions: // this module should export the functions:
// get_browser, handle_metadata, close_browser // get_browser, handle_metadata, close_browser
// must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'), //custom_func: resolve('examples/pluggable.js'),
custom_func: '', custom_func: '',
throw_on_detection: false,
// use a proxy for all connections // use a proxy for all connections
// example: 'socks5://78.94.172.42:1080' // example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400' // example: 'http://118.174.233.10:48400'
@ -344,11 +368,20 @@ let browser_config = {
// socks5://78.94.172.42:1080 // socks5://78.94.172.42:1080
// http://118.174.233.10:48400 // http://118.174.233.10:48400
proxy_file: '', proxy_file: '',
// whether to use proxies only
// when this is set to true, se-scraper will not use
// your default IP address
use_proxies_only: false,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// settings for puppeteer-cluster
puppeteer_cluster_config: { puppeteer_cluster_config: {
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false, monitor: false,
concurrency: 1, // one scraper per tab concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 1, // scrape with 2 tabs maxConcurrency: 1,
} }
}; };

View File

@ -49,18 +49,19 @@
- remove unnecessary sleep() calls and replace with waitFor selectors - remove unnecessary sleep() calls and replace with waitFor selectors
### 16.7.2019 ### 16.7.2019
- resolve issues - resolve issues
- fix this https://github.com/NikolaiT/se-scraper/issues/37 [done] - fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]
- use puppeteer stealth plugin - use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
- use random user agents plugin: https://github.com/intoli/user-agents - we will need to look at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now]
- use random user agents plugin: https://github.com/intoli/user-agents [done]
- add screenshot capability (take the screenshot after parsing) - add screenshot capability (take the screenshot after parsing)
- store as b64 - store as b64 [done]
### TODO: ### TODO:
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]

View File

@ -6,6 +6,8 @@ const se_scraper = require('./../src/node_scraper.js');
test_evasion: false, test_evasion: false,
log_http_headers: true, log_http_headers: true,
random_user_agent: true, random_user_agent: true,
apply_evasion_techniques: true,
screen_output: true,
}; };
let scrape_job = { let scrape_job = {

BIN
headless-evasion-result.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

235
package-lock.json generated
View File

@ -60,6 +60,11 @@
"sprintf-js": "~1.0.2" "sprintf-js": "~1.0.2"
} }
}, },
"arr-union": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
"integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ="
},
"assertion-error": { "assertion-error": {
"version": "1.1.0", "version": "1.1.0",
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
@ -198,6 +203,18 @@
"wrap-ansi": "^2.0.0" "wrap-ansi": "^2.0.0"
} }
}, },
"clone-deep": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz",
"integrity": "sha1-TnPdCen7lxzDhnDF3O2cGJZIHMY=",
"requires": {
"for-own": "^0.1.3",
"is-plain-object": "^2.0.1",
"kind-of": "^3.0.2",
"lazy-cache": "^1.0.3",
"shallow-clone": "^0.1.2"
}
},
"clone-response": { "clone-response": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz", "resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
@ -239,7 +256,7 @@
}, },
"concat-stream": { "concat-stream": {
"version": "1.6.2", "version": "1.6.2",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": { "requires": {
"buffer-from": "^1.0.0", "buffer-from": "^1.0.0",
@ -250,7 +267,7 @@
"dependencies": { "dependencies": {
"readable-stream": { "readable-stream": {
"version": "2.3.6", "version": "2.3.6",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": { "requires": {
"core-util-is": "~1.0.0", "core-util-is": "~1.0.0",
@ -264,7 +281,7 @@
}, },
"string_decoder": { "string_decoder": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": { "requires": {
"safe-buffer": "~5.1.0" "safe-buffer": "~5.1.0"
@ -337,6 +354,11 @@
"type-detect": "^4.0.0" "type-detect": "^4.0.0"
} }
}, },
"deepmerge": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-2.2.1.tgz",
"integrity": "sha512-R9hc1Xa/NOBi9WRVUWg19rl1UB7Tt4kuPd+thNJgFZoxXsTz7ncaPaeIm+40oSGuP33DfMb4sZt1QIGiJzC4EA=="
},
"defer-to-connect": { "defer-to-connect": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz", "resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
@ -465,7 +487,7 @@
}, },
"es6-promisify": { "es6-promisify": {
"version": "5.0.0", "version": "5.0.0",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": { "requires": {
"es6-promise": "^4.0.3" "es6-promise": "^4.0.3"
@ -550,6 +572,19 @@
"is-buffer": "~2.0.3" "is-buffer": "~2.0.3"
} }
}, },
"for-in": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
"integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA="
},
"for-own": {
"version": "0.1.5",
"resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz",
"integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=",
"requires": {
"for-in": "^1.0.1"
}
},
"fs.realpath": { "fs.realpath": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
@ -664,11 +699,11 @@
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew==" "integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
}, },
"https-proxy-agent": { "https-proxy-agent": {
"version": "2.2.1", "version": "2.2.2",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==", "integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==",
"requires": { "requires": {
"agent-base": "^4.1.0", "agent-base": "^4.3.0",
"debug": "^3.1.0" "debug": "^3.1.0"
}, },
"dependencies": { "dependencies": {
@ -720,12 +755,25 @@
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=", "integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=",
"dev": true "dev": true
}, },
"is-extendable": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
"integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik="
},
"is-fullwidth-code-point": { "is-fullwidth-code-point": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=",
"dev": true "dev": true
}, },
"is-plain-object": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
"requires": {
"isobject": "^3.0.1"
}
},
"is-regex": { "is-regex": {
"version": "1.0.4", "version": "1.0.4",
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
@ -761,6 +809,11 @@
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=", "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=",
"dev": true "dev": true
}, },
"isobject": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
"integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8="
},
"js-yaml": { "js-yaml": {
"version": "3.13.1", "version": "3.13.1",
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz",
@ -784,6 +837,26 @@
"json-buffer": "3.0.0" "json-buffer": "3.0.0"
} }
}, },
"kind-of": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
"integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=",
"requires": {
"is-buffer": "^1.1.5"
},
"dependencies": {
"is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
}
}
},
"lazy-cache": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
"integrity": "sha1-odePw6UEdMuAhF07O24dpJpEbo4="
},
"lcid": { "lcid": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz", "resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz",
@ -847,6 +920,16 @@
"p-is-promise": "^2.0.0" "p-is-promise": "^2.0.0"
} }
}, },
"merge-deep": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.2.tgz",
"integrity": "sha512-T7qC8kg4Zoti1cFd8Cr0M+qaZfOwjlPDEdZIIPPB2JZctjaPM4fX+i7HOId69tAti2fvO6X5ldfYUONDODsrkA==",
"requires": {
"arr-union": "^3.1.0",
"clone-deep": "^0.2.4",
"kind-of": "^3.0.2"
}
},
"mime": { "mime": {
"version": "2.4.4", "version": "2.4.4",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
@ -876,6 +959,22 @@
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
}, },
"mixin-object": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz",
"integrity": "sha1-T7lJRB2rGCVA8f4DW6YOGUel5X4=",
"requires": {
"for-in": "^0.1.3",
"is-extendable": "^0.1.1"
},
"dependencies": {
"for-in": {
"version": "0.1.8",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz",
"integrity": "sha1-2Hc5COMSVhCZUrH9ubP6hn0ndeE="
}
}
},
"mkdirp": { "mkdirp": {
"version": "0.5.1", "version": "0.5.1",
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
@ -1155,9 +1254,9 @@
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc=" "integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
}, },
"process-nextick-args": { "process-nextick-args": {
"version": "2.0.0", "version": "2.0.1",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag=="
}, },
"progress": { "progress": {
"version": "2.0.3", "version": "2.0.3",
@ -1189,9 +1288,9 @@
} }
}, },
"puppeteer": { "puppeteer": {
"version": "1.17.0", "version": "1.18.1",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.17.0.tgz", "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.18.1.tgz",
"integrity": "sha512-3EXZSximCzxuVKpIHtyec8Wm2dWZn1fc5tQi34qWfiUgubEVYHjUvr0GOJojqf3mifI6oyKnCdrGxaOI+lWReA==", "integrity": "sha512-luUy0HPSuWPsPZ1wAp6NinE0zgetWtudf5zwZ6dHjMWfYpTQcmKveFRox7VBNhQ98OjNA9PQ9PzQyX8k/KrxTg==",
"requires": { "requires": {
"debug": "^4.1.0", "debug": "^4.1.0",
"extract-zip": "^1.6.6", "extract-zip": "^1.6.6",
@ -1211,6 +1310,83 @@
"debug": "^4.1.1" "debug": "^4.1.1"
} }
}, },
"puppeteer-extra": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
"integrity": "sha512-RjQp3BkjzkY8JgTcHUsu4RdFcqE3AdTzUquRC9WBUZXRXbSgSXI1CtEmNG7OcxorScXNUTKEbY7Z13xtQVkHnQ==",
"requires": {
"debug": "^3.1.0",
"deepmerge": "^2.1.0"
},
"dependencies": {
"debug": {
"version": "3.2.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
"requires": {
"ms": "^2.1.1"
}
}
}
},
"puppeteer-extra-plugin": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.0.4.tgz",
"integrity": "sha512-qgb5pmyNWI64PHtvRG1MjxEL7S3wAouDeqWklpg+/1jnxsbhRJsoi91SEg4U4Ji+rEOn28kTVM9O4KTXJ1PQ1Q==",
"requires": {
"debug": "^3.1.0",
"merge-deep": "^3.0.1"
},
"dependencies": {
"debug": {
"version": "3.2.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
"requires": {
"ms": "^2.1.1"
}
}
}
},
"puppeteer-extra-plugin-anonymize-ua": {
"version": "2.1.4",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-anonymize-ua/-/puppeteer-extra-plugin-anonymize-ua-2.1.4.tgz",
"integrity": "sha512-AAxicMHX3AbsBvWgMQEqaIy7vqH+pzzh/HZ4+bdXiZExPm1dTUNq7Kmvog2xfY4bdQO+bQkk+Qtfr1dOQ0KRyg==",
"requires": {
"debug": "^3.1.0",
"puppeteer-extra-plugin": "^3.0.4"
},
"dependencies": {
"debug": {
"version": "3.2.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
"requires": {
"ms": "^2.1.1"
}
}
}
},
"puppeteer-extra-plugin-stealth": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.2.2.tgz",
"integrity": "sha512-tVf+M0U4VAht/q2vNlf1eGGeCFzK9q0vTHnuCEXHf06mu7ukUe1J060s6kij5r7Fv51UsTcXnMa069oSXvYSOw==",
"requires": {
"debug": "^3.1.0",
"puppeteer-extra-plugin": "^3.0.4",
"puppeteer-extra-plugin-anonymize-ua": "^2.1.4"
},
"dependencies": {
"debug": {
"version": "3.2.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
"requires": {
"ms": "^2.1.1"
}
}
}
},
"readable-stream": { "readable-stream": {
"version": "3.1.1", "version": "3.1.1",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
@ -1266,6 +1442,37 @@
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=",
"dev": true "dev": true
}, },
"shallow-clone": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
"integrity": "sha1-WQnodLp3EG1zrEFM/sH/yofZcGA=",
"requires": {
"is-extendable": "^0.1.1",
"kind-of": "^2.0.1",
"lazy-cache": "^0.2.3",
"mixin-object": "^2.0.1"
},
"dependencies": {
"is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
},
"kind-of": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz",
"integrity": "sha1-AY7HpM5+OobLkUG+UZ0kyPqpgbU=",
"requires": {
"is-buffer": "^1.0.2"
}
},
"lazy-cache": {
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz",
"integrity": "sha1-f+3fLctu23fRHvHRF6tf/fCrG2U="
}
}
},
"shebang-command": { "shebang-command": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz",

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.4.0", "version": "1.4.1",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",
@ -26,8 +26,10 @@
"got": "^9.6.0", "got": "^9.6.0",
"lodash": "^4.17.14", "lodash": "^4.17.14",
"proxy-chain": "^0.2.7", "proxy-chain": "^0.2.7",
"puppeteer": "^1.17.0", "puppeteer": "^1.18.1",
"puppeteer-cluster": "^0.13.0", "puppeteer-cluster": "^0.13.0",
"puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.321" "user-agents": "^1.0.321"
}, },
"devDependencies": { "devDependencies": {

View File

@ -29,11 +29,8 @@ module.exports = class Scraper {
this.keywords = config.keywords; this.keywords = config.keywords;
this.STANDARD_TIMEOUT = 10000; this.STANDARD_TIMEOUT = 10000;
// longer timeout when using proxies
this.PROXY_TIMEOUT = 15000;
this.SOLVE_CAPTCHA_TIME = 45000; this.SOLVE_CAPTCHA_TIME = 45000;
this.html_output = {};
this.results = {}; this.results = {};
this.result_rank = 1; this.result_rank = 1;
// keep track of the requests done // keep track of the requests done
@ -52,7 +49,6 @@ module.exports = class Scraper {
} }
} }
} }
} }
async run({page, data}) { async run({page, data}) {
@ -103,12 +99,10 @@ module.exports = class Scraper {
if (this.config.test_evasion === true) { if (this.config.test_evasion === true) {
// Navigate to the page that will perform the tests. // Navigate to the page that will perform the tests.
const testUrl = 'https://intoli.com/blog/' + const testUrl = 'https://bot.sannysoft.com';
'not-possible-to-block-chrome-headless/chrome-headless-test.html';
await this.page.goto(testUrl); await this.page.goto(testUrl);
// Save a screenshot of the results. // Save a screenshot of the results.
await this.page.screenshot({path: 'headless-test-result.png'}); await this.page.screenshot({path: 'headless-evasion-result.png'});
} }
if (this.config.log_http_headers === true) { if (this.config.log_http_headers === true) {
@ -196,6 +190,13 @@ module.exports = class Scraper {
this.results[keyword][this.page_num].html = html; this.results[keyword][this.page_num].html = html;
} }
if (this.config.screen_output) {
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
encoding: 'base64',
fullPage: false,
});
}
this.page_num += 1; this.page_num += 1;
// only load the next page when we will pass the next iteration // only load the next page when we will pass the next iteration

View File

@ -108,6 +108,8 @@ class ScrapeManager {
output_file: '', output_file: '',
// whether to also passthru all the html output of the serp pages // whether to also passthru all the html output of the serp pages
html_output: false, html_output: false,
// whether to return a screenshot of serp pages as b64 data
screen_output: false,
// whether to prevent images, css, fonts and media from being loaded // whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal // will speed up scraping a great deal
block_assets: true, block_assets: true,

@ -1 +1 @@
Subproject commit 1fcf7f9a23c5a9abb491157e505b475f7aea72e5 Subproject commit 03c9a764298f3f55b46bace810f4d3b2e1cb3266