mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-26 12:31:52 +02:00
better user agents now, added option to include screenshots as base64 in results
This commit is contained in:
parent
fcbe66b56b
commit
78fe12390b
47
README.md
47
README.md
@ -317,25 +317,49 @@ const se_scraper = require('se-scraper');
|
||||
// and cannot give to se-scraper on scrape() calls
|
||||
let browser_config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// whether to select manual settings in visible mode
|
||||
set_manual_settings: false,
|
||||
// log ip address data
|
||||
log_ip_address: false,
|
||||
// log http headers
|
||||
log_http_headers: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
compress: false, // compress
|
||||
// whether debug information should be printed
|
||||
// level 0: print nothing
|
||||
// level 1: print most important info
|
||||
// ...
|
||||
// level 4: print all shit nobody wants to know
|
||||
debug_level: 1,
|
||||
keywords: ['nodejs rocks',],
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// specify flags passed to chrome here
|
||||
chrome_flags: [],
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 1,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to also passthru all the html output of the serp pages
|
||||
html_output: false,
|
||||
// whether to return a screenshot of serp pages as b64 data
|
||||
screen_output: false,
|
||||
// whether to prevent images, css, fonts and media from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
// path to js module that extends functionality
|
||||
// this module should export the functions:
|
||||
// get_browser, handle_metadata, close_browser
|
||||
// must be an absolute path to the module
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
throw_on_detection: false,
|
||||
// use a proxy for all connections
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
@ -344,11 +368,20 @@ let browser_config = {
|
||||
// socks5://78.94.172.42:1080
|
||||
// http://118.174.233.10:48400
|
||||
proxy_file: '',
|
||||
// whether to use proxies only
|
||||
// when this is set to true, se-scraper will not use
|
||||
// your default IP address
|
||||
use_proxies_only: false,
|
||||
// check if headless chrome escapes common detection techniques
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
apply_evasion_techniques: true,
|
||||
// settings for puppeteer-cluster
|
||||
puppeteer_cluster_config: {
|
||||
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
|
||||
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||
monitor: false,
|
||||
concurrency: 1, // one scraper per tab
|
||||
maxConcurrency: 1, // scrape with 2 tabs
|
||||
concurrency: Cluster.CONCURRENCY_BROWSER,
|
||||
maxConcurrency: 1,
|
||||
}
|
||||
};
|
||||
|
||||
|
9
TODO.md
9
TODO.md
@ -49,18 +49,19 @@
|
||||
- remove unnecessary sleep() calls and replace with waitFor selectors
|
||||
|
||||
|
||||
|
||||
### 16.7.2019
|
||||
|
||||
- resolve issues
|
||||
- fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]
|
||||
|
||||
- use puppeteer stealth plugin
|
||||
- use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
|
||||
|
||||
- use random user agents plugin: https://github.com/intoli/user-agents
|
||||
- we will need to look at the concurrency implementation of puppeteer-cluster [no TypeScript support :( so I will not support this right now]
|
||||
|
||||
- use random user agents plugin: https://github.com/intoli/user-agents [done]
|
||||
|
||||
- add screenshot capability (take the screenshot after parsing)
|
||||
- store as b64
|
||||
- store as b64 [done]
|
||||
|
||||
### TODO:
|
||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||
|
@ -6,6 +6,8 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
test_evasion: false,
|
||||
log_http_headers: true,
|
||||
random_user_agent: true,
|
||||
apply_evasion_techniques: true,
|
||||
screen_output: true,
|
||||
};
|
||||
|
||||
let scrape_job = {
|
||||
|
BIN
headless-evasion-result.png
Normal file
BIN
headless-evasion-result.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 145 KiB |
235
package-lock.json
generated
235
package-lock.json
generated
@ -60,6 +60,11 @@
|
||||
"sprintf-js": "~1.0.2"
|
||||
}
|
||||
},
|
||||
"arr-union": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
|
||||
"integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ="
|
||||
},
|
||||
"assertion-error": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
|
||||
@ -198,6 +203,18 @@
|
||||
"wrap-ansi": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"clone-deep": {
|
||||
"version": "0.2.4",
|
||||
"resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz",
|
||||
"integrity": "sha1-TnPdCen7lxzDhnDF3O2cGJZIHMY=",
|
||||
"requires": {
|
||||
"for-own": "^0.1.3",
|
||||
"is-plain-object": "^2.0.1",
|
||||
"kind-of": "^3.0.2",
|
||||
"lazy-cache": "^1.0.3",
|
||||
"shallow-clone": "^0.1.2"
|
||||
}
|
||||
},
|
||||
"clone-response": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
|
||||
@ -239,7 +256,7 @@
|
||||
},
|
||||
"concat-stream": {
|
||||
"version": "1.6.2",
|
||||
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
||||
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
||||
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
|
||||
"requires": {
|
||||
"buffer-from": "^1.0.0",
|
||||
@ -250,7 +267,7 @@
|
||||
"dependencies": {
|
||||
"readable-stream": {
|
||||
"version": "2.3.6",
|
||||
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
|
||||
"requires": {
|
||||
"core-util-is": "~1.0.0",
|
||||
@ -264,7 +281,7 @@
|
||||
},
|
||||
"string_decoder": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
||||
"requires": {
|
||||
"safe-buffer": "~5.1.0"
|
||||
@ -337,6 +354,11 @@
|
||||
"type-detect": "^4.0.0"
|
||||
}
|
||||
},
|
||||
"deepmerge": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-2.2.1.tgz",
|
||||
"integrity": "sha512-R9hc1Xa/NOBi9WRVUWg19rl1UB7Tt4kuPd+thNJgFZoxXsTz7ncaPaeIm+40oSGuP33DfMb4sZt1QIGiJzC4EA=="
|
||||
},
|
||||
"defer-to-connect": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
|
||||
@ -465,7 +487,7 @@
|
||||
},
|
||||
"es6-promisify": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
||||
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
||||
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
|
||||
"requires": {
|
||||
"es6-promise": "^4.0.3"
|
||||
@ -550,6 +572,19 @@
|
||||
"is-buffer": "~2.0.3"
|
||||
}
|
||||
},
|
||||
"for-in": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
|
||||
"integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA="
|
||||
},
|
||||
"for-own": {
|
||||
"version": "0.1.5",
|
||||
"resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz",
|
||||
"integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=",
|
||||
"requires": {
|
||||
"for-in": "^1.0.1"
|
||||
}
|
||||
},
|
||||
"fs.realpath": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
|
||||
@ -664,11 +699,11 @@
|
||||
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
|
||||
},
|
||||
"https-proxy-agent": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
|
||||
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==",
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
|
||||
"integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==",
|
||||
"requires": {
|
||||
"agent-base": "^4.1.0",
|
||||
"agent-base": "^4.3.0",
|
||||
"debug": "^3.1.0"
|
||||
},
|
||||
"dependencies": {
|
||||
@ -720,12 +755,25 @@
|
||||
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=",
|
||||
"dev": true
|
||||
},
|
||||
"is-extendable": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
|
||||
"integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik="
|
||||
},
|
||||
"is-fullwidth-code-point": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
|
||||
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=",
|
||||
"dev": true
|
||||
},
|
||||
"is-plain-object": {
|
||||
"version": "2.0.4",
|
||||
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
|
||||
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
|
||||
"requires": {
|
||||
"isobject": "^3.0.1"
|
||||
}
|
||||
},
|
||||
"is-regex": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
|
||||
@ -761,6 +809,11 @@
|
||||
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=",
|
||||
"dev": true
|
||||
},
|
||||
"isobject": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
|
||||
"integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8="
|
||||
},
|
||||
"js-yaml": {
|
||||
"version": "3.13.1",
|
||||
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz",
|
||||
@ -784,6 +837,26 @@
|
||||
"json-buffer": "3.0.0"
|
||||
}
|
||||
},
|
||||
"kind-of": {
|
||||
"version": "3.2.2",
|
||||
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
|
||||
"integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=",
|
||||
"requires": {
|
||||
"is-buffer": "^1.1.5"
|
||||
},
|
||||
"dependencies": {
|
||||
"is-buffer": {
|
||||
"version": "1.1.6",
|
||||
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
|
||||
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
|
||||
}
|
||||
}
|
||||
},
|
||||
"lazy-cache": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
|
||||
"integrity": "sha1-odePw6UEdMuAhF07O24dpJpEbo4="
|
||||
},
|
||||
"lcid": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz",
|
||||
@ -847,6 +920,16 @@
|
||||
"p-is-promise": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"merge-deep": {
|
||||
"version": "3.0.2",
|
||||
"resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.2.tgz",
|
||||
"integrity": "sha512-T7qC8kg4Zoti1cFd8Cr0M+qaZfOwjlPDEdZIIPPB2JZctjaPM4fX+i7HOId69tAti2fvO6X5ldfYUONDODsrkA==",
|
||||
"requires": {
|
||||
"arr-union": "^3.1.0",
|
||||
"clone-deep": "^0.2.4",
|
||||
"kind-of": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"mime": {
|
||||
"version": "2.4.4",
|
||||
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
|
||||
@ -876,6 +959,22 @@
|
||||
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
|
||||
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
|
||||
},
|
||||
"mixin-object": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz",
|
||||
"integrity": "sha1-T7lJRB2rGCVA8f4DW6YOGUel5X4=",
|
||||
"requires": {
|
||||
"for-in": "^0.1.3",
|
||||
"is-extendable": "^0.1.1"
|
||||
},
|
||||
"dependencies": {
|
||||
"for-in": {
|
||||
"version": "0.1.8",
|
||||
"resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz",
|
||||
"integrity": "sha1-2Hc5COMSVhCZUrH9ubP6hn0ndeE="
|
||||
}
|
||||
}
|
||||
},
|
||||
"mkdirp": {
|
||||
"version": "0.5.1",
|
||||
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
|
||||
@ -1155,9 +1254,9 @@
|
||||
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
|
||||
},
|
||||
"process-nextick-args": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
|
||||
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw=="
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
|
||||
"integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag=="
|
||||
},
|
||||
"progress": {
|
||||
"version": "2.0.3",
|
||||
@ -1189,9 +1288,9 @@
|
||||
}
|
||||
},
|
||||
"puppeteer": {
|
||||
"version": "1.17.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.17.0.tgz",
|
||||
"integrity": "sha512-3EXZSximCzxuVKpIHtyec8Wm2dWZn1fc5tQi34qWfiUgubEVYHjUvr0GOJojqf3mifI6oyKnCdrGxaOI+lWReA==",
|
||||
"version": "1.18.1",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.18.1.tgz",
|
||||
"integrity": "sha512-luUy0HPSuWPsPZ1wAp6NinE0zgetWtudf5zwZ6dHjMWfYpTQcmKveFRox7VBNhQ98OjNA9PQ9PzQyX8k/KrxTg==",
|
||||
"requires": {
|
||||
"debug": "^4.1.0",
|
||||
"extract-zip": "^1.6.6",
|
||||
@ -1211,6 +1310,83 @@
|
||||
"debug": "^4.1.1"
|
||||
}
|
||||
},
|
||||
"puppeteer-extra": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
||||
"integrity": "sha512-RjQp3BkjzkY8JgTcHUsu4RdFcqE3AdTzUquRC9WBUZXRXbSgSXI1CtEmNG7OcxorScXNUTKEbY7Z13xtQVkHnQ==",
|
||||
"requires": {
|
||||
"debug": "^3.1.0",
|
||||
"deepmerge": "^2.1.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"debug": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||
"requires": {
|
||||
"ms": "^2.1.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"puppeteer-extra-plugin": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.0.4.tgz",
|
||||
"integrity": "sha512-qgb5pmyNWI64PHtvRG1MjxEL7S3wAouDeqWklpg+/1jnxsbhRJsoi91SEg4U4Ji+rEOn28kTVM9O4KTXJ1PQ1Q==",
|
||||
"requires": {
|
||||
"debug": "^3.1.0",
|
||||
"merge-deep": "^3.0.1"
|
||||
},
|
||||
"dependencies": {
|
||||
"debug": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||
"requires": {
|
||||
"ms": "^2.1.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"puppeteer-extra-plugin-anonymize-ua": {
|
||||
"version": "2.1.4",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-anonymize-ua/-/puppeteer-extra-plugin-anonymize-ua-2.1.4.tgz",
|
||||
"integrity": "sha512-AAxicMHX3AbsBvWgMQEqaIy7vqH+pzzh/HZ4+bdXiZExPm1dTUNq7Kmvog2xfY4bdQO+bQkk+Qtfr1dOQ0KRyg==",
|
||||
"requires": {
|
||||
"debug": "^3.1.0",
|
||||
"puppeteer-extra-plugin": "^3.0.4"
|
||||
},
|
||||
"dependencies": {
|
||||
"debug": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||
"requires": {
|
||||
"ms": "^2.1.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"puppeteer-extra-plugin-stealth": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.2.2.tgz",
|
||||
"integrity": "sha512-tVf+M0U4VAht/q2vNlf1eGGeCFzK9q0vTHnuCEXHf06mu7ukUe1J060s6kij5r7Fv51UsTcXnMa069oSXvYSOw==",
|
||||
"requires": {
|
||||
"debug": "^3.1.0",
|
||||
"puppeteer-extra-plugin": "^3.0.4",
|
||||
"puppeteer-extra-plugin-anonymize-ua": "^2.1.4"
|
||||
},
|
||||
"dependencies": {
|
||||
"debug": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||
"requires": {
|
||||
"ms": "^2.1.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"readable-stream": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
|
||||
@ -1266,6 +1442,37 @@
|
||||
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=",
|
||||
"dev": true
|
||||
},
|
||||
"shallow-clone": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
|
||||
"integrity": "sha1-WQnodLp3EG1zrEFM/sH/yofZcGA=",
|
||||
"requires": {
|
||||
"is-extendable": "^0.1.1",
|
||||
"kind-of": "^2.0.1",
|
||||
"lazy-cache": "^0.2.3",
|
||||
"mixin-object": "^2.0.1"
|
||||
},
|
||||
"dependencies": {
|
||||
"is-buffer": {
|
||||
"version": "1.1.6",
|
||||
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
|
||||
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
|
||||
},
|
||||
"kind-of": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz",
|
||||
"integrity": "sha1-AY7HpM5+OobLkUG+UZ0kyPqpgbU=",
|
||||
"requires": {
|
||||
"is-buffer": "^1.0.2"
|
||||
}
|
||||
},
|
||||
"lazy-cache": {
|
||||
"version": "0.2.7",
|
||||
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz",
|
||||
"integrity": "sha1-f+3fLctu23fRHvHRF6tf/fCrG2U="
|
||||
}
|
||||
}
|
||||
},
|
||||
"shebang-command": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz",
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.4.0",
|
||||
"version": "1.4.1",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
@ -26,8 +26,10 @@
|
||||
"got": "^9.6.0",
|
||||
"lodash": "^4.17.14",
|
||||
"proxy-chain": "^0.2.7",
|
||||
"puppeteer": "^1.17.0",
|
||||
"puppeteer": "^1.18.1",
|
||||
"puppeteer-cluster": "^0.13.0",
|
||||
"puppeteer-extra": "^2.1.3",
|
||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||
"user-agents": "^1.0.321"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
@ -29,11 +29,8 @@ module.exports = class Scraper {
|
||||
this.keywords = config.keywords;
|
||||
|
||||
this.STANDARD_TIMEOUT = 10000;
|
||||
// longer timeout when using proxies
|
||||
this.PROXY_TIMEOUT = 15000;
|
||||
this.SOLVE_CAPTCHA_TIME = 45000;
|
||||
|
||||
this.html_output = {};
|
||||
this.results = {};
|
||||
this.result_rank = 1;
|
||||
// keep track of the requests done
|
||||
@ -52,7 +49,6 @@ module.exports = class Scraper {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
async run({page, data}) {
|
||||
@ -103,12 +99,10 @@ module.exports = class Scraper {
|
||||
|
||||
if (this.config.test_evasion === true) {
|
||||
// Navigate to the page that will perform the tests.
|
||||
const testUrl = 'https://intoli.com/blog/' +
|
||||
'not-possible-to-block-chrome-headless/chrome-headless-test.html';
|
||||
const testUrl = 'https://bot.sannysoft.com';
|
||||
await this.page.goto(testUrl);
|
||||
|
||||
// Save a screenshot of the results.
|
||||
await this.page.screenshot({path: 'headless-test-result.png'});
|
||||
await this.page.screenshot({path: 'headless-evasion-result.png'});
|
||||
}
|
||||
|
||||
if (this.config.log_http_headers === true) {
|
||||
@ -196,6 +190,13 @@ module.exports = class Scraper {
|
||||
this.results[keyword][this.page_num].html = html;
|
||||
}
|
||||
|
||||
if (this.config.screen_output) {
|
||||
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
|
||||
encoding: 'base64',
|
||||
fullPage: false,
|
||||
});
|
||||
}
|
||||
|
||||
this.page_num += 1;
|
||||
|
||||
// only load the next page when we will pass the next iteration
|
||||
|
@ -108,6 +108,8 @@ class ScrapeManager {
|
||||
output_file: '',
|
||||
// whether to also passthru all the html output of the serp pages
|
||||
html_output: false,
|
||||
// whether to return a screenshot of serp pages as b64 data
|
||||
screen_output: false,
|
||||
// whether to prevent images, css, fonts and media from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 1fcf7f9a23c5a9abb491157e505b475f7aea72e5
|
||||
Subproject commit 03c9a764298f3f55b46bace810f4d3b2e1cb3266
|
Loading…
x
Reference in New Issue
Block a user