mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-26 12:31:52 +02:00
better user agents now, added option to include screenshots as base64 in results
This commit is contained in:
parent
fcbe66b56b
commit
78fe12390b
47
README.md
47
README.md
@ -317,25 +317,49 @@ const se_scraper = require('se-scraper');
|
|||||||
// and cannot give to se-scraper on scrape() calls
|
// and cannot give to se-scraper on scrape() calls
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
// the user agent to scrape with
|
// the user agent to scrape with
|
||||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
|
||||||
// if random_user_agent is set to true, a random user agent is chosen
|
// if random_user_agent is set to true, a random user agent is chosen
|
||||||
random_user_agent: false,
|
random_user_agent: false,
|
||||||
// whether to start the browser in headless mode
|
// whether to select manual settings in visible mode
|
||||||
headless: true,
|
set_manual_settings: false,
|
||||||
|
// log ip address data
|
||||||
|
log_ip_address: false,
|
||||||
|
// log http headers
|
||||||
|
log_http_headers: false,
|
||||||
|
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||||
|
// is drawn before every request. empty string for no sleeping.
|
||||||
|
sleep_range: '',
|
||||||
|
// which search engine to scrape
|
||||||
|
search_engine: 'google',
|
||||||
|
compress: false, // compress
|
||||||
// whether debug information should be printed
|
// whether debug information should be printed
|
||||||
// level 0: print nothing
|
// level 0: print nothing
|
||||||
// level 1: print most important info
|
// level 1: print most important info
|
||||||
// ...
|
// ...
|
||||||
// level 4: print all shit nobody wants to know
|
// level 4: print all shit nobody wants to know
|
||||||
debug_level: 1,
|
debug_level: 1,
|
||||||
|
keywords: ['nodejs rocks',],
|
||||||
|
// whether to start the browser in headless mode
|
||||||
|
headless: true,
|
||||||
// specify flags passed to chrome here
|
// specify flags passed to chrome here
|
||||||
chrome_flags: [],
|
chrome_flags: [],
|
||||||
|
// the number of pages to scrape for each keyword
|
||||||
|
num_pages: 1,
|
||||||
|
// path to output file, data will be stored in JSON
|
||||||
|
output_file: '',
|
||||||
|
// whether to also passthru all the html output of the serp pages
|
||||||
|
html_output: false,
|
||||||
|
// whether to return a screenshot of serp pages as b64 data
|
||||||
|
screen_output: false,
|
||||||
|
// whether to prevent images, css, fonts and media from being loaded
|
||||||
|
// will speed up scraping a great deal
|
||||||
|
block_assets: true,
|
||||||
// path to js module that extends functionality
|
// path to js module that extends functionality
|
||||||
// this module should export the functions:
|
// this module should export the functions:
|
||||||
// get_browser, handle_metadata, close_browser
|
// get_browser, handle_metadata, close_browser
|
||||||
// must be an absolute path to the module
|
|
||||||
//custom_func: resolve('examples/pluggable.js'),
|
//custom_func: resolve('examples/pluggable.js'),
|
||||||
custom_func: '',
|
custom_func: '',
|
||||||
|
throw_on_detection: false,
|
||||||
// use a proxy for all connections
|
// use a proxy for all connections
|
||||||
// example: 'socks5://78.94.172.42:1080'
|
// example: 'socks5://78.94.172.42:1080'
|
||||||
// example: 'http://118.174.233.10:48400'
|
// example: 'http://118.174.233.10:48400'
|
||||||
@ -344,11 +368,20 @@ let browser_config = {
|
|||||||
// socks5://78.94.172.42:1080
|
// socks5://78.94.172.42:1080
|
||||||
// http://118.174.233.10:48400
|
// http://118.174.233.10:48400
|
||||||
proxy_file: '',
|
proxy_file: '',
|
||||||
|
// whether to use proxies only
|
||||||
|
// when this is set to true, se-scraper will not use
|
||||||
|
// your default IP address
|
||||||
|
use_proxies_only: false,
|
||||||
|
// check if headless chrome escapes common detection techniques
|
||||||
|
// this is a quick test and should be used for debugging
|
||||||
|
test_evasion: false,
|
||||||
|
apply_evasion_techniques: true,
|
||||||
|
// settings for puppeteer-cluster
|
||||||
puppeteer_cluster_config: {
|
puppeteer_cluster_config: {
|
||||||
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
|
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||||
monitor: false,
|
monitor: false,
|
||||||
concurrency: 1, // one scraper per tab
|
concurrency: Cluster.CONCURRENCY_BROWSER,
|
||||||
maxConcurrency: 1, // scrape with 2 tabs
|
maxConcurrency: 1,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
9
TODO.md
9
TODO.md
@ -49,18 +49,19 @@
|
|||||||
- remove unnecessary sleep() calls and replace with waitFor selectors
|
- remove unnecessary sleep() calls and replace with waitFor selectors
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### 16.7.2019
|
### 16.7.2019
|
||||||
|
|
||||||
- resolve issues
|
- resolve issues
|
||||||
- fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]
|
- fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]
|
||||||
|
|
||||||
- use puppeteer stealth plugin
|
- use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
|
||||||
|
|
||||||
- user random user agents plugin: https://github.com/intoli/user-agents
|
- we will need to look at the concurrency implementation of puppeteer-cluster [no typescript support :(), I will not support this right now]
|
||||||
|
|
||||||
|
- use random user agents plugin: https://github.com/intoli/user-agents [done]
|
||||||
|
|
||||||
- add screenshot capability (take the screenshot after parsing)
|
- add screenshot capability (take the screenshot after parsing)
|
||||||
- store as b64
|
- store as b64 [done]
|
||||||
|
|
||||||
### TODO:
|
### TODO:
|
||||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||||
|
@ -6,6 +6,8 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
test_evasion: false,
|
test_evasion: false,
|
||||||
log_http_headers: true,
|
log_http_headers: true,
|
||||||
random_user_agent: true,
|
random_user_agent: true,
|
||||||
|
apply_evasion_techniques: true,
|
||||||
|
screen_output: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
let scrape_job = {
|
let scrape_job = {
|
||||||
|
BIN
headless-evasion-result.png
Normal file
BIN
headless-evasion-result.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 145 KiB |
235
package-lock.json
generated
235
package-lock.json
generated
@ -60,6 +60,11 @@
|
|||||||
"sprintf-js": "~1.0.2"
|
"sprintf-js": "~1.0.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"arr-union": {
|
||||||
|
"version": "3.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
|
||||||
|
"integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ="
|
||||||
|
},
|
||||||
"assertion-error": {
|
"assertion-error": {
|
||||||
"version": "1.1.0",
|
"version": "1.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
|
||||||
@ -198,6 +203,18 @@
|
|||||||
"wrap-ansi": "^2.0.0"
|
"wrap-ansi": "^2.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"clone-deep": {
|
||||||
|
"version": "0.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz",
|
||||||
|
"integrity": "sha1-TnPdCen7lxzDhnDF3O2cGJZIHMY=",
|
||||||
|
"requires": {
|
||||||
|
"for-own": "^0.1.3",
|
||||||
|
"is-plain-object": "^2.0.1",
|
||||||
|
"kind-of": "^3.0.2",
|
||||||
|
"lazy-cache": "^1.0.3",
|
||||||
|
"shallow-clone": "^0.1.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"clone-response": {
|
"clone-response": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
|
||||||
@ -239,7 +256,7 @@
|
|||||||
},
|
},
|
||||||
"concat-stream": {
|
"concat-stream": {
|
||||||
"version": "1.6.2",
|
"version": "1.6.2",
|
||||||
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
||||||
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
|
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"buffer-from": "^1.0.0",
|
"buffer-from": "^1.0.0",
|
||||||
@ -250,7 +267,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"readable-stream": {
|
"readable-stream": {
|
||||||
"version": "2.3.6",
|
"version": "2.3.6",
|
||||||
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||||
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
|
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"core-util-is": "~1.0.0",
|
"core-util-is": "~1.0.0",
|
||||||
@ -264,7 +281,7 @@
|
|||||||
},
|
},
|
||||||
"string_decoder": {
|
"string_decoder": {
|
||||||
"version": "1.1.1",
|
"version": "1.1.1",
|
||||||
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||||
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"safe-buffer": "~5.1.0"
|
"safe-buffer": "~5.1.0"
|
||||||
@ -337,6 +354,11 @@
|
|||||||
"type-detect": "^4.0.0"
|
"type-detect": "^4.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"deepmerge": {
|
||||||
|
"version": "2.2.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-2.2.1.tgz",
|
||||||
|
"integrity": "sha512-R9hc1Xa/NOBi9WRVUWg19rl1UB7Tt4kuPd+thNJgFZoxXsTz7ncaPaeIm+40oSGuP33DfMb4sZt1QIGiJzC4EA=="
|
||||||
|
},
|
||||||
"defer-to-connect": {
|
"defer-to-connect": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
|
||||||
@ -465,7 +487,7 @@
|
|||||||
},
|
},
|
||||||
"es6-promisify": {
|
"es6-promisify": {
|
||||||
"version": "5.0.0",
|
"version": "5.0.0",
|
||||||
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
||||||
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
|
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
|
||||||
"requires": {
|
"requires": {
|
||||||
"es6-promise": "^4.0.3"
|
"es6-promise": "^4.0.3"
|
||||||
@ -550,6 +572,19 @@
|
|||||||
"is-buffer": "~2.0.3"
|
"is-buffer": "~2.0.3"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"for-in": {
|
||||||
|
"version": "1.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
|
||||||
|
"integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA="
|
||||||
|
},
|
||||||
|
"for-own": {
|
||||||
|
"version": "0.1.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz",
|
||||||
|
"integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=",
|
||||||
|
"requires": {
|
||||||
|
"for-in": "^1.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fs.realpath": {
|
"fs.realpath": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
|
||||||
@ -664,11 +699,11 @@
|
|||||||
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
|
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
|
||||||
},
|
},
|
||||||
"https-proxy-agent": {
|
"https-proxy-agent": {
|
||||||
"version": "2.2.1",
|
"version": "2.2.2",
|
||||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
|
||||||
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==",
|
"integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"agent-base": "^4.1.0",
|
"agent-base": "^4.3.0",
|
||||||
"debug": "^3.1.0"
|
"debug": "^3.1.0"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
@ -720,12 +755,25 @@
|
|||||||
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=",
|
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"is-extendable": {
|
||||||
|
"version": "0.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
|
||||||
|
"integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik="
|
||||||
|
},
|
||||||
"is-fullwidth-code-point": {
|
"is-fullwidth-code-point": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
|
||||||
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=",
|
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"is-plain-object": {
|
||||||
|
"version": "2.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
|
||||||
|
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
|
||||||
|
"requires": {
|
||||||
|
"isobject": "^3.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"is-regex": {
|
"is-regex": {
|
||||||
"version": "1.0.4",
|
"version": "1.0.4",
|
||||||
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
|
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
|
||||||
@ -761,6 +809,11 @@
|
|||||||
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=",
|
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"isobject": {
|
||||||
|
"version": "3.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
|
||||||
|
"integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8="
|
||||||
|
},
|
||||||
"js-yaml": {
|
"js-yaml": {
|
||||||
"version": "3.13.1",
|
"version": "3.13.1",
|
||||||
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz",
|
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz",
|
||||||
@ -784,6 +837,26 @@
|
|||||||
"json-buffer": "3.0.0"
|
"json-buffer": "3.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"kind-of": {
|
||||||
|
"version": "3.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
|
||||||
|
"integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=",
|
||||||
|
"requires": {
|
||||||
|
"is-buffer": "^1.1.5"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"is-buffer": {
|
||||||
|
"version": "1.1.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
|
||||||
|
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"lazy-cache": {
|
||||||
|
"version": "1.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
|
||||||
|
"integrity": "sha1-odePw6UEdMuAhF07O24dpJpEbo4="
|
||||||
|
},
|
||||||
"lcid": {
|
"lcid": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/lcid/-/lcid-2.0.0.tgz",
|
||||||
@ -847,6 +920,16 @@
|
|||||||
"p-is-promise": "^2.0.0"
|
"p-is-promise": "^2.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"merge-deep": {
|
||||||
|
"version": "3.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.2.tgz",
|
||||||
|
"integrity": "sha512-T7qC8kg4Zoti1cFd8Cr0M+qaZfOwjlPDEdZIIPPB2JZctjaPM4fX+i7HOId69tAti2fvO6X5ldfYUONDODsrkA==",
|
||||||
|
"requires": {
|
||||||
|
"arr-union": "^3.1.0",
|
||||||
|
"clone-deep": "^0.2.4",
|
||||||
|
"kind-of": "^3.0.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"mime": {
|
"mime": {
|
||||||
"version": "2.4.4",
|
"version": "2.4.4",
|
||||||
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
|
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
|
||||||
@ -876,6 +959,22 @@
|
|||||||
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
|
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
|
||||||
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
|
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
|
||||||
},
|
},
|
||||||
|
"mixin-object": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz",
|
||||||
|
"integrity": "sha1-T7lJRB2rGCVA8f4DW6YOGUel5X4=",
|
||||||
|
"requires": {
|
||||||
|
"for-in": "^0.1.3",
|
||||||
|
"is-extendable": "^0.1.1"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"for-in": {
|
||||||
|
"version": "0.1.8",
|
||||||
|
"resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz",
|
||||||
|
"integrity": "sha1-2Hc5COMSVhCZUrH9ubP6hn0ndeE="
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"mkdirp": {
|
"mkdirp": {
|
||||||
"version": "0.5.1",
|
"version": "0.5.1",
|
||||||
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
|
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
|
||||||
@ -1155,9 +1254,9 @@
|
|||||||
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
|
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
|
||||||
},
|
},
|
||||||
"process-nextick-args": {
|
"process-nextick-args": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
|
||||||
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw=="
|
"integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag=="
|
||||||
},
|
},
|
||||||
"progress": {
|
"progress": {
|
||||||
"version": "2.0.3",
|
"version": "2.0.3",
|
||||||
@ -1189,9 +1288,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"puppeteer": {
|
"puppeteer": {
|
||||||
"version": "1.17.0",
|
"version": "1.18.1",
|
||||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.17.0.tgz",
|
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.18.1.tgz",
|
||||||
"integrity": "sha512-3EXZSximCzxuVKpIHtyec8Wm2dWZn1fc5tQi34qWfiUgubEVYHjUvr0GOJojqf3mifI6oyKnCdrGxaOI+lWReA==",
|
"integrity": "sha512-luUy0HPSuWPsPZ1wAp6NinE0zgetWtudf5zwZ6dHjMWfYpTQcmKveFRox7VBNhQ98OjNA9PQ9PzQyX8k/KrxTg==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"debug": "^4.1.0",
|
"debug": "^4.1.0",
|
||||||
"extract-zip": "^1.6.6",
|
"extract-zip": "^1.6.6",
|
||||||
@ -1211,6 +1310,83 @@
|
|||||||
"debug": "^4.1.1"
|
"debug": "^4.1.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"puppeteer-extra": {
|
||||||
|
"version": "2.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
||||||
|
"integrity": "sha512-RjQp3BkjzkY8JgTcHUsu4RdFcqE3AdTzUquRC9WBUZXRXbSgSXI1CtEmNG7OcxorScXNUTKEbY7Z13xtQVkHnQ==",
|
||||||
|
"requires": {
|
||||||
|
"debug": "^3.1.0",
|
||||||
|
"deepmerge": "^2.1.0"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"debug": {
|
||||||
|
"version": "3.2.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||||
|
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||||
|
"requires": {
|
||||||
|
"ms": "^2.1.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"puppeteer-extra-plugin": {
|
||||||
|
"version": "3.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.0.4.tgz",
|
||||||
|
"integrity": "sha512-qgb5pmyNWI64PHtvRG1MjxEL7S3wAouDeqWklpg+/1jnxsbhRJsoi91SEg4U4Ji+rEOn28kTVM9O4KTXJ1PQ1Q==",
|
||||||
|
"requires": {
|
||||||
|
"debug": "^3.1.0",
|
||||||
|
"merge-deep": "^3.0.1"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"debug": {
|
||||||
|
"version": "3.2.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||||
|
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||||
|
"requires": {
|
||||||
|
"ms": "^2.1.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"puppeteer-extra-plugin-anonymize-ua": {
|
||||||
|
"version": "2.1.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-anonymize-ua/-/puppeteer-extra-plugin-anonymize-ua-2.1.4.tgz",
|
||||||
|
"integrity": "sha512-AAxicMHX3AbsBvWgMQEqaIy7vqH+pzzh/HZ4+bdXiZExPm1dTUNq7Kmvog2xfY4bdQO+bQkk+Qtfr1dOQ0KRyg==",
|
||||||
|
"requires": {
|
||||||
|
"debug": "^3.1.0",
|
||||||
|
"puppeteer-extra-plugin": "^3.0.4"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"debug": {
|
||||||
|
"version": "3.2.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||||
|
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||||
|
"requires": {
|
||||||
|
"ms": "^2.1.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"puppeteer-extra-plugin-stealth": {
|
||||||
|
"version": "2.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.2.2.tgz",
|
||||||
|
"integrity": "sha512-tVf+M0U4VAht/q2vNlf1eGGeCFzK9q0vTHnuCEXHf06mu7ukUe1J060s6kij5r7Fv51UsTcXnMa069oSXvYSOw==",
|
||||||
|
"requires": {
|
||||||
|
"debug": "^3.1.0",
|
||||||
|
"puppeteer-extra-plugin": "^3.0.4",
|
||||||
|
"puppeteer-extra-plugin-anonymize-ua": "^2.1.4"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"debug": {
|
||||||
|
"version": "3.2.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
|
||||||
|
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
|
||||||
|
"requires": {
|
||||||
|
"ms": "^2.1.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"readable-stream": {
|
"readable-stream": {
|
||||||
"version": "3.1.1",
|
"version": "3.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
|
||||||
@ -1266,6 +1442,37 @@
|
|||||||
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=",
|
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"shallow-clone": {
|
||||||
|
"version": "0.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
|
||||||
|
"integrity": "sha1-WQnodLp3EG1zrEFM/sH/yofZcGA=",
|
||||||
|
"requires": {
|
||||||
|
"is-extendable": "^0.1.1",
|
||||||
|
"kind-of": "^2.0.1",
|
||||||
|
"lazy-cache": "^0.2.3",
|
||||||
|
"mixin-object": "^2.0.1"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"is-buffer": {
|
||||||
|
"version": "1.1.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
|
||||||
|
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
|
||||||
|
},
|
||||||
|
"kind-of": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz",
|
||||||
|
"integrity": "sha1-AY7HpM5+OobLkUG+UZ0kyPqpgbU=",
|
||||||
|
"requires": {
|
||||||
|
"is-buffer": "^1.0.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"lazy-cache": {
|
||||||
|
"version": "0.2.7",
|
||||||
|
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz",
|
||||||
|
"integrity": "sha1-f+3fLctu23fRHvHRF6tf/fCrG2U="
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"shebang-command": {
|
"shebang-command": {
|
||||||
"version": "1.2.0",
|
"version": "1.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.4.0",
|
"version": "1.4.1",
|
||||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
@ -26,8 +26,10 @@
|
|||||||
"got": "^9.6.0",
|
"got": "^9.6.0",
|
||||||
"lodash": "^4.17.14",
|
"lodash": "^4.17.14",
|
||||||
"proxy-chain": "^0.2.7",
|
"proxy-chain": "^0.2.7",
|
||||||
"puppeteer": "^1.17.0",
|
"puppeteer": "^1.18.1",
|
||||||
"puppeteer-cluster": "^0.13.0",
|
"puppeteer-cluster": "^0.13.0",
|
||||||
|
"puppeteer-extra": "^2.1.3",
|
||||||
|
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||||
"user-agents": "^1.0.321"
|
"user-agents": "^1.0.321"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
@ -29,11 +29,8 @@ module.exports = class Scraper {
|
|||||||
this.keywords = config.keywords;
|
this.keywords = config.keywords;
|
||||||
|
|
||||||
this.STANDARD_TIMEOUT = 10000;
|
this.STANDARD_TIMEOUT = 10000;
|
||||||
// longer timeout when using proxies
|
|
||||||
this.PROXY_TIMEOUT = 15000;
|
|
||||||
this.SOLVE_CAPTCHA_TIME = 45000;
|
this.SOLVE_CAPTCHA_TIME = 45000;
|
||||||
|
|
||||||
this.html_output = {};
|
|
||||||
this.results = {};
|
this.results = {};
|
||||||
this.result_rank = 1;
|
this.result_rank = 1;
|
||||||
// keep track of the requests done
|
// keep track of the requests done
|
||||||
@ -52,7 +49,6 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async run({page, data}) {
|
async run({page, data}) {
|
||||||
@ -103,12 +99,10 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
if (this.config.test_evasion === true) {
|
if (this.config.test_evasion === true) {
|
||||||
// Navigate to the page that will perform the tests.
|
// Navigate to the page that will perform the tests.
|
||||||
const testUrl = 'https://intoli.com/blog/' +
|
const testUrl = 'https://bot.sannysoft.com';
|
||||||
'not-possible-to-block-chrome-headless/chrome-headless-test.html';
|
|
||||||
await this.page.goto(testUrl);
|
await this.page.goto(testUrl);
|
||||||
|
|
||||||
// Save a screenshot of the results.
|
// Save a screenshot of the results.
|
||||||
await this.page.screenshot({path: 'headless-test-result.png'});
|
await this.page.screenshot({path: 'headless-evasion-result.png'});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.log_http_headers === true) {
|
if (this.config.log_http_headers === true) {
|
||||||
@ -196,6 +190,13 @@ module.exports = class Scraper {
|
|||||||
this.results[keyword][this.page_num].html = html;
|
this.results[keyword][this.page_num].html = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.config.screen_output) {
|
||||||
|
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
|
||||||
|
encoding: 'base64',
|
||||||
|
fullPage: false,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
this.page_num += 1;
|
this.page_num += 1;
|
||||||
|
|
||||||
// only load the next page when we will pass the next iteration
|
// only load the next page when we will pass the next iteration
|
||||||
|
@ -108,6 +108,8 @@ class ScrapeManager {
|
|||||||
output_file: '',
|
output_file: '',
|
||||||
// whether to also passthru all the html output of the serp pages
|
// whether to also passthru all the html output of the serp pages
|
||||||
html_output: false,
|
html_output: false,
|
||||||
|
// whether to return a screenshot of serp pages as b64 data
|
||||||
|
screen_output: false,
|
||||||
// whether to prevent images, css, fonts and media from being loaded
|
// whether to prevent images, css, fonts and media from being loaded
|
||||||
// will speed up scraping a great deal
|
// will speed up scraping a great deal
|
||||||
block_assets: true,
|
block_assets: true,
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 1fcf7f9a23c5a9abb491157e505b475f7aea72e5
|
Subproject commit 03c9a764298f3f55b46bace810f4d3b2e1cb3266
|
Loading…
x
Reference in New Issue
Block a user