forked from extern/se-scraper
refactor(log): remove common.js, use winston and debug
This commit is contained in:
parent
b4a86fcc51
commit
bcd181111b
@ -32,12 +32,6 @@ let browser_config = {
|
|||||||
verbose: true,
|
verbose: true,
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: true,
|
headless: true,
|
||||||
// whether debug information should be printed
|
|
||||||
// level 0: print nothing
|
|
||||||
// level 1: print most important info
|
|
||||||
// ...
|
|
||||||
// level 4: print all shit nobody wants to know
|
|
||||||
debug_level: 1,
|
|
||||||
is_local: false,
|
is_local: false,
|
||||||
throw_on_detection: false,
|
throw_on_detection: false,
|
||||||
puppeteer_cluster_config: {
|
puppeteer_cluster_config: {
|
||||||
|
@ -30,12 +30,6 @@ let browser_config = {
|
|||||||
// if random_user_agent is set to True, a random user agent is chosen
|
// if random_user_agent is set to True, a random user agent is chosen
|
||||||
random_user_agent: true,
|
random_user_agent: true,
|
||||||
headless: true,
|
headless: true,
|
||||||
// whether debug information should be printed
|
|
||||||
// level 0: print nothing
|
|
||||||
// level 1: print most important info
|
|
||||||
// ...
|
|
||||||
// level 4: print all shit nobody wants to know
|
|
||||||
debug_level: 1,
|
|
||||||
is_local: false,
|
is_local: false,
|
||||||
throw_on_detection: false,
|
throw_on_detection: false,
|
||||||
puppeteer_cluster_config: {
|
puppeteer_cluster_config: {
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
output_file: '',
|
output_file: '',
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 2,
|
|
||||||
output_file: 'examples/results/gnold.json',
|
output_file: 'examples/results/gnold.json',
|
||||||
google_news_old_settings: {
|
google_news_old_settings: {
|
||||||
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
output_file: 'examples/results/maps.json',
|
output_file: 'examples/results/maps.json',
|
||||||
test_evasion: false,
|
test_evasion: false,
|
||||||
block_assets: false,
|
block_assets: false,
|
||||||
|
@ -3,7 +3,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
search_engine: 'google',
|
search_engine: 'google',
|
||||||
debug_level: 1,
|
|
||||||
random_user_agent: true,
|
random_user_agent: true,
|
||||||
is_local: false,
|
is_local: false,
|
||||||
html_output: false,
|
html_output: false,
|
||||||
|
@ -5,7 +5,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
random_user_agent: true,
|
random_user_agent: true,
|
||||||
write_meta_data: true,
|
write_meta_data: true,
|
||||||
sleep_range: '[1,1]',
|
sleep_range: '[1,1]',
|
||||||
debug_level: 1,
|
|
||||||
headless: true,
|
headless: true,
|
||||||
output_file: `examples/results/multiple_search_engines.json`
|
output_file: `examples/results/multiple_search_engines.json`
|
||||||
};
|
};
|
||||||
|
@ -3,7 +3,6 @@ const resolve = require('path').resolve;
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
test_evasion: false,
|
test_evasion: false,
|
||||||
log_http_headers: true,
|
log_http_headers: true,
|
||||||
log_ip_address: true,
|
log_ip_address: true,
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
output_file: 'examples/results/proxyresults.json',
|
output_file: 'examples/results/proxyresults.json',
|
||||||
log_ip_address: true,
|
log_ip_address: true,
|
||||||
// a file with one proxy per line. Example:
|
// a file with one proxy per line. Example:
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
test_evasion: false,
|
test_evasion: false,
|
||||||
log_http_headers: false,
|
log_http_headers: false,
|
||||||
log_ip_address: false,
|
log_ip_address: false,
|
||||||
|
@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
debug_level: 1,
|
|
||||||
output_file: 'examples/results/data.json',
|
output_file: 'examples/results/data.json',
|
||||||
};
|
};
|
||||||
|
|
||||||
|
187
package-lock.json
generated
187
package-lock.json
generated
@ -71,6 +71,14 @@
|
|||||||
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==",
|
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"async": {
|
||||||
|
"version": "2.6.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
|
||||||
|
"integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
|
||||||
|
"requires": {
|
||||||
|
"lodash": "^4.17.14"
|
||||||
|
}
|
||||||
|
},
|
||||||
"async-limiter": {
|
"async-limiter": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz",
|
||||||
@ -224,11 +232,19 @@
|
|||||||
"integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=",
|
"integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"color": {
|
||||||
|
"version": "3.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/color/-/color-3.0.0.tgz",
|
||||||
|
"integrity": "sha512-jCpd5+s0s0t7p3pHQKpnJ0TpQKKdleP71LWcA0aqiljpiuAkOSUFN/dyH8ZwF0hRmFlrIuRhufds1QyEP9EB+w==",
|
||||||
|
"requires": {
|
||||||
|
"color-convert": "^1.9.1",
|
||||||
|
"color-string": "^1.5.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"color-convert": {
|
"color-convert": {
|
||||||
"version": "1.9.3",
|
"version": "1.9.3",
|
||||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
|
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
|
||||||
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
|
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
|
||||||
"dev": true,
|
|
||||||
"requires": {
|
"requires": {
|
||||||
"color-name": "1.1.3"
|
"color-name": "1.1.3"
|
||||||
}
|
}
|
||||||
@ -236,8 +252,35 @@
|
|||||||
"color-name": {
|
"color-name": {
|
||||||
"version": "1.1.3",
|
"version": "1.1.3",
|
||||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
||||||
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=",
|
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU="
|
||||||
"dev": true
|
},
|
||||||
|
"color-string": {
|
||||||
|
"version": "1.5.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.5.3.tgz",
|
||||||
|
"integrity": "sha512-dC2C5qeWoYkxki5UAXapdjqO672AM4vZuPGRQfO8b5HKuKGBbKWpITyDYN7TOFKvRW7kOgAn3746clDBMDJyQw==",
|
||||||
|
"requires": {
|
||||||
|
"color-name": "^1.0.0",
|
||||||
|
"simple-swizzle": "^0.2.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"colornames": {
|
||||||
|
"version": "1.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/colornames/-/colornames-1.1.1.tgz",
|
||||||
|
"integrity": "sha1-+IiQMGhcfE/54qVZ9Qd+t2qBb5Y="
|
||||||
|
},
|
||||||
|
"colors": {
|
||||||
|
"version": "1.4.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz",
|
||||||
|
"integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA=="
|
||||||
|
},
|
||||||
|
"colorspace": {
|
||||||
|
"version": "1.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.2.tgz",
|
||||||
|
"integrity": "sha512-vt+OoIP2d76xLhjwbBaucYlNSpPsrJWPlBTtwCpQKIu6/CSMutyzX93O/Do0qzpH3YoHEes8YEFXyZ797rEhzQ==",
|
||||||
|
"requires": {
|
||||||
|
"color": "3.0.x",
|
||||||
|
"text-hex": "1.0.x"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"concat-map": {
|
"concat-map": {
|
||||||
"version": "0.0.1",
|
"version": "0.0.1",
|
||||||
@ -363,6 +406,16 @@
|
|||||||
"object-keys": "^1.0.12"
|
"object-keys": "^1.0.12"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"diagnostics": {
|
||||||
|
"version": "1.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/diagnostics/-/diagnostics-1.1.1.tgz",
|
||||||
|
"integrity": "sha512-8wn1PmdunLJ9Tqbx+Fx/ZEuHfJf4NKSN2ZBj7SJC/OWRWha843+WsTjqMe1B5E3p28jqBlp+mJ2fPVxPyNgYKQ==",
|
||||||
|
"requires": {
|
||||||
|
"colorspace": "1.1.x",
|
||||||
|
"enabled": "1.0.x",
|
||||||
|
"kuler": "1.0.x"
|
||||||
|
}
|
||||||
|
},
|
||||||
"diff": {
|
"diff": {
|
||||||
"version": "3.5.0",
|
"version": "3.5.0",
|
||||||
"resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz",
|
"resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz",
|
||||||
@ -425,6 +478,14 @@
|
|||||||
"integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
|
"integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"enabled": {
|
||||||
|
"version": "1.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/enabled/-/enabled-1.0.2.tgz",
|
||||||
|
"integrity": "sha1-ll9lE9LC0cX0ZStkouM5ZGf8L5M=",
|
||||||
|
"requires": {
|
||||||
|
"env-variable": "0.0.x"
|
||||||
|
}
|
||||||
|
},
|
||||||
"end-of-stream": {
|
"end-of-stream": {
|
||||||
"version": "1.4.1",
|
"version": "1.4.1",
|
||||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
|
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
|
||||||
@ -438,6 +499,11 @@
|
|||||||
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
|
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
|
||||||
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
|
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
|
||||||
},
|
},
|
||||||
|
"env-variable": {
|
||||||
|
"version": "0.0.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/env-variable/-/env-variable-0.0.5.tgz",
|
||||||
|
"integrity": "sha512-zoB603vQReOFvTg5xMl9I1P2PnHsHQQKTEowsKKD7nseUfJq6UWzK+4YtlWUO1nhiQUxe6XMkk+JleSZD1NZFA=="
|
||||||
|
},
|
||||||
"es-abstract": {
|
"es-abstract": {
|
||||||
"version": "1.13.0",
|
"version": "1.13.0",
|
||||||
"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz",
|
"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz",
|
||||||
@ -529,6 +595,11 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"fast-safe-stringify": {
|
||||||
|
"version": "2.0.7",
|
||||||
|
"resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz",
|
||||||
|
"integrity": "sha512-Utm6CdzT+6xsDk2m8S6uL8VHxNwI6Jub+e9NYTcAms28T84pTa25GJQV9j0CY0N1rM8hK4x6grpF2BQf+2qwVA=="
|
||||||
|
},
|
||||||
"fd-slicer": {
|
"fd-slicer": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
|
||||||
@ -537,6 +608,11 @@
|
|||||||
"pend": "~1.2.0"
|
"pend": "~1.2.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"fecha": {
|
||||||
|
"version": "2.3.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/fecha/-/fecha-2.3.3.tgz",
|
||||||
|
"integrity": "sha512-lUGBnIamTAwk4znq5BcqsDaxSmZ9nDVJaij6NvRt/Tg4R69gERA+otPKbS86ROw9nxVMw2/mp1fnaiWqbs6Sdg=="
|
||||||
|
},
|
||||||
"find-up": {
|
"find-up": {
|
||||||
"version": "3.0.0",
|
"version": "3.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz",
|
||||||
@ -720,6 +796,11 @@
|
|||||||
"integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==",
|
"integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"is-arrayish": {
|
||||||
|
"version": "0.3.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
|
||||||
|
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
|
||||||
|
},
|
||||||
"is-buffer": {
|
"is-buffer": {
|
||||||
"version": "2.0.3",
|
"version": "2.0.3",
|
||||||
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz",
|
||||||
@ -769,8 +850,7 @@
|
|||||||
"is-stream": {
|
"is-stream": {
|
||||||
"version": "1.1.0",
|
"version": "1.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz",
|
||||||
"integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=",
|
"integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ="
|
||||||
"dev": true
|
|
||||||
},
|
},
|
||||||
"is-symbol": {
|
"is-symbol": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
@ -835,6 +915,14 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"kuler": {
|
||||||
|
"version": "1.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/kuler/-/kuler-1.0.1.tgz",
|
||||||
|
"integrity": "sha512-J9nVUucG1p/skKul6DU3PUZrhs0LPulNaeUOox0IyXDi8S4CztTHs1gQphhuZmzXG7VOQSf6NJfKuzteQLv9gQ==",
|
||||||
|
"requires": {
|
||||||
|
"colornames": "^1.1.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"lazy-cache": {
|
"lazy-cache": {
|
||||||
"version": "1.0.4",
|
"version": "1.0.4",
|
||||||
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
|
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
|
||||||
@ -878,6 +966,18 @@
|
|||||||
"chalk": "^2.0.1"
|
"chalk": "^2.0.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"logform": {
|
||||||
|
"version": "2.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/logform/-/logform-2.1.2.tgz",
|
||||||
|
"integrity": "sha512-+lZh4OpERDBLqjiwDLpAWNQu6KMjnlXH2ByZwCuSqVPJletw0kTWJf5CgSNAUKn1KUkv3m2cUz/LK8zyEy7wzQ==",
|
||||||
|
"requires": {
|
||||||
|
"colors": "^1.2.1",
|
||||||
|
"fast-safe-stringify": "^2.0.4",
|
||||||
|
"fecha": "^2.3.3",
|
||||||
|
"ms": "^2.1.1",
|
||||||
|
"triple-beam": "^1.3.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"lowercase-keys": {
|
"lowercase-keys": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
|
||||||
@ -1107,6 +1207,11 @@
|
|||||||
"wrappy": "1"
|
"wrappy": "1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"one-time": {
|
||||||
|
"version": "0.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
|
||||||
|
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
|
||||||
|
},
|
||||||
"os-locale": {
|
"os-locale": {
|
||||||
"version": "3.1.0",
|
"version": "3.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
|
||||||
@ -1429,12 +1534,25 @@
|
|||||||
"integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=",
|
"integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"simple-swizzle": {
|
||||||
|
"version": "0.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
|
||||||
|
"integrity": "sha1-pNprY1/8zMoz9w0Xy5JZLeleVXo=",
|
||||||
|
"requires": {
|
||||||
|
"is-arrayish": "^0.3.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"sprintf-js": {
|
"sprintf-js": {
|
||||||
"version": "1.0.3",
|
"version": "1.0.3",
|
||||||
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
|
||||||
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
|
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"stack-trace": {
|
||||||
|
"version": "0.0.10",
|
||||||
|
"resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
|
||||||
|
"integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA="
|
||||||
|
},
|
||||||
"string-width": {
|
"string-width": {
|
||||||
"version": "2.1.1",
|
"version": "2.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
|
||||||
@ -1490,11 +1608,21 @@
|
|||||||
"has-flag": "^3.0.0"
|
"has-flag": "^3.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"text-hex": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
|
||||||
|
},
|
||||||
"to-readable-stream": {
|
"to-readable-stream": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
|
||||||
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
|
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
|
||||||
},
|
},
|
||||||
|
"triple-beam": {
|
||||||
|
"version": "1.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.3.0.tgz",
|
||||||
|
"integrity": "sha512-XrHUvV5HpdLmIj4uVMxHggLbFSZYIn7HEWsqePZcI50pco+MPqJ50wMGY794X7AOOhxOBAjbkqfAbEe/QMp2Lw=="
|
||||||
|
},
|
||||||
"type-detect": {
|
"type-detect": {
|
||||||
"version": "4.0.8",
|
"version": "4.0.8",
|
||||||
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
|
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
|
||||||
@ -1565,6 +1693,55 @@
|
|||||||
"string-width": "^1.0.2 || 2"
|
"string-width": "^1.0.2 || 2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"winston": {
|
||||||
|
"version": "3.2.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/winston/-/winston-3.2.1.tgz",
|
||||||
|
"integrity": "sha512-zU6vgnS9dAWCEKg/QYigd6cgMVVNwyTzKs81XZtTFuRwJOcDdBg7AU0mXVyNbs7O5RH2zdv+BdNZUlx7mXPuOw==",
|
||||||
|
"requires": {
|
||||||
|
"async": "^2.6.1",
|
||||||
|
"diagnostics": "^1.1.1",
|
||||||
|
"is-stream": "^1.1.0",
|
||||||
|
"logform": "^2.1.1",
|
||||||
|
"one-time": "0.0.4",
|
||||||
|
"readable-stream": "^3.1.1",
|
||||||
|
"stack-trace": "0.0.x",
|
||||||
|
"triple-beam": "^1.3.0",
|
||||||
|
"winston-transport": "^4.3.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"winston-transport": {
|
||||||
|
"version": "4.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.3.0.tgz",
|
||||||
|
"integrity": "sha512-B2wPuwUi3vhzn/51Uukcao4dIduEiPOcOt9HJ3QeaXgkJ5Z7UwpBzxS4ZGNHtrxrUvTwemsQiSys0ihOf8Mp1A==",
|
||||||
|
"requires": {
|
||||||
|
"readable-stream": "^2.3.6",
|
||||||
|
"triple-beam": "^1.2.0"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"readable-stream": {
|
||||||
|
"version": "2.3.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||||
|
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
|
||||||
|
"requires": {
|
||||||
|
"core-util-is": "~1.0.0",
|
||||||
|
"inherits": "~2.0.3",
|
||||||
|
"isarray": "~1.0.0",
|
||||||
|
"process-nextick-args": "~2.0.0",
|
||||||
|
"safe-buffer": "~5.1.1",
|
||||||
|
"string_decoder": "~1.1.1",
|
||||||
|
"util-deprecate": "~1.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"string_decoder": {
|
||||||
|
"version": "1.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||||
|
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
||||||
|
"requires": {
|
||||||
|
"safe-buffer": "~5.1.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"wrap-ansi": {
|
"wrap-ansi": {
|
||||||
"version": "2.1.0",
|
"version": "2.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",
|
||||||
|
@ -29,7 +29,8 @@
|
|||||||
"puppeteer": "^2.0.0",
|
"puppeteer": "^2.0.0",
|
||||||
"puppeteer-extra": "^2.1.3",
|
"puppeteer-extra": "^2.1.3",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||||
"user-agents": "^1.0.378"
|
"user-agents": "^1.0.378",
|
||||||
|
"winston": "^3.2.1"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"chai": "^4.2.0",
|
"chai": "^4.2.0",
|
||||||
|
@ -1,21 +0,0 @@
|
|||||||
function log(config, loglevel, msg = null, cb = null) {
|
|
||||||
if (typeof loglevel != "number") {
|
|
||||||
throw Error('loglevel must be numeric.');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (loglevel <= config.debug_level) {
|
|
||||||
if (msg) {
|
|
||||||
if (typeof msg == 'object') {
|
|
||||||
console.dir(msg, {depth: null, colors: false});
|
|
||||||
} else {
|
|
||||||
console.log('[i] ' + msg);
|
|
||||||
}
|
|
||||||
} else if (cb) {
|
|
||||||
cb();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
module.exports = {
|
|
||||||
log: log,
|
|
||||||
};
|
|
@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const Scraper = require('./se_scraper');
|
const Scraper = require('./se_scraper');
|
||||||
const common = require('./common.js');
|
|
||||||
var log = common.log;
|
|
||||||
|
|
||||||
class GoogleScraper extends Scraper {
|
class GoogleScraper extends Scraper {
|
||||||
|
|
||||||
@ -243,7 +241,7 @@ class GoogleScraper extends Scraper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log(this.config, 1, 'Using startUrl: ' + startUrl);
|
this.logger.info('Using startUrl: ' + startUrl);
|
||||||
|
|
||||||
this.last_response = await this.page.goto(startUrl);
|
this.last_response = await this.page.goto(startUrl);
|
||||||
|
|
||||||
@ -642,7 +640,7 @@ class GoogleMapsScraper extends Scraper {
|
|||||||
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
|
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
|
||||||
}
|
}
|
||||||
|
|
||||||
log(this.config, 1, 'Using startUrl: ' + startUrl);
|
this.logger.info('Using startUrl: ' + startUrl);
|
||||||
|
|
||||||
this.last_response = await this.page.goto(startUrl);
|
this.last_response = await this.page.goto(startUrl);
|
||||||
|
|
||||||
@ -681,7 +679,7 @@ class GoogleMapsScraper extends Scraper {
|
|||||||
|
|
||||||
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
|
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
|
||||||
|
|
||||||
log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
|
this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`);
|
||||||
|
|
||||||
await this.page.waitForFunction((last_title) => {
|
await this.page.waitForFunction((last_title) => {
|
||||||
const res = document.querySelectorAll('.section-result .section-result-title span');
|
const res = document.querySelectorAll('.section-result .section-result-title span');
|
||||||
@ -775,7 +773,7 @@ class GoogleShoppingScraper extends Scraper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log(this.config, 1, 'Using startUrl: ' + startUrl);
|
this.logger.info('Using startUrl: ' + startUrl);
|
||||||
|
|
||||||
this.last_response = await this.page.goto(startUrl);
|
this.last_response = await this.page.goto(startUrl);
|
||||||
|
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
const meta = require('./metadata.js');
|
const meta = require('./metadata.js');
|
||||||
const common = require('./common.js');
|
const debug = require('debug')('se-scraper:Scraper');
|
||||||
var log = common.log;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Get useful JS knowledge and get awesome...
|
Get useful JS knowledge and get awesome...
|
||||||
|
|
||||||
@ -26,6 +24,7 @@ module.exports = class Scraper {
|
|||||||
};
|
};
|
||||||
this.pluggable = pluggable;
|
this.pluggable = pluggable;
|
||||||
this.config = config;
|
this.config = config;
|
||||||
|
this.logger = this.config.logger;
|
||||||
this.context = context;
|
this.context = context;
|
||||||
|
|
||||||
this.proxy = config.proxy;
|
this.proxy = config.proxy;
|
||||||
@ -113,25 +112,25 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
if (this.config.log_http_headers === true) {
|
if (this.config.log_http_headers === true) {
|
||||||
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
||||||
log(this.config, 2, this.metadata.http_headers);
|
debug('this.metadata.http_headers=%O', this.metadata.http_headers);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.log_ip_address === true) {
|
if (this.config.log_ip_address === true) {
|
||||||
let ipinfo = await meta.get_ip_data(this.page);
|
let ipinfo = await meta.get_ip_data(this.page);
|
||||||
this.metadata.ipinfo = ipinfo;
|
this.metadata.ipinfo = ipinfo;
|
||||||
log(this.config, 2, this.metadata.ipinfo);
|
debug('this.metadata.ipinfo', this.metadata.ipinfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// check that our proxy is working by confirming
|
// check that our proxy is working by confirming
|
||||||
// that ipinfo.io sees the proxy IP address
|
// that ipinfo.io sees the proxy IP address
|
||||||
if (this.proxy && this.config.log_ip_address === true) {
|
if (this.proxy && this.config.log_ip_address === true) {
|
||||||
log(this.config, 3, `${this.metadata.ipinfo.ip} vs ${this.proxy}`);
|
debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);
|
||||||
|
|
||||||
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
|
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
|
||||||
if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
|
if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
|
||||||
throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`);
|
throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`);
|
||||||
} else {
|
} else {
|
||||||
log(this.config, 1, `Using valid Proxy: ${this.proxy}`);
|
this.logger.info(`Using valid Proxy: ${this.proxy}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -179,7 +178,7 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
do {
|
do {
|
||||||
|
|
||||||
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
|
this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
|
||||||
|
|
||||||
await this.wait_for_results();
|
await this.wait_for_results();
|
||||||
|
|
||||||
@ -263,28 +262,21 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
|
||||||
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);
|
this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
|
||||||
|
debug('this.last_response=%O', this.last_response);
|
||||||
|
|
||||||
if (this.last_response) {
|
if (this.config.take_screenshot_on_error) {
|
||||||
log(this.config, 2, this.last_response);
|
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
|
||||||
}
|
|
||||||
|
|
||||||
if (this.config.debug_level > 2) {
|
|
||||||
try {
|
|
||||||
// Try to save a screenshot of the error
|
|
||||||
await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`});
|
|
||||||
} catch (e) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
this.metadata.scraping_detected = await this.detected();
|
this.metadata.scraping_detected = await this.detected();
|
||||||
|
|
||||||
if (this.metadata.scraping_detected === true) {
|
if (this.metadata.scraping_detected === true) {
|
||||||
console.error(`${this.config.search_engine_name} detected the scraping!`);
|
this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);
|
||||||
|
|
||||||
if (this.config.is_local === true) {
|
if (this.config.is_local === true) {
|
||||||
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
||||||
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
|
this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
|
||||||
// expect that user filled out necessary captcha
|
// expect that user filled out necessary captcha
|
||||||
} else {
|
} else {
|
||||||
if (this.config.throw_on_detection === true) {
|
if (this.config.throw_on_detection === true) {
|
||||||
@ -318,7 +310,7 @@ module.exports = class Scraper {
|
|||||||
baseUrl += `${key}=${settings[key]}&`
|
baseUrl += `${key}=${settings[key]}&`
|
||||||
}
|
}
|
||||||
|
|
||||||
log(this.config, 1, 'Using startUrl: ' + baseUrl);
|
this.logger.info('Using startUrl: ' + baseUrl);
|
||||||
|
|
||||||
return baseUrl;
|
return baseUrl;
|
||||||
}
|
}
|
||||||
@ -335,7 +327,7 @@ module.exports = class Scraper {
|
|||||||
async random_sleep() {
|
async random_sleep() {
|
||||||
const [min, max] = this.config.sleep_range;
|
const [min, max] = this.config.sleep_range;
|
||||||
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
|
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
|
||||||
log(this.config, 1, `Sleeping for ${rand}s`);
|
this.logger.info(`Sleeping for ${rand}s`);
|
||||||
await this.sleep(rand * 1000);
|
await this.sleep(rand * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -349,7 +341,7 @@ module.exports = class Scraper {
|
|||||||
no_results(needles, html) {
|
no_results(needles, html) {
|
||||||
for (let needle of needles) {
|
for (let needle of needles) {
|
||||||
if (html.includes(needle)) {
|
if (html.includes(needle)) {
|
||||||
console.log(this.config, 2, `HTML contains needle ${needle}. no_results=true`);
|
this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
const Scraper = require('./se_scraper');
|
const Scraper = require('./se_scraper');
|
||||||
const common = require('./common.js');
|
|
||||||
var log = common.log;
|
|
||||||
|
|
||||||
class YandexScraper extends Scraper {
|
class YandexScraper extends Scraper {
|
||||||
|
|
||||||
@ -75,7 +73,7 @@ class YandexScraper extends Scraper {
|
|||||||
async load_start_page() {
|
async load_start_page() {
|
||||||
let startUrl = 'https://yandex.com';
|
let startUrl = 'https://yandex.com';
|
||||||
|
|
||||||
log(this.config, 1, 'Using startUrl: ' + startUrl);
|
this.logger.info('Using startUrl: ' + startUrl);
|
||||||
|
|
||||||
this.last_response = await this.page.goto(startUrl);
|
this.last_response = await this.page.goto(startUrl);
|
||||||
|
|
||||||
|
@ -3,6 +3,9 @@
|
|||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const os = require('os');
|
const os = require('os');
|
||||||
const _ = require('lodash');
|
const _ = require('lodash');
|
||||||
|
const { createLogger, format, transports } = require('winston');
|
||||||
|
const { combine, timestamp, printf } = format;
|
||||||
|
const debug = require('debug')('se-scraper:ScrapeManager');
|
||||||
|
|
||||||
const UserAgent = require('user-agents');
|
const UserAgent = require('user-agents');
|
||||||
const google = require('./modules/google.js');
|
const google = require('./modules/google.js');
|
||||||
@ -11,8 +14,6 @@ const yandex = require('./modules/yandex.js');
|
|||||||
const infospace = require('./modules/infospace.js');
|
const infospace = require('./modules/infospace.js');
|
||||||
const duckduckgo = require('./modules/duckduckgo.js');
|
const duckduckgo = require('./modules/duckduckgo.js');
|
||||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
||||||
const common = require('./modules/common.js');
|
|
||||||
var log = common.log;
|
|
||||||
|
|
||||||
const MAX_ALLOWED_BROWSERS = 6;
|
const MAX_ALLOWED_BROWSERS = 6;
|
||||||
|
|
||||||
@ -81,12 +82,18 @@ class ScrapeManager {
|
|||||||
// which search engine to scrape
|
// which search engine to scrape
|
||||||
search_engine: 'google',
|
search_engine: 'google',
|
||||||
search_engine_name: 'google',
|
search_engine_name: 'google',
|
||||||
// whether debug information should be printed
|
logger: createLogger({
|
||||||
// level 0: print nothing
|
level: 'info',
|
||||||
// level 1: print most important info
|
format: combine(
|
||||||
// ...
|
timestamp(),
|
||||||
// level 4: print all shit nobody wants to know
|
printf(({ level, message, timestamp }) => {
|
||||||
debug_level: 1,
|
return `${timestamp} [${level}] ${message}`;
|
||||||
|
})
|
||||||
|
),
|
||||||
|
transports: [
|
||||||
|
new transports.Console()
|
||||||
|
]
|
||||||
|
}),
|
||||||
keywords: ['nodejs rocks',],
|
keywords: ['nodejs rocks',],
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: true,
|
headless: true,
|
||||||
@ -154,6 +161,8 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
this.logger = this.config.logger;
|
||||||
|
|
||||||
if (config.sleep_range) {
|
if (config.sleep_range) {
|
||||||
// parse an array
|
// parse an array
|
||||||
config.sleep_range = eval(config.sleep_range);
|
config.sleep_range = eval(config.sleep_range);
|
||||||
@ -168,16 +177,15 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.proxies && this.config.proxy_file) {
|
if (this.config.proxies && this.config.proxy_file) {
|
||||||
console.error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
|
throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fs.existsSync(this.config.proxy_file)) {
|
if (fs.existsSync(this.config.proxy_file)) {
|
||||||
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
|
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
|
||||||
log(this.config, 1, `${this.config.proxies.length} proxies read from file.`);
|
this.logger.info(`${this.config.proxies.length} proxies read from file.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
log(this.config, 2, this.config);
|
debug('this.config=%O', this.config);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -224,7 +232,7 @@ class ScrapeManager {
|
|||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
log(this.config, 2, `Using the following puppeteer configuration: ${launch_args}`);
|
debug('Using the following puppeteer configuration launch_args=%O', launch_args);
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.start_browser) {
|
if (this.pluggable && this.pluggable.start_browser) {
|
||||||
launch_args.config = this.config;
|
launch_args.config = this.config;
|
||||||
@ -256,7 +264,7 @@ class ScrapeManager {
|
|||||||
MAX_ALLOWED_BROWSERS
|
MAX_ALLOWED_BROWSERS
|
||||||
);
|
);
|
||||||
|
|
||||||
log(this.config, 1, `Using ${this.numClusters} clusters.`);
|
this.logger.info(`Using ${this.numClusters} clusters.`);
|
||||||
|
|
||||||
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
|
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
|
||||||
|
|
||||||
@ -279,9 +287,7 @@ class ScrapeManager {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.debug_level >= 2) {
|
debug('perBrowserOptions=%O', perBrowserOptions)
|
||||||
console.dir(perBrowserOptions)
|
|
||||||
}
|
|
||||||
|
|
||||||
this.cluster = await Cluster.launch({
|
this.cluster = await Cluster.launch({
|
||||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||||
@ -293,8 +299,8 @@ class ScrapeManager {
|
|||||||
});
|
});
|
||||||
|
|
||||||
this.cluster.on('taskerror', (err, data) => {
|
this.cluster.on('taskerror', (err, data) => {
|
||||||
console.log(`Error while scraping ${data}: ${err.message}`);
|
this.logger.error(`Error while scraping ${data}: ${err.message}`);
|
||||||
console.log(err);
|
debug('Error during cluster task', err);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -305,8 +311,7 @@ class ScrapeManager {
|
|||||||
async scrape(scrape_config = {}) {
|
async scrape(scrape_config = {}) {
|
||||||
|
|
||||||
if (!scrape_config.keywords && !scrape_config.keyword_file) {
|
if (!scrape_config.keywords && !scrape_config.keyword_file) {
|
||||||
console.error('Either keywords or keyword_file must be supplied to scrape()');
|
throw new Error('Either keywords or keyword_file must be supplied to scrape()');
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Object.assign(this.config, scrape_config);
|
Object.assign(this.config, scrape_config);
|
||||||
@ -318,10 +323,7 @@ class ScrapeManager {
|
|||||||
|
|
||||||
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
||||||
|
|
||||||
if (this.config.keywords && this.config.search_engine) {
|
this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
|
||||||
log(this.config, 1,
|
|
||||||
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.start_browser) {
|
if (this.pluggable && this.pluggable.start_browser) {
|
||||||
|
|
||||||
@ -385,8 +387,8 @@ class ScrapeManager {
|
|||||||
let timeDelta = Date.now() - startTime;
|
let timeDelta = Date.now() - startTime;
|
||||||
let ms_per_request = timeDelta/num_requests;
|
let ms_per_request = timeDelta/num_requests;
|
||||||
|
|
||||||
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||||
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
|
this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.handle_results) {
|
if (this.pluggable && this.pluggable.handle_results) {
|
||||||
await this.pluggable.handle_results(results);
|
await this.pluggable.handle_results(results);
|
||||||
@ -396,14 +398,14 @@ class ScrapeManager {
|
|||||||
metadata.ms_per_keyword = ms_per_request.toString();
|
metadata.ms_per_keyword = ms_per_request.toString();
|
||||||
metadata.num_requests = num_requests;
|
metadata.num_requests = num_requests;
|
||||||
|
|
||||||
log(this.config, 2, metadata);
|
debug('metadata=%O', metadata);
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.handle_metadata) {
|
if (this.pluggable && this.pluggable.handle_metadata) {
|
||||||
await this.pluggable.handle_metadata(metadata);
|
await this.pluggable.handle_metadata(metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.output_file) {
|
if (this.config.output_file) {
|
||||||
log(this.config, 1, `Writing results to ${this.config.output_file}`);
|
this.logger.info(`Writing results to ${this.config.output_file}`);
|
||||||
write_results(this.config.output_file, JSON.stringify(results, null, 4));
|
write_results(this.config.output_file, JSON.stringify(results, null, 4));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user