refactor(log): remove common.js, use winston and debug

This commit is contained in:
HugoPoi 2019-12-15 17:56:22 +01:00
parent b4a86fcc51
commit bcd181111b
18 changed files with 236 additions and 110 deletions

View File

@ -32,12 +32,6 @@ let browser_config = {
verbose: true, verbose: true,
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: true,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
is_local: false, is_local: false,
throw_on_detection: false, throw_on_detection: false,
puppeteer_cluster_config: { puppeteer_cluster_config: {

View File

@ -30,12 +30,6 @@ let browser_config = {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true, random_user_agent: true,
headless: true, headless: true,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
is_local: false, is_local: false,
throw_on_detection: false, throw_on_detection: false,
puppeteer_cluster_config: { puppeteer_cluster_config: {

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
output_file: '', output_file: '',
}; };

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 2,
output_file: 'examples/results/gnold.json', output_file: 'examples/results/gnold.json',
google_news_old_settings: { google_news_old_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query. gl: 'us', // The gl parameter determines the Google country to use for the query.

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
output_file: 'examples/results/maps.json', output_file: 'examples/results/maps.json',
test_evasion: false, test_evasion: false,
block_assets: false, block_assets: false,

View File

@ -3,7 +3,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
search_engine: 'google', search_engine: 'google',
debug_level: 1,
random_user_agent: true, random_user_agent: true,
is_local: false, is_local: false,
html_output: false, html_output: false,

View File

@ -5,7 +5,6 @@ const se_scraper = require('./../src/node_scraper.js');
random_user_agent: true, random_user_agent: true,
write_meta_data: true, write_meta_data: true,
sleep_range: '[1,1]', sleep_range: '[1,1]',
debug_level: 1,
headless: true, headless: true,
output_file: `examples/results/multiple_search_engines.json` output_file: `examples/results/multiple_search_engines.json`
}; };

View File

@ -3,7 +3,6 @@ const resolve = require('path').resolve;
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
test_evasion: false, test_evasion: false,
log_http_headers: true, log_http_headers: true,
log_ip_address: true, log_ip_address: true,

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
output_file: 'examples/results/proxyresults.json', output_file: 'examples/results/proxyresults.json',
log_ip_address: true, log_ip_address: true,
// a file with one proxy per line. Example: // a file with one proxy per line. Example:

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
test_evasion: false, test_evasion: false,
log_http_headers: false, log_http_headers: false,
log_ip_address: false, log_ip_address: false,

View File

@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 1,
output_file: 'examples/results/data.json', output_file: 'examples/results/data.json',
}; };

187
package-lock.json generated
View File

@ -71,6 +71,14 @@
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==",
"dev": true "dev": true
}, },
"async": {
"version": "2.6.3",
"resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
"integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
"requires": {
"lodash": "^4.17.14"
}
},
"async-limiter": { "async-limiter": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz",
@ -224,11 +232,19 @@
"integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=",
"dev": true "dev": true
}, },
"color": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/color/-/color-3.0.0.tgz",
"integrity": "sha512-jCpd5+s0s0t7p3pHQKpnJ0TpQKKdleP71LWcA0aqiljpiuAkOSUFN/dyH8ZwF0hRmFlrIuRhufds1QyEP9EB+w==",
"requires": {
"color-convert": "^1.9.1",
"color-string": "^1.5.2"
}
},
"color-convert": { "color-convert": {
"version": "1.9.3", "version": "1.9.3",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
"integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
"dev": true,
"requires": { "requires": {
"color-name": "1.1.3" "color-name": "1.1.3"
} }
@ -236,8 +252,35 @@
"color-name": { "color-name": {
"version": "1.1.3", "version": "1.1.3",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU="
"dev": true },
"color-string": {
"version": "1.5.3",
"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.5.3.tgz",
"integrity": "sha512-dC2C5qeWoYkxki5UAXapdjqO672AM4vZuPGRQfO8b5HKuKGBbKWpITyDYN7TOFKvRW7kOgAn3746clDBMDJyQw==",
"requires": {
"color-name": "^1.0.0",
"simple-swizzle": "^0.2.2"
}
},
"colornames": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/colornames/-/colornames-1.1.1.tgz",
"integrity": "sha1-+IiQMGhcfE/54qVZ9Qd+t2qBb5Y="
},
"colors": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz",
"integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA=="
},
"colorspace": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.2.tgz",
"integrity": "sha512-vt+OoIP2d76xLhjwbBaucYlNSpPsrJWPlBTtwCpQKIu6/CSMutyzX93O/Do0qzpH3YoHEes8YEFXyZ797rEhzQ==",
"requires": {
"color": "3.0.x",
"text-hex": "1.0.x"
}
}, },
"concat-map": { "concat-map": {
"version": "0.0.1", "version": "0.0.1",
@ -363,6 +406,16 @@
"object-keys": "^1.0.12" "object-keys": "^1.0.12"
} }
}, },
"diagnostics": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/diagnostics/-/diagnostics-1.1.1.tgz",
"integrity": "sha512-8wn1PmdunLJ9Tqbx+Fx/ZEuHfJf4NKSN2ZBj7SJC/OWRWha843+WsTjqMe1B5E3p28jqBlp+mJ2fPVxPyNgYKQ==",
"requires": {
"colorspace": "1.1.x",
"enabled": "1.0.x",
"kuler": "1.0.x"
}
},
"diff": { "diff": {
"version": "3.5.0", "version": "3.5.0",
"resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz", "resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz",
@ -425,6 +478,14 @@
"integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==",
"dev": true "dev": true
}, },
"enabled": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/enabled/-/enabled-1.0.2.tgz",
"integrity": "sha1-ll9lE9LC0cX0ZStkouM5ZGf8L5M=",
"requires": {
"env-variable": "0.0.x"
}
},
"end-of-stream": { "end-of-stream": {
"version": "1.4.1", "version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
@ -438,6 +499,11 @@
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
}, },
"env-variable": {
"version": "0.0.5",
"resolved": "https://registry.npmjs.org/env-variable/-/env-variable-0.0.5.tgz",
"integrity": "sha512-zoB603vQReOFvTg5xMl9I1P2PnHsHQQKTEowsKKD7nseUfJq6UWzK+4YtlWUO1nhiQUxe6XMkk+JleSZD1NZFA=="
},
"es-abstract": { "es-abstract": {
"version": "1.13.0", "version": "1.13.0",
"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz", "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz",
@ -529,6 +595,11 @@
} }
} }
}, },
"fast-safe-stringify": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz",
"integrity": "sha512-Utm6CdzT+6xsDk2m8S6uL8VHxNwI6Jub+e9NYTcAms28T84pTa25GJQV9j0CY0N1rM8hK4x6grpF2BQf+2qwVA=="
},
"fd-slicer": { "fd-slicer": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz", "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
@ -537,6 +608,11 @@
"pend": "~1.2.0" "pend": "~1.2.0"
} }
}, },
"fecha": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/fecha/-/fecha-2.3.3.tgz",
"integrity": "sha512-lUGBnIamTAwk4znq5BcqsDaxSmZ9nDVJaij6NvRt/Tg4R69gERA+otPKbS86ROw9nxVMw2/mp1fnaiWqbs6Sdg=="
},
"find-up": { "find-up": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz",
@ -720,6 +796,11 @@
"integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==", "integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==",
"dev": true "dev": true
}, },
"is-arrayish": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
},
"is-buffer": { "is-buffer": {
"version": "2.0.3", "version": "2.0.3",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz",
@ -769,8 +850,7 @@
"is-stream": { "is-stream": {
"version": "1.1.0", "version": "1.1.0",
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz",
"integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ="
"dev": true
}, },
"is-symbol": { "is-symbol": {
"version": "1.0.2", "version": "1.0.2",
@ -835,6 +915,14 @@
} }
} }
}, },
"kuler": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/kuler/-/kuler-1.0.1.tgz",
"integrity": "sha512-J9nVUucG1p/skKul6DU3PUZrhs0LPulNaeUOox0IyXDi8S4CztTHs1gQphhuZmzXG7VOQSf6NJfKuzteQLv9gQ==",
"requires": {
"colornames": "^1.1.1"
}
},
"lazy-cache": { "lazy-cache": {
"version": "1.0.4", "version": "1.0.4",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
@ -878,6 +966,18 @@
"chalk": "^2.0.1" "chalk": "^2.0.1"
} }
}, },
"logform": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/logform/-/logform-2.1.2.tgz",
"integrity": "sha512-+lZh4OpERDBLqjiwDLpAWNQu6KMjnlXH2ByZwCuSqVPJletw0kTWJf5CgSNAUKn1KUkv3m2cUz/LK8zyEy7wzQ==",
"requires": {
"colors": "^1.2.1",
"fast-safe-stringify": "^2.0.4",
"fecha": "^2.3.3",
"ms": "^2.1.1",
"triple-beam": "^1.3.0"
}
},
"lowercase-keys": { "lowercase-keys": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz", "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
@ -1107,6 +1207,11 @@
"wrappy": "1" "wrappy": "1"
} }
}, },
"one-time": {
"version": "0.0.4",
"resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz",
"integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4="
},
"os-locale": { "os-locale": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz",
@ -1429,12 +1534,25 @@
"integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=", "integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=",
"dev": true "dev": true
}, },
"simple-swizzle": {
"version": "0.2.2",
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
"integrity": "sha1-pNprY1/8zMoz9w0Xy5JZLeleVXo=",
"requires": {
"is-arrayish": "^0.3.1"
}
},
"sprintf-js": { "sprintf-js": {
"version": "1.0.3", "version": "1.0.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
"dev": true "dev": true
}, },
"stack-trace": {
"version": "0.0.10",
"resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
"integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA="
},
"string-width": { "string-width": {
"version": "2.1.1", "version": "2.1.1",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
@ -1490,11 +1608,21 @@
"has-flag": "^3.0.0" "has-flag": "^3.0.0"
} }
}, },
"text-hex": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"to-readable-stream": { "to-readable-stream": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz", "resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q==" "integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
}, },
"triple-beam": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.3.0.tgz",
"integrity": "sha512-XrHUvV5HpdLmIj4uVMxHggLbFSZYIn7HEWsqePZcI50pco+MPqJ50wMGY794X7AOOhxOBAjbkqfAbEe/QMp2Lw=="
},
"type-detect": { "type-detect": {
"version": "4.0.8", "version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
@ -1565,6 +1693,55 @@
"string-width": "^1.0.2 || 2" "string-width": "^1.0.2 || 2"
} }
}, },
"winston": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/winston/-/winston-3.2.1.tgz",
"integrity": "sha512-zU6vgnS9dAWCEKg/QYigd6cgMVVNwyTzKs81XZtTFuRwJOcDdBg7AU0mXVyNbs7O5RH2zdv+BdNZUlx7mXPuOw==",
"requires": {
"async": "^2.6.1",
"diagnostics": "^1.1.1",
"is-stream": "^1.1.0",
"logform": "^2.1.1",
"one-time": "0.0.4",
"readable-stream": "^3.1.1",
"stack-trace": "0.0.x",
"triple-beam": "^1.3.0",
"winston-transport": "^4.3.0"
}
},
"winston-transport": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.3.0.tgz",
"integrity": "sha512-B2wPuwUi3vhzn/51Uukcao4dIduEiPOcOt9HJ3QeaXgkJ5Z7UwpBzxS4ZGNHtrxrUvTwemsQiSys0ihOf8Mp1A==",
"requires": {
"readable-stream": "^2.3.6",
"triple-beam": "^1.2.0"
},
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
"inherits": "~2.0.3",
"isarray": "~1.0.0",
"process-nextick-args": "~2.0.0",
"safe-buffer": "~5.1.1",
"string_decoder": "~1.1.1",
"util-deprecate": "~1.0.1"
}
},
"string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
}
}
}
},
"wrap-ansi": { "wrap-ansi": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz",

View File

@ -29,7 +29,8 @@
"puppeteer": "^2.0.0", "puppeteer": "^2.0.0",
"puppeteer-extra": "^2.1.3", "puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2", "puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.378" "user-agents": "^1.0.378",
"winston": "^3.2.1"
}, },
"devDependencies": { "devDependencies": {
"chai": "^4.2.0", "chai": "^4.2.0",

View File

@ -1,21 +0,0 @@
/**
 * Print a message, or run a callback, when `loglevel` is within the
 * configured verbosity.
 *
 * Nothing happens unless `loglevel <= config.debug_level`. When a message is
 * supplied it takes precedence over the callback: objects are dumped with
 * `console.dir` (unlimited depth, no colors) and everything else is printed
 * with an `[i] ` prefix. The callback is only invoked when no message was
 * given.
 *
 * @param {{debug_level: number}} config - configuration object; only
 *     `debug_level` is read.
 * @param {number} loglevel - verbosity level of this message.
 * @param {*} [msg=null] - message to print; `null`/`undefined` mean "no message".
 * @param {?Function} [cb=null] - called with no arguments when there is no message.
 * @throws {Error} if `loglevel` is not of type number.
 */
function log(config, loglevel, msg = null, cb = null) {
    if (typeof loglevel !== 'number') {
        throw new Error('loglevel must be numeric.');
    }

    if (loglevel > config.debug_level) {
        return;
    }

    // Test for "no message" explicitly: a plain truthiness check would
    // silently discard legitimate falsy messages such as 0, '' or false.
    const hasMessage = msg !== null && msg !== undefined;

    if (hasMessage) {
        if (typeof msg === 'object') {
            console.dir(msg, {depth: null, colors: false});
        } else {
            console.log('[i] ' + msg);
        }
        return;
    }

    if (cb) {
        cb();
    }
}
// Public interface of this module: exposes the `log` function defined above
// under the property name `log`.
module.exports = {
log: log,
};

View File

@ -2,8 +2,6 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const Scraper = require('./se_scraper'); const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;
class GoogleScraper extends Scraper { class GoogleScraper extends Scraper {
@ -243,7 +241,7 @@ class GoogleScraper extends Scraper {
} }
} }
log(this.config, 1, 'Using startUrl: ' + startUrl); this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl); this.last_response = await this.page.goto(startUrl);
@ -642,7 +640,7 @@ class GoogleMapsScraper extends Scraper {
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false; this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
} }
log(this.config, 1, 'Using startUrl: ' + startUrl); this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl); this.last_response = await this.page.goto(startUrl);
@ -681,7 +679,7 @@ class GoogleMapsScraper extends Scraper {
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title; let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`); this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`);
await this.page.waitForFunction((last_title) => { await this.page.waitForFunction((last_title) => {
const res = document.querySelectorAll('.section-result .section-result-title span'); const res = document.querySelectorAll('.section-result .section-result-title span');
@ -775,7 +773,7 @@ class GoogleShoppingScraper extends Scraper {
} }
} }
log(this.config, 1, 'Using startUrl: ' + startUrl); this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl); this.last_response = await this.page.goto(startUrl);

View File

@ -1,8 +1,6 @@
'use strict'; 'use strict';
const meta = require('./metadata.js'); const meta = require('./metadata.js');
const common = require('./common.js'); const debug = require('debug')('se-scraper:Scraper');
var log = common.log;
/* /*
Get useful JS knowledge and get awesome... Get useful JS knowledge and get awesome...
@ -26,6 +24,7 @@ module.exports = class Scraper {
}; };
this.pluggable = pluggable; this.pluggable = pluggable;
this.config = config; this.config = config;
this.logger = this.config.logger;
this.context = context; this.context = context;
this.proxy = config.proxy; this.proxy = config.proxy;
@ -113,25 +112,25 @@ module.exports = class Scraper {
if (this.config.log_http_headers === true) { if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page); this.metadata.http_headers = await meta.get_http_headers(this.page);
log(this.config, 2, this.metadata.http_headers); debug('this.metadata.http_headers=%O', this.metadata.http_headers);
} }
if (this.config.log_ip_address === true) { if (this.config.log_ip_address === true) {
let ipinfo = await meta.get_ip_data(this.page); let ipinfo = await meta.get_ip_data(this.page);
this.metadata.ipinfo = ipinfo; this.metadata.ipinfo = ipinfo;
log(this.config, 2, this.metadata.ipinfo); debug('this.metadata.ipinfo', this.metadata.ipinfo);
} }
// check that our proxy is working by confirming // check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address // that ipinfo.io sees the proxy IP address
if (this.proxy && this.config.log_ip_address === true) { if (this.proxy && this.config.log_ip_address === true) {
log(this.config, 3, `${this.metadata.ipinfo.ip} vs ${this.proxy}`); debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!this.proxy.includes(this.metadata.ipinfo.ip)) { if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`); throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`);
} else { } else {
log(this.config, 1, `Using valid Proxy: ${this.proxy}`); this.logger.info(`Using valid Proxy: ${this.proxy}`);
} }
} }
@ -179,7 +178,7 @@ module.exports = class Scraper {
do { do {
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`); this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
await this.wait_for_results(); await this.wait_for_results();
@ -263,28 +262,21 @@ module.exports = class Scraper {
} catch (e) { } catch (e) {
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`); this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
debug('this.last_response=%O', this.last_response);
if (this.last_response) { if (this.config.take_screenshot_on_error) {
log(this.config, 2, this.last_response); await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}
if (this.config.debug_level > 2) {
try {
// Try to save a screenshot of the error
await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`});
} catch (e) {
}
} }
this.metadata.scraping_detected = await this.detected(); this.metadata.scraping_detected = await this.detected();
if (this.metadata.scraping_detected === true) { if (this.metadata.scraping_detected === true) {
console.error(`${this.config.search_engine_name} detected the scraping!`); this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);
if (this.config.is_local === true) { if (this.config.is_local === true) {
await this.sleep(this.SOLVE_CAPTCHA_TIME); await this.sleep(this.SOLVE_CAPTCHA_TIME);
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
// expect that user filled out necessary captcha // expect that user filled out necessary captcha
} else { } else {
if (this.config.throw_on_detection === true) { if (this.config.throw_on_detection === true) {
@ -318,7 +310,7 @@ module.exports = class Scraper {
baseUrl += `${key}=${settings[key]}&` baseUrl += `${key}=${settings[key]}&`
} }
log(this.config, 1, 'Using startUrl: ' + baseUrl); this.logger.info('Using startUrl: ' + baseUrl);
return baseUrl; return baseUrl;
} }
@ -335,7 +327,7 @@ module.exports = class Scraper {
async random_sleep() { async random_sleep() {
const [min, max] = this.config.sleep_range; const [min, max] = this.config.sleep_range;
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
log(this.config, 1, `Sleeping for ${rand}s`); this.logger.info(`Sleeping for ${rand}s`);
await this.sleep(rand * 1000); await this.sleep(rand * 1000);
} }
@ -349,7 +341,7 @@ module.exports = class Scraper {
no_results(needles, html) { no_results(needles, html) {
for (let needle of needles) { for (let needle of needles) {
if (html.includes(needle)) { if (html.includes(needle)) {
console.log(this.config, 2, `HTML contains needle ${needle}. no_results=true`); this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
return true; return true;
} }
} }

View File

@ -1,8 +1,6 @@
'use strict'; 'use strict';
const Scraper = require('./se_scraper'); const Scraper = require('./se_scraper');
const common = require('./common.js');
var log = common.log;
class YandexScraper extends Scraper { class YandexScraper extends Scraper {
@ -75,7 +73,7 @@ class YandexScraper extends Scraper {
async load_start_page() { async load_start_page() {
let startUrl = 'https://yandex.com'; let startUrl = 'https://yandex.com';
log(this.config, 1, 'Using startUrl: ' + startUrl); this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl); this.last_response = await this.page.goto(startUrl);

View File

@ -3,6 +3,9 @@
const fs = require('fs'); const fs = require('fs');
const os = require('os'); const os = require('os');
const _ = require('lodash'); const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const UserAgent = require('user-agents'); const UserAgent = require('user-agents');
const google = require('./modules/google.js'); const google = require('./modules/google.js');
@ -11,8 +14,6 @@ const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js'); const infospace = require('./modules/infospace.js');
const duckduckgo = require('./modules/duckduckgo.js'); const duckduckgo = require('./modules/duckduckgo.js');
const { Cluster } = require('./puppeteer-cluster/dist/index.js'); const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const common = require('./modules/common.js');
var log = common.log;
const MAX_ALLOWED_BROWSERS = 6; const MAX_ALLOWED_BROWSERS = 6;
@ -81,12 +82,18 @@ class ScrapeManager {
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
search_engine_name: 'google', search_engine_name: 'google',
// whether debug information should be printed logger: createLogger({
// level 0: print nothing level: 'info',
// level 1: print most important info format: combine(
// ... timestamp(),
// level 4: print all shit nobody wants to know printf(({ level, message, timestamp }) => {
debug_level: 1, return `${timestamp} [${level}] ${message}`;
})
),
transports: [
new transports.Console()
]
}),
keywords: ['nodejs rocks',], keywords: ['nodejs rocks',],
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: true,
@ -154,6 +161,8 @@ class ScrapeManager {
} }
}); });
this.logger = this.config.logger;
if (config.sleep_range) { if (config.sleep_range) {
// parse an array // parse an array
config.sleep_range = eval(config.sleep_range); config.sleep_range = eval(config.sleep_range);
@ -168,16 +177,15 @@ class ScrapeManager {
} }
if (this.config.proxies && this.config.proxy_file) { if (this.config.proxies && this.config.proxy_file) {
console.error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
return false;
} }
if (fs.existsSync(this.config.proxy_file)) { if (fs.existsSync(this.config.proxy_file)) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file); this.config.proxies = read_keywords_from_file(this.config.proxy_file);
log(this.config, 1, `${this.config.proxies.length} proxies read from file.`); this.logger.info(`${this.config.proxies.length} proxies read from file.`);
} }
log(this.config, 2, this.config); debug('this.config=%O', this.config);
} }
/* /*
@ -224,7 +232,7 @@ class ScrapeManager {
ignoreHTTPSErrors: true, ignoreHTTPSErrors: true,
}; };
log(this.config, 2, `Using the following puppeteer configuration: ${launch_args}`); debug('Using the following puppeteer configuration launch_args=%O', launch_args);
if (this.pluggable && this.pluggable.start_browser) { if (this.pluggable && this.pluggable.start_browser) {
launch_args.config = this.config; launch_args.config = this.config;
@ -256,7 +264,7 @@ class ScrapeManager {
MAX_ALLOWED_BROWSERS MAX_ALLOWED_BROWSERS
); );
log(this.config, 1, `Using ${this.numClusters} clusters.`); this.logger.info(`Using ${this.numClusters} clusters.`);
this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters; this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters;
@ -279,9 +287,7 @@ class ScrapeManager {
}) })
} }
if (this.config.debug_level >= 2) { debug('perBrowserOptions=%O', perBrowserOptions)
console.dir(perBrowserOptions)
}
this.cluster = await Cluster.launch({ this.cluster = await Cluster.launch({
monitor: this.config.puppeteer_cluster_config.monitor, monitor: this.config.puppeteer_cluster_config.monitor,
@ -293,8 +299,8 @@ class ScrapeManager {
}); });
this.cluster.on('taskerror', (err, data) => { this.cluster.on('taskerror', (err, data) => {
console.log(`Error while scraping ${data}: ${err.message}`); this.logger.error(`Error while scraping ${data}: ${err.message}`);
console.log(err); debug('Error during cluster task', err);
}); });
} }
} }
@ -305,8 +311,7 @@ class ScrapeManager {
async scrape(scrape_config = {}) { async scrape(scrape_config = {}) {
if (!scrape_config.keywords && !scrape_config.keyword_file) { if (!scrape_config.keywords && !scrape_config.keyword_file) {
console.error('Either keywords or keyword_file must be supplied to scrape()'); throw new Error('Either keywords or keyword_file must be supplied to scrape()');
return false;
} }
Object.assign(this.config, scrape_config); Object.assign(this.config, scrape_config);
@ -318,10 +323,7 @@ class ScrapeManager {
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
if (this.config.keywords && this.config.search_engine) { this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
log(this.config, 1,
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
}
if (this.pluggable && this.pluggable.start_browser) { if (this.pluggable && this.pluggable.start_browser) {
@ -385,8 +387,8 @@ class ScrapeManager {
let timeDelta = Date.now() - startTime; let timeDelta = Date.now() - startTime;
let ms_per_request = timeDelta/num_requests; let ms_per_request = timeDelta/num_requests;
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`); this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
if (this.pluggable && this.pluggable.handle_results) { if (this.pluggable && this.pluggable.handle_results) {
await this.pluggable.handle_results(results); await this.pluggable.handle_results(results);
@ -396,14 +398,14 @@ class ScrapeManager {
metadata.ms_per_keyword = ms_per_request.toString(); metadata.ms_per_keyword = ms_per_request.toString();
metadata.num_requests = num_requests; metadata.num_requests = num_requests;
log(this.config, 2, metadata); debug('metadata=%O', metadata);
if (this.pluggable && this.pluggable.handle_metadata) { if (this.pluggable && this.pluggable.handle_metadata) {
await this.pluggable.handle_metadata(metadata); await this.pluggable.handle_metadata(metadata);
} }
if (this.config.output_file) { if (this.config.output_file) {
log(this.config, 1, `Writing results to ${this.config.output_file}`); this.logger.info(`Writing results to ${this.config.output_file}`);
write_results(this.config.output_file, JSON.stringify(results, null, 4)); write_results(this.config.output_file, JSON.stringify(results, null, 4));
} }