diff --git a/examples/bing_multiple_browser_multiple_pages.js b/examples/bing_multiple_browser_multiple_pages.js index a3dd07b..98499ca 100644 --- a/examples/bing_multiple_browser_multiple_pages.js +++ b/examples/bing_multiple_browser_multiple_pages.js @@ -32,12 +32,6 @@ let browser_config = { verbose: true, // whether to start the browser in headless mode headless: true, - // whether debug information should be printed - // level 0: print nothing - // level 1: print most important info - // ... - // level 4: print all shit nobody wants to know - debug_level: 1, is_local: false, throw_on_detection: false, puppeteer_cluster_config: { diff --git a/examples/for_the_lulz.js b/examples/for_the_lulz.js index 8f9376f..a3ae6af 100644 --- a/examples/for_the_lulz.js +++ b/examples/for_the_lulz.js @@ -30,12 +30,6 @@ let browser_config = { // if random_user_agent is set to True, a random user agent is chosen random_user_agent: true, headless: true, - // whether debug information should be printed - // level 0: print nothing - // level 1: print most important info - // ... - // level 4: print all shit nobody wants to know - debug_level: 1, is_local: false, throw_on_detection: false, puppeteer_cluster_config: { diff --git a/examples/gimage.js b/examples/gimage.js index baf354a..9777a76 100644 --- a/examples/gimage.js +++ b/examples/gimage.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 1, output_file: '', }; diff --git a/examples/gnold.js b/examples/gnold.js index a759ec0..5a18d3e 100644 --- a/examples/gnold.js +++ b/examples/gnold.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 2, output_file: 'examples/results/gnold.json', google_news_old_settings: { gl: 'us', // The gl parameter determines the Google country to use for the query. diff --git a/examples/google_maps.js b/examples/google_maps.js index cc7c1cc..31c28b0 100644 --- a/examples/google_maps.js +++ b/examples/google_maps.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 1, output_file: 'examples/results/maps.json', test_evasion: false, block_assets: false, diff --git a/examples/multiple_browsers.js b/examples/multiple_browsers.js index 2d8f21a..234013b 100644 --- a/examples/multiple_browsers.js +++ b/examples/multiple_browsers.js @@ -3,7 +3,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { search_engine: 'google', - debug_level: 1, random_user_agent: true, is_local: false, html_output: false, diff --git a/examples/multiple_search_engines.js b/examples/multiple_search_engines.js index c8251ad..e265ea7 100644 --- a/examples/multiple_search_engines.js +++ b/examples/multiple_search_engines.js @@ -5,7 +5,6 @@ const se_scraper = require('./../src/node_scraper.js'); random_user_agent: true, write_meta_data: true, sleep_range: '[1,1]', - debug_level: 1, headless: true, output_file: `examples/results/multiple_search_engines.json` }; diff --git a/examples/pluggable_example.js b/examples/pluggable_example.js index 901549b..c6ac720 100644 --- a/examples/pluggable_example.js +++ b/examples/pluggable_example.js @@ -3,7 +3,6 @@ const resolve = require('path').resolve; (async () => { let browser_config = { - debug_level: 1, test_evasion: false, log_http_headers: true, log_ip_address: true, diff --git a/examples/proxies.js b/examples/proxies.js index 99ac262..ac9e99b 100644 --- a/examples/proxies.js +++ b/examples/proxies.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 1, output_file: 'examples/results/proxyresults.json', log_ip_address: true, // a file with one proxy per line. Example: diff --git a/examples/quickstart.js b/examples/quickstart.js index 2545d80..6578027 100644 --- a/examples/quickstart.js +++ b/examples/quickstart.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 1, test_evasion: false, log_http_headers: false, log_ip_address: false, diff --git a/examples/reusing.js b/examples/reusing.js index ded4a0c..507f07a 100644 --- a/examples/reusing.js +++ b/examples/reusing.js @@ -2,7 +2,6 @@ const se_scraper = require('./../src/node_scraper.js'); (async () => { let browser_config = { - debug_level: 1, output_file: 'examples/results/data.json', }; diff --git a/package-lock.json b/package-lock.json index d997296..671d279 100644 --- a/package-lock.json +++ b/package-lock.json @@ -71,6 +71,14 @@ "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "dev": true }, + "async": { + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz", + "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==", + "requires": { + "lodash": "^4.17.14" + } + }, "async-limiter": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", @@ -224,11 +232,19 @@ "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", "dev": true }, + "color": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/color/-/color-3.0.0.tgz", + "integrity": "sha512-jCpd5+s0s0t7p3pHQKpnJ0TpQKKdleP71LWcA0aqiljpiuAkOSUFN/dyH8ZwF0hRmFlrIuRhufds1QyEP9EB+w==", + "requires": { + "color-convert": "^1.9.1", + "color-string": "^1.5.2" + } + }, "color-convert": { "version": "1.9.3", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", - "dev": true, "requires": { "color-name": "1.1.3" } @@ -236,8 +252,35 @@ "color-name": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", - "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", - "dev": true + "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=" + }, + "color-string": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.5.3.tgz", + "integrity": "sha512-dC2C5qeWoYkxki5UAXapdjqO672AM4vZuPGRQfO8b5HKuKGBbKWpITyDYN7TOFKvRW7kOgAn3746clDBMDJyQw==", + "requires": { + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" + } + }, + "colornames": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/colornames/-/colornames-1.1.1.tgz", + "integrity": "sha1-+IiQMGhcfE/54qVZ9Qd+t2qBb5Y=" + }, + "colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==" + }, + "colorspace": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.2.tgz", + "integrity": "sha512-vt+OoIP2d76xLhjwbBaucYlNSpPsrJWPlBTtwCpQKIu6/CSMutyzX93O/Do0qzpH3YoHEes8YEFXyZ797rEhzQ==", + "requires": { + "color": "3.0.x", + "text-hex": "1.0.x" + } }, "concat-map": { "version": "0.0.1", @@ -363,6 +406,16 @@ "object-keys": "^1.0.12" } }, + "diagnostics": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/diagnostics/-/diagnostics-1.1.1.tgz", + "integrity": "sha512-8wn1PmdunLJ9Tqbx+Fx/ZEuHfJf4NKSN2ZBj7SJC/OWRWha843+WsTjqMe1B5E3p28jqBlp+mJ2fPVxPyNgYKQ==", + "requires": { + "colorspace": "1.1.x", + "enabled": "1.0.x", + "kuler": "1.0.x" + } + }, "diff": { "version": "3.5.0", "resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz", @@ -425,6 +478,14 @@ "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", "dev": true }, + "enabled": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/enabled/-/enabled-1.0.2.tgz", + "integrity": "sha1-ll9lE9LC0cX0ZStkouM5ZGf8L5M=", + "requires": { + "env-variable": "0.0.x" + } + }, "end-of-stream": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", @@ -438,6 +499,11 @@ "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" }, + "env-variable": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/env-variable/-/env-variable-0.0.5.tgz", + "integrity": "sha512-zoB603vQReOFvTg5xMl9I1P2PnHsHQQKTEowsKKD7nseUfJq6UWzK+4YtlWUO1nhiQUxe6XMkk+JleSZD1NZFA==" + }, "es-abstract": { "version": "1.13.0", "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz", @@ -529,6 +595,11 @@ } } }, + "fast-safe-stringify": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz", + "integrity": "sha512-Utm6CdzT+6xsDk2m8S6uL8VHxNwI6Jub+e9NYTcAms28T84pTa25GJQV9j0CY0N1rM8hK4x6grpF2BQf+2qwVA==" + }, "fd-slicer": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz", @@ -537,6 +608,11 @@ "pend": "~1.2.0" } }, + "fecha": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fecha/-/fecha-2.3.3.tgz", + "integrity": "sha512-lUGBnIamTAwk4znq5BcqsDaxSmZ9nDVJaij6NvRt/Tg4R69gERA+otPKbS86ROw9nxVMw2/mp1fnaiWqbs6Sdg==" + }, "find-up": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", @@ -720,6 +796,11 @@ "integrity": "sha512-wPVv/y/QQ/Uiirj/vh3oP+1Ww+AWehmi1g5fFWGPF6IpCBCDVrhgHRMvrLfdYcwDh3QJbGXDW4JAuzxElLSqKA==", "dev": true }, + "is-arrayish": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", + "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==" + }, "is-buffer": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", @@ -769,8 +850,7 @@ "is-stream": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", - "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", - "dev": true + "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=" }, "is-symbol": { "version": "1.0.2", @@ -835,6 +915,14 @@ } } }, + "kuler": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/kuler/-/kuler-1.0.1.tgz", + "integrity": "sha512-J9nVUucG1p/skKul6DU3PUZrhs0LPulNaeUOox0IyXDi8S4CztTHs1gQphhuZmzXG7VOQSf6NJfKuzteQLv9gQ==", + "requires": { + "colornames": "^1.1.1" + } + }, "lazy-cache": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", @@ -878,6 +966,18 @@ "chalk": "^2.0.1" } }, + "logform": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/logform/-/logform-2.1.2.tgz", + "integrity": "sha512-+lZh4OpERDBLqjiwDLpAWNQu6KMjnlXH2ByZwCuSqVPJletw0kTWJf5CgSNAUKn1KUkv3m2cUz/LK8zyEy7wzQ==", + "requires": { + "colors": "^1.2.1", + "fast-safe-stringify": "^2.0.4", + "fecha": "^2.3.3", + "ms": "^2.1.1", + "triple-beam": "^1.3.0" + } + }, "lowercase-keys": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz", @@ -1107,6 +1207,11 @@ "wrappy": "1" } }, + "one-time": { + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/one-time/-/one-time-0.0.4.tgz", + "integrity": "sha1-+M33eISCb+Tf+T46nMN7HkSAdC4=" + }, "os-locale": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", @@ -1429,12 +1534,25 @@ "integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=", "dev": true }, + "simple-swizzle": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", + "integrity": "sha1-pNprY1/8zMoz9w0Xy5JZLeleVXo=", + "requires": { + "is-arrayish": "^0.3.1" + } + }, "sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", "dev": true }, + "stack-trace": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", + "integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA=" + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -1490,11 +1608,21 @@ "has-flag": "^3.0.0" } }, + "text-hex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", + "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==" + }, "to-readable-stream": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz", "integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q==" }, + "triple-beam": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.3.0.tgz", + "integrity": "sha512-XrHUvV5HpdLmIj4uVMxHggLbFSZYIn7HEWsqePZcI50pco+MPqJ50wMGY794X7AOOhxOBAjbkqfAbEe/QMp2Lw==" + }, "type-detect": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", @@ -1565,6 +1693,55 @@ "string-width": "^1.0.2 || 2" } }, + "winston": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/winston/-/winston-3.2.1.tgz", + "integrity": "sha512-zU6vgnS9dAWCEKg/QYigd6cgMVVNwyTzKs81XZtTFuRwJOcDdBg7AU0mXVyNbs7O5RH2zdv+BdNZUlx7mXPuOw==", + "requires": { + "async": "^2.6.1", + "diagnostics": "^1.1.1", + "is-stream": "^1.1.0", + "logform": "^2.1.1", + "one-time": "0.0.4", + "readable-stream": "^3.1.1", + "stack-trace": "0.0.x", + "triple-beam": "^1.3.0", + "winston-transport": "^4.3.0" + } + }, + "winston-transport": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.3.0.tgz", + "integrity": "sha512-B2wPuwUi3vhzn/51Uukcao4dIduEiPOcOt9HJ3QeaXgkJ5Z7UwpBzxS4ZGNHtrxrUvTwemsQiSys0ihOf8Mp1A==", + "requires": { + "readable-stream": "^2.3.6", + "triple-beam": "^1.2.0" + }, + "dependencies": { + "readable-stream": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "requires": { + "safe-buffer": "~5.1.0" + } + } + } + }, "wrap-ansi": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", diff --git a/package.json b/package.json index 78fec21..e4c4bad 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,8 @@ "puppeteer": "^2.0.0", "puppeteer-extra": "^2.1.3", "puppeteer-extra-plugin-stealth": "^2.2.2", - "user-agents": "^1.0.378" + "user-agents": "^1.0.378", + "winston": "^3.2.1" }, "devDependencies": { "chai": "^4.2.0", diff --git a/src/modules/common.js b/src/modules/common.js deleted file mode 100644 index dbcfe85..0000000 --- a/src/modules/common.js +++ /dev/null @@ -1,21 +0,0 @@ -function log(config, loglevel, msg = null, cb = null) { - if (typeof loglevel != "number") { - throw Error('loglevel must be numeric.'); - } - - if (loglevel <= config.debug_level) { - if (msg) { - if (typeof msg == 'object') { - console.dir(msg, {depth: null, colors: false}); - } else { - console.log('[i] ' + msg); - } - } else if (cb) { - cb(); - } - } -} - -module.exports = { - log: log, -}; \ No newline at end of file diff --git a/src/modules/google.js b/src/modules/google.js index 2f61b1c..38ccad2 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -2,8 +2,6 @@ const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); -const common = require('./common.js'); -var log = common.log; class GoogleScraper extends Scraper { @@ -243,7 +241,7 @@ class GoogleScraper extends Scraper { } } - log(this.config, 1, 'Using startUrl: ' + startUrl); + this.logger.info('Using startUrl: ' + startUrl); this.last_response = await this.page.goto(startUrl); @@ -642,7 +640,7 @@ class GoogleMapsScraper extends Scraper { this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false; } - log(this.config, 1, 'Using startUrl: ' + startUrl); + this.logger.info('Using startUrl: ' + startUrl); this.last_response = await this.page.goto(startUrl); @@ -681,7 +679,7 @@ class GoogleMapsScraper extends Scraper { let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title; - log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`); + this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`); await this.page.waitForFunction((last_title) => { const res = document.querySelectorAll('.section-result .section-result-title span'); @@ -775,7 +773,7 @@ class GoogleShoppingScraper extends Scraper { } } - log(this.config, 1, 'Using startUrl: ' + startUrl); + this.logger.info('Using startUrl: ' + startUrl); this.last_response = await this.page.goto(startUrl); diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 68e7732..d4f64c9 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -1,8 +1,6 @@ 'use strict'; const meta = require('./metadata.js'); -const common = require('./common.js'); -var log = common.log; - +const debug = require('debug')('se-scraper:Scraper'); /* Get useful JS knowledge and get awesome... @@ -26,6 +24,7 @@ module.exports = class Scraper { }; this.pluggable = pluggable; this.config = config; + this.logger = this.config.logger; this.context = context; this.proxy = config.proxy; @@ -113,25 +112,25 @@ module.exports = class Scraper { if (this.config.log_http_headers === true) { this.metadata.http_headers = await meta.get_http_headers(this.page); - log(this.config, 2, this.metadata.http_headers); + debug('this.metadata.http_headers=%O', this.metadata.http_headers); } if (this.config.log_ip_address === true) { let ipinfo = await meta.get_ip_data(this.page); this.metadata.ipinfo = ipinfo; - log(this.config, 2, this.metadata.ipinfo); + debug('this.metadata.ipinfo', this.metadata.ipinfo); } // check that our proxy is working by confirming // that ipinfo.io sees the proxy IP address if (this.proxy && this.config.log_ip_address === true) { - log(this.config, 3, `${this.metadata.ipinfo.ip} vs ${this.proxy}`); + debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`); // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here if (!this.proxy.includes(this.metadata.ipinfo.ip)) { throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`); } else { - log(this.config, 1, `Using valid Proxy: ${this.proxy}`); + this.logger.info(`Using valid Proxy: ${this.proxy}`); } } @@ -179,7 +178,7 @@ module.exports = class Scraper { do { - log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`); + this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`); await this.wait_for_results(); @@ -263,28 +262,21 @@ module.exports = class Scraper { } catch (e) { - console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`); + this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`); + debug('this.last_response=%O', this.last_response); - if (this.last_response) { - log(this.config, 2, this.last_response); - } - - if (this.config.debug_level > 2) { - try { - // Try to save a screenshot of the error - await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`}); - } catch (e) { - } + if (this.config.take_screenshot_on_error) { + await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` }); } this.metadata.scraping_detected = await this.detected(); if (this.metadata.scraping_detected === true) { - console.error(`${this.config.search_engine_name} detected the scraping!`); + this.logger.warn(`${this.config.search_engine_name} detected the scraping!`); if (this.config.is_local === true) { await this.sleep(this.SOLVE_CAPTCHA_TIME); - console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); + this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); // expect that user filled out necessary captcha } else { if (this.config.throw_on_detection === true) { @@ -318,7 +310,7 @@ module.exports = class Scraper { baseUrl += `${key}=${settings[key]}&` } - log(this.config, 1, 'Using startUrl: ' + baseUrl); + this.logger.info('Using startUrl: ' + baseUrl); return baseUrl; } @@ -335,7 +327,7 @@ module.exports = class Scraper { async random_sleep() { const [min, max] = this.config.sleep_range; let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number - log(this.config, 1, `Sleeping for ${rand}s`); + this.logger.info(`Sleeping for ${rand}s`); await this.sleep(rand * 1000); } @@ -349,7 +341,7 @@ module.exports = class Scraper { no_results(needles, html) { for (let needle of needles) { if (html.includes(needle)) { - console.log(this.config, 2, `HTML contains needle ${needle}. no_results=true`); + this.logger.warn(`HTML contains needle ${needle}. no_results=true`); return true; } } diff --git a/src/modules/yandex.js b/src/modules/yandex.js index 890bbfa..3666cc1 100644 --- a/src/modules/yandex.js +++ b/src/modules/yandex.js @@ -1,8 +1,6 @@ 'use strict'; const Scraper = require('./se_scraper'); -const common = require('./common.js'); -var log = common.log; class YandexScraper extends Scraper { @@ -75,7 +73,7 @@ class YandexScraper extends Scraper { async load_start_page() { let startUrl = 'https://yandex.com'; - log(this.config, 1, 'Using startUrl: ' + startUrl); + this.logger.info('Using startUrl: ' + startUrl); this.last_response = await this.page.goto(startUrl); diff --git a/src/node_scraper.js b/src/node_scraper.js index 5ebd6dc..8ae673a 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -3,6 +3,9 @@ const fs = require('fs'); const os = require('os'); const _ = require('lodash'); +const { createLogger, format, transports } = require('winston'); +const { combine, timestamp, printf } = format; +const debug = require('debug')('se-scraper:ScrapeManager'); const UserAgent = require('user-agents'); const google = require('./modules/google.js'); @@ -11,8 +14,6 @@ const yandex = require('./modules/yandex.js'); const infospace = require('./modules/infospace.js'); const duckduckgo = require('./modules/duckduckgo.js'); const { Cluster } = require('./puppeteer-cluster/dist/index.js'); -const common = require('./modules/common.js'); -var log = common.log; const MAX_ALLOWED_BROWSERS = 6; @@ -81,12 +82,18 @@ class ScrapeManager { // which search engine to scrape search_engine: 'google', search_engine_name: 'google', - // whether debug information should be printed - // level 0: print nothing - // level 1: print most important info - // ... - // level 4: print all shit nobody wants to know - debug_level: 1, + logger: createLogger({ + level: 'info', + format: combine( + timestamp(), + printf(({ level, message, timestamp }) => { + return `${timestamp} [${level}] ${message}`; + }) + ), + transports: [ + new transports.Console() + ] + }), keywords: ['nodejs rocks',], // whether to start the browser in headless mode headless: true, @@ -154,6 +161,8 @@ class ScrapeManager { } }); + this.logger = this.config.logger; + if (config.sleep_range) { // parse an array config.sleep_range = eval(config.sleep_range); @@ -168,16 +177,15 @@ class ScrapeManager { } if (this.config.proxies && this.config.proxy_file) { - console.error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); - return false; + throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); } if (fs.existsSync(this.config.proxy_file)) { this.config.proxies = read_keywords_from_file(this.config.proxy_file); - log(this.config, 1, `${this.config.proxies.length} proxies read from file.`); + this.logger.info(`${this.config.proxies.length} proxies read from file.`); } - log(this.config, 2, this.config); + debug('this.config=%O', this.config); } /* @@ -224,7 +232,7 @@ class ScrapeManager { ignoreHTTPSErrors: true, }; - log(this.config, 2, `Using the following puppeteer configuration: ${launch_args}`); + debug('Using the following puppeteer configuration launch_args=%O', launch_args); if (this.pluggable && this.pluggable.start_browser) { launch_args.config = this.config; @@ -256,7 +264,7 @@ class ScrapeManager { MAX_ALLOWED_BROWSERS ); - log(this.config, 1, `Using ${this.numClusters} clusters.`); + this.logger.info(`Using ${this.numClusters} clusters.`); this.config.puppeteer_cluster_config.maxConcurrency = this.numClusters; @@ -279,9 +287,7 @@ class ScrapeManager { }) } - if (this.config.debug_level >= 2) { - console.dir(perBrowserOptions) - } + debug('perBrowserOptions=%O', perBrowserOptions) this.cluster = await Cluster.launch({ monitor: this.config.puppeteer_cluster_config.monitor, @@ -293,8 +299,8 @@ class ScrapeManager { }); this.cluster.on('taskerror', (err, data) => { - console.log(`Error while scraping ${data}: ${err.message}`); - console.log(err); + this.logger.error(`Error while scraping ${data}: ${err.message}`); + debug('Error during cluster task', err); }); } } @@ -305,8 +311,7 @@ class ScrapeManager { async scrape(scrape_config = {}) { if (!scrape_config.keywords && !scrape_config.keyword_file) { - console.error('Either keywords or keyword_file must be supplied to scrape()'); - return false; + throw new Error('Either keywords or keyword_file must be supplied to scrape()'); } Object.assign(this.config, scrape_config); @@ -318,10 +323,7 @@ class ScrapeManager { this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; - if (this.config.keywords && this.config.search_engine) { - log(this.config, 1, - `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`) - } + this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`); if (this.pluggable && this.pluggable.start_browser) { @@ -385,8 +387,8 @@ class ScrapeManager { let timeDelta = Date.now() - startTime; let ms_per_request = timeDelta/num_requests; - log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); - log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`); + this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); + this.logger.info(`On average ms/request: ${ms_per_request}ms/request`); if (this.pluggable && this.pluggable.handle_results) { await this.pluggable.handle_results(results); @@ -396,14 +398,14 @@ class ScrapeManager { metadata.ms_per_keyword = ms_per_request.toString(); metadata.num_requests = num_requests; - log(this.config, 2, metadata); + debug('metadata=%O', metadata); if (this.pluggable && this.pluggable.handle_metadata) { await this.pluggable.handle_metadata(metadata); } if (this.config.output_file) { - log(this.config, 1, `Writing results to ${this.config.output_file}`); + this.logger.info(`Writing results to ${this.config.output_file}`); write_results(this.config.output_file, JSON.stringify(results, null, 4)); }