From 089e410ec64ea26827adfdfd7b4f4e0a28f417b0 Mon Sep 17 00:00:00 2001
From: Nikolai Tschacher
Date: Wed, 27 Feb 2019 20:58:13 +0100
Subject: [PATCH] support for multiple browsers and proxies

---
 README.md                  | 102 +++++++--
 TODO.txt                   |   8 +-
 data.json                  |   1 +
 examples/per_page_proxy.js |  76 +++++++
 examples/proxies.js        |  19 ++
 examples/quickstart.js     |   6 +-
 examples/test_cluster.js   |  86 ++++++++
 examples/test_promise.js   |  40 ++++
 index.js                   | 142 ++++++------
 package-lock.json          |  91 ++++++--
 package.json               |   7 +-
 run.js                     |  11 +-
 src/modules/google.js      |   7 +-
 src/modules/metadata.js    |  10 +-
 src/modules/se_scraper.js  |  45 +++-
 src/node_scraper.js        | 441 ++++++++++++++++++------------------
 src/puppeteer-cluster      |   1 +
 17 files changed, 743 insertions(+), 350 deletions(-)
 create mode 100644 data.json
 create mode 100644 examples/per_page_proxy.js
 create mode 100644 examples/proxies.js
 create mode 100644 examples/test_cluster.js
 create mode 100644 examples/test_promise.js
 create mode 160000 src/puppeteer-cluster

diff --git a/README.md b/README.md
index b3f30ea..7d083b8 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 This node module supports scraping several search engines.
 
-Right now scraping the search engines
+Right now it's possible to scrape the following search engines
 
 * Google
 * Google News
@@ -14,20 +14,15 @@ Right now scraping the search engines
 * Infospace
 * Duckduckgo
 * Webcrawler
-
-is supported.
-
-Additionally **se-scraper** supports investment ticker search from the following sites:
-
 * Reuters
 * cnbc
 * Marketwatch
 
-This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.
+This module uses puppeteer and a modified version of puppeteer-cluster. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 stars on GitHub.
 
 ### Quickstart
 
-**Note**: If you don't want puppeteer to download a complete chromium browser, add this variable to your environments:
+**Note**: If you **don't** want puppeteer to download a complete chromium browser, add the following variable to your environment. In that case, this library is not guaranteed to run out of the box.
 
 ```bash
 export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
@@ -39,7 +34,7 @@ Then install with
 
 npm install se-scraper
 
-then create a file with the following contents and start scraping.
+then create a file `run.js` with the following contents:
 
 ```js
 const se_scraper = require('se-scraper');
@@ -61,6 +56,79 @@ function callback(err, response) {
 se_scraper.scrape(config, callback);
 ```
 
+Start scraping by firing up the command `node run.js`.
+
+#### Scrape with proxies
+
+**se-scraper** will create one browser instance per proxy, so the maximal amount of concurrency is the number of proxies plus one (your own IP).
+
+```js
+const se_scraper = require('se-scraper');
+
+let config = {
+    search_engine: 'google',
+    debug: false,
+    verbose: false,
+    keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much'],
+    num_pages: 1,
+    output_file: 'data.json',
+    proxy_file: '/home/nikolai/.proxies', // one proxy per line
+    log_ip_address: true,
+};
+
+function callback(err, response) {
+    if (err) { console.error(err) }
+    console.dir(response, {depth: null, colors: true});
+}
+
+se_scraper.scrape(config, callback);
+```
+
+With a proxy file such as the following (invalid proxies, of course)
+
+```text
+socks5://53.34.23.55:55523
+socks4://51.11.23.22:22222
+```
+
+This will scrape with **three** browser instances, each having its own IP address.
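+
+Since `log_ip_address: true` is set, each browser instance requests https://ipinfo.io/json through its own connection and logs the parsed result, so you can verify that every instance really exits through a distinct IP. The response looks roughly like this (fields abbreviated, values purely illustrative):
+
+```json
+{
+  "ip": "53.34.23.55",
+  "city": "Berlin",
+  "country": "DE",
+  "org": "AS12345 Example Carrier"
+}
+```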
+Unfortunately, it is currently not possible to scrape with different proxies per tab (a chromium issue).
+
+### Scraping Model
+
+**se-scraper** scrapes search engines only. In order to introduce concurrency into this library, we first need to define the scraping model. Then we can decide how to divide and conquer.
+
+#### Scraping Resources
+
+What are common scraping resources?
+
+1. **Memory and CPU**. Necessary to launch multiple browser instances.
+2. **Network Bandwidth**. Rarely the bottleneck.
+3. **IP Addresses**. Websites often block IP addresses after a certain number of requests from the same IP address. This can be circumvented by using proxies.
+4. Spoofable identifiers such as the browser fingerprint or user agent. These are handled by **se-scraper**.
+
+#### Concurrency Model
+
+**se-scraper** should be able to run without any concurrency at all. This is the default case: only one browser/tab is searching at a time.
+
+For concurrent use, we make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster).
+
+A scrape job is fully defined by
+
+* 1 search engine such as `google`
+* `M` pages
+* `N` keywords/queries
+* `K` proxies and `K+1` browser instances (the extra instance scrapes with our dedicated IP)
+
+**se-scraper** will then create `K+1` dedicated browser instances, each with a unique IP address. Each browser gets `N/(K+1)` keywords and issues `N/(K+1) * M` total requests to the search engine. For example, with `K = 2` proxies, `N = 9` keywords and `M = 2` pages, three browsers are launched and each one scrapes 3 keywords with 6 requests.
+
+The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequently launched browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings: every browser gets the same options, and options cannot be set on a per-browser basis.
+
+Two possible solutions:
+
+1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678).
+2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and pop() from this list on every new call to `workerInstance()` in https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts. I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this.**
+
 
 ### Technical Notes
 
 Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
@@ -144,7 +212,8 @@ Use se-scraper by calling it with a script such as the one below.
const se_scraper = require('se-scraper'); const resolve = require('path').resolve; -let config = { +// options for scraping +event = { // the user agent to scrape with user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', // if random_user_agent is set to True, a random user agent is chosen @@ -162,7 +231,7 @@ let config = { search_engine: 'google', compress: false, // compress debug: false, - verbose: false, + verbose: true, keywords: ['scrapeulous.com'], // whether to start the browser in headless mode headless: true, @@ -178,13 +247,16 @@ let config = { // get_browser, handle_metadata, close_browser //custom_func: resolve('examples/pluggable.js'), custom_func: '', - // use a proxy for all connections - // example: 'socks5://78.94.172.42:1080' - // example: 'http://118.174.233.10:48400' - proxy: '', + // path to a proxy file, one proxy per line. Example: + // socks5://78.94.172.42:1080 + // http://118.174.233.10:48400 + proxy_file: '', + proxies: [], // check if headless chrome escapes common detection techniques // this is a quick test and should be used for debugging test_evasion: false, + // settings for puppeteer-cluster + monitor: false, }; function callback(err, response) { diff --git a/TODO.txt b/TODO.txt index 44a05cc..1da5957 100644 --- a/TODO.txt +++ b/TODO.txt @@ -27,19 +27,21 @@ 30.1.2019 - - modify all scrapers to use the generic class where it makes sense - Bing, Baidu, Google, Duckduckgo 7.2.2019 - add num_requests to test cases [done] - +25.2.2019 + - https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html + - add support for browsing with multiple browsers, use this neat library: + - https://github.com/thomasdondorf/puppeteer-cluster [done] TODO: + - write test case for proxy support and cluster support - add captcha service solving support - check if news instances run the same browser and if we can have one proxy per tab wokers - write test case for: - pluggable - - full metadata (log http headers, log ip address) diff --git a/data.json b/data.json new file mode 100644 index 0000000..a7219e5 --- /dev/null +++ b/data.json @@ -0,0 +1 @@ +{"news":{"1":{"time":"Wed, 27 Feb 2019 19:51:57 GMT","num_results":"Ungefähr 13.440.000.000 Ergebnisse (0,31 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://news.google.de/","title":"Google Newshttps://news.google.de/Ähnliche Seiten","snippet":"Ausführliche und aktuelle Beiträge - von Google News aus verschiedenen Nachrichtenquellen aus aller Welt zusammengetragen.","visible_link":"https://news.google.de/","date":"","rank":1},{"link":"https://www.bild.de/news/startseite/news/news-16804530.bild.html","title":"News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html","snippet":"vor 2 Tagen - Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei ...","visible_link":"https://www.bild.de/news/startseite/news/news-16804530.bild.html","date":"vor 2 Tagen - ","rank":2},{"link":"https://www.zeit.de/news/index","title":"Schlagzeilen, News und Newsticker | ZEIT ONLINE - Die Zeithttps://www.zeit.de/news/index","snippet":"Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. 
Lesen Sie hier die neuesten Nachrichten.","visible_link":"https://www.zeit.de/news/index","date":"","rank":3},{"link":"http://www.news.de/","title":"news.de - mehr als Nachrichten und News, die Sie bewegenwww.news.de/Ähnliche Seiten","snippet":"Promi News und Aktuelles aus Sport, TV & Web. Jetzt Sportnachrichten von Fußball bis Boxen und das Neueste aus Klatsch und Tratsch per Newsticker, Fotos ...","visible_link":"www.news.de/","date":"","rank":4},{"link":"https://www.rtl.de/cms/news.html","title":"News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html","snippet":"Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.","visible_link":"https://www.rtl.de/cms/news.html","date":"","rank":5},{"link":"https://www.t-online.de/nachrichten/","title":"Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/","snippet":"Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.","visible_link":"https://www.t-online.de/nachrichten/","date":"","rank":6},{"link":"https://www.mopo.de/news","title":"News - Aktuelle Nachrichten aus Deutschland und der Welt. | MOPO.dehttps://www.mopo.de/news","snippet":"News - Aktuelle Nachrichten aus Hamburg, der Welt, zum HSV und der Welt der Promis.","visible_link":"https://www.mopo.de/news","date":"","rank":7},{"link":"https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade","title":"Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...","snippet":"Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.","visible_link":"https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...","date":"","rank":8},{"link":"https://www.stern.de/news/","title":"News - Sternhttps://www.stern.de/news/Im Cache","snippet":"News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.","visible_link":"https://www.stern.de/news/","date":"","rank":9}]}},"se-scraper":{"1":{"time":"Wed, 27 Feb 2019 19:52:00 GMT","num_results":"Ungefähr 16.100.000 Ergebnisse (0,19 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://www.npmjs.com/package/se-scraper","title":"se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen","snippet":"07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.","visible_link":"https://www.npmjs.com/package/se-scraper","date":"07.02.2019 - ","rank":1},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"Javascript scraping module based on puppeteer for many different search engines... - NikolaiT/se-scraper.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"","rank":2},{"link":"https://swedishicescraper.se/","title":"Swedish Ice Scraper: Onlinehttps://swedishicescraper.se/Im CacheDiese Seite übersetzen","snippet":"The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... 
info@swedishicescraper.se.","visible_link":"https://swedishicescraper.se/","date":"","rank":3},{"link":"http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html","title":"Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html","snippet":"Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.","visible_link":"konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html","date":"","rank":4},{"link":"https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/","title":"Any yandex scrapers available? Or universal SE scraper ...https://www.blackhatworld.com › ... › Black Hat SEO ToolsIm CacheDiese Seite übersetzen","snippet":"10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...","visible_link":"https://www.blackhatworld.com › ... › Black Hat SEO Tools","date":"10.10.2010 - ","rank":5},{"link":"https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html","title":"FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen","snippet":"FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...","visible_link":"https://www.friatec.de/content/friatec/en/...tools/index.html","date":"","rank":6},{"link":"https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Downloads/index.html","title":"Downloads - Friatechttps://www.friatec.de/content/friatec/en/Technical.../index.htmlIm CacheDiese Seite übersetzen","snippet":"Compact Scraper Tool for pipe ends and outlets FWSG RA · Scraper Tool for pipe ends and saddle surfaces (FWSG SE). Brochures Mechanical Tools.","visible_link":"https://www.friatec.de/content/friatec/en/Technical.../index.html","date":"","rank":7},{"link":"https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE","title":"Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache","snippet":"Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...","visible_link":"https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE","date":"","rank":8}]}}} \ No newline at end of file diff --git a/examples/per_page_proxy.js b/examples/per_page_proxy.js new file mode 100644 index 0000000..8cfe938 --- /dev/null +++ b/examples/per_page_proxy.js @@ -0,0 +1,76 @@ +const puppeteer = require('puppeteer'); +const ProxyChain = require('proxy-chain'); + +const ROUTER_PROXY = 'http://127.0.0.1:8000'; + +// SEE: https://github.com/GoogleChrome/puppeteer/issues/678 +// Idea is: Setup a local router proxy that assigns requests identified by unique user-agent strings +// distinct upstream proxies. With this way it is possible to use one proxy per chromium tab. 
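+//
+// Sketch of the flow implemented below: each tab sends its requests through the
+// local router proxy (proxy-chain); the router matches the User-Agent header of
+// every request against the uas list and forwards it to the matching upstream proxy.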
+// downside: not fast and efficient + +const uas = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', +]; + +const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181']; + +(async () => { + const browser = await puppeteer.launch({ + headless: false, + args: [`--proxy-server=${ROUTER_PROXY}`], + }); + const page1 = await browser.newPage(); + const page2 = await browser.newPage(); + + try { + await page1.setUserAgent(uas[0]); + await page1.goto('https://www.whatsmyip.org/'); + } catch (e) { + console.log(e); + } + + try { + await page2.setUserAgent(uas[1]); + await page2.goto('https://www.whatsmyip.org/'); + } catch (e) { + console.log(e); + } + + //await browser.close(); +})(); + +const server = new ProxyChain.Server({ + // Port where the server the server will listen. By default 8000. + port: 8000, + + // Enables verbose logging + verbose: true, + + prepareRequestFunction: ({ + request, + username, + password, + hostname, + port, + isHttp, + }) => { + var upstreamProxyUrl; + + if (request.headers['user-agent'] === uas[0]) { + upstreamProxyUrl = proxies[0]; + } + + if (request.headers['user-agent'] === uas[1]) { + upstreamProxyUrl = proxies[1]; + } + + console.log('Using proxy: ' + upstreamProxyUrl); + + return { upstreamProxyUrl }; + }, +}); + +server.listen(() => { + console.log(`Router Proxy server is listening on port ${8000}`); +}); \ No newline at end of file diff --git a/examples/proxies.js b/examples/proxies.js new file mode 100644 index 0000000..46a082a --- /dev/null +++ b/examples/proxies.js @@ -0,0 +1,19 @@ +const se_scraper = require('./../index.js'); + +let config = { + search_engine: 'google', + debug: false, + verbose: false, + keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much'], + num_pages: 1, + output_file: 'data.json', + proxy_file: '/home/nikolai/.proxies', // one proxy per line + log_ip_address: true, +}; + +function callback(err, response) { + if (err) { console.error(err) } + console.dir(response, {depth: null, colors: true}); +} + +se_scraper.scrape(config, callback); \ No newline at end of file diff --git a/examples/quickstart.js b/examples/quickstart.js index de4cdbf..223fec6 100644 --- a/examples/quickstart.js +++ b/examples/quickstart.js @@ -1,11 +1,11 @@ const se_scraper = require('./../index.js'); let config = { - search_engine: 'duckduckgo', + search_engine: 'google', debug: false, verbose: false, - keywords: ['news'], - num_pages: 2, + keywords: ['news', 'se-scraper'], + num_pages: 1, output_file: 'data.json', }; diff --git a/examples/test_cluster.js b/examples/test_cluster.js new file mode 100644 index 0000000..5967094 --- /dev/null +++ b/examples/test_cluster.js @@ -0,0 +1,86 @@ +const { Cluster } = require('../../puppeteer-cluster/dist/index.js'); +var fs = require('fs'); +var os = require("os"); + +const PROXY_FILE = '/home/nikolai/.proxies'; + +function read_items_from_file(fname) { + let kws = fs.readFileSync(fname).toString().split(os.EOL); + // clean keywords + kws = kws.filter((kw) => { + return kw.trim().length > 0; + }); + return kws; +} + +(async () => { + + let browserArgs = [ + '--disable-infobars', + '--window-position=0,0', + '--ignore-certifcate-errors', + '--ignore-certifcate-errors-spki-list', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + 
'--disable-accelerated-2d-canvas', + '--disable-gpu', + '--window-size=1920x1080', + '--hide-scrollbars', + ]; + + let proxies = read_items_from_file(PROXY_FILE); + + console.dir(proxies); + + // each new call to workerInstance() will + // left pop() one element from this list + // maxConcurrency should be equal to perBrowserOptions.length + + // the first browser config with home IP + let perBrowserOptions = [{ + headless: false, + ignoreHTTPSErrors: true, + args: browserArgs + }]; + + for (var proxy of proxies) { + perBrowserOptions.push({ + headless: false, + ignoreHTTPSErrors: true, + args: browserArgs.concat(`--proxy-server=${proxy}`) + }) + } + + const cluster = await Cluster.launch({ + monitor: true, + concurrency: Cluster.CONCURRENCY_BROWSER, + maxConcurrency: perBrowserOptions.length, + puppeteerOptions: { + headless: false, + args: browserArgs, + ignoreHTTPSErrors: true, + }, + perBrowserOptions: perBrowserOptions + }); + + // Event handler to be called in case of problems + cluster.on('taskerror', (err, data) => { + console.log(`Error crawling ${data}: ${err.message}`); + }); + + + await cluster.task(async ({ page, data: url }) => { + await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000}); + const pageTitle = await page.evaluate(() => document.title); + console.log(`Page title of ${url} is ${pageTitle}`); + console.log(await page.content()); + }); + + for(var i = 0; i < perBrowserOptions.length; i++) { + await cluster.queue('http://ipinfo.io/json'); + } + + await cluster.idle(); + await cluster.close(); +})(); diff --git a/examples/test_promise.js b/examples/test_promise.js new file mode 100644 index 0000000..353e538 --- /dev/null +++ b/examples/test_promise.js @@ -0,0 +1,40 @@ +class Test { + constructor(options = {}) { + const { + config = {}, + } = options; + + this.config = config; + } + + run(vars) { + + console.log(this.config) + } +} + +let o1 = new Test({config: {a: Math.random()}}); +let o2 = new Test({config: {a: Math.random()}}); + +o1.run() +o2.run() + +// (async () => { +// +// let prom = []; +// +// for (var i = 0; i < 3; i++) { +// var obj = new Test({ +// config: {a: Math.random()}, +// }); +// prom.push(new Promise(resolve => { +// setTimeout(() => { new Test({ +// config: {a: Math.random()}, +// }).run(); resolve() }, 1000); +// })); +// } +// +// let res = await Promise.all(prom); +// console.log(res); +// +// })(); \ No newline at end of file diff --git a/index.js b/index.js index 76961a4..8190b19 100644 --- a/index.js +++ b/index.js @@ -4,78 +4,88 @@ var os = require("os"); exports.scrape = async function(config, callback) { - // options for scraping - event = { - // the user agent to scrape with - user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', - // if random_user_agent is set to True, a random user agent is chosen - random_user_agent: true, - // whether to select manual settings in visible mode - set_manual_settings: false, - // log ip address data - log_ip_address: false, - // log http headers - log_http_headers: false, - // how long to sleep between requests. a random sleep interval within the range [a,b] - // is drawn before every request. empty string for no sleeping. 
- sleep_range: '[1,1]', - // which search engine to scrape - search_engine: 'google', - compress: false, // compress - debug: false, - verbose: false, - keywords: ['scrapeulous.com'], - // whether to start the browser in headless mode - headless: true, - // the number of pages to scrape for each keyword - num_pages: 1, - // path to output file, data will be stored in JSON - output_file: '', - // whether to prevent images, css, fonts and media from being loaded - // will speed up scraping a great deal - block_assets: true, - // path to js module that extends functionality - // this module should export the functions: - // get_browser, handle_metadata, close_browser - //custom_func: resolve('examples/pluggable.js'), - custom_func: '', - // use a proxy for all connections - // example: 'socks5://78.94.172.42:1080' - // example: 'http://118.174.233.10:48400' - proxy: '', - // check if headless chrome escapes common detection techniques - // this is a quick test and should be used for debugging - test_evasion: false, - }; + // options for scraping + event = { + // the user agent to scrape with + user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + // if random_user_agent is set to True, a random user agent is chosen + random_user_agent: true, + // whether to select manual settings in visible mode + set_manual_settings: false, + // log ip address data + log_ip_address: false, + // log http headers + log_http_headers: false, + // how long to sleep between requests. a random sleep interval within the range [a,b] + // is drawn before every request. empty string for no sleeping. + sleep_range: '[1,1]', + // which search engine to scrape + search_engine: 'google', + compress: false, // compress + debug: false, + verbose: true, + keywords: ['scrapeulous.com'], + // whether to start the browser in headless mode + headless: true, + // the number of pages to scrape for each keyword + num_pages: 1, + // path to output file, data will be stored in JSON + output_file: '', + // whether to prevent images, css, fonts and media from being loaded + // will speed up scraping a great deal + block_assets: true, + // path to js module that extends functionality + // this module should export the functions: + // get_browser, handle_metadata, close_browser + //custom_func: resolve('examples/pluggable.js'), + custom_func: '', + // path to a proxy file, one proxy per line. 
Example: + // socks5://78.94.172.42:1080 + // http://118.174.233.10:48400 + proxy_file: '', + proxies: [], + // check if headless chrome escapes common detection techniques + // this is a quick test and should be used for debugging + test_evasion: false, + // settings for puppeteer-cluster + monitor: false, + }; - // overwrite default config - for (var key in config) { - event[key] = config[key]; - } + // overwrite default config + for (var key in config) { + event[key] = config[key]; + } - if (fs.existsSync(event.keyword_file)) { - event.keywords = read_keywords_from_file(event.keyword_file); - } + if (fs.existsSync(event.keyword_file)) { + event.keywords = read_keywords_from_file(event.keyword_file); + } - if (!callback) { - // called when results are ready - callback = function (err, response) { - if (err) { - console.error(err) - } + if (fs.existsSync(event.proxy_file)) { + event.proxies = read_keywords_from_file(event.proxy_file); + if (event.verbose) { + console.log(`${event.proxies.length} proxies loaded.`); + } + } - console.dir(response.results, {depth: null, colors: true}); - } - } + if (!callback) { + // called when results are ready + callback = function (err, response) { + if (err) { + console.error(err) + } - await handler.handler(event, undefined, callback ); + console.dir(response.results, {depth: null, colors: true}); + } + } + + await handler.handler(event, undefined, callback ); }; function read_keywords_from_file(fname) { - let kws = fs.readFileSync(fname).toString().split(os.EOL); - // clean keywords - kws = kws.filter((kw) => { - return kw.trim().length > 0; - }); - return kws; + let kws = fs.readFileSync(fname).toString().split(os.EOL); + // clean keywords + kws = kws.filter((kw) => { + return kw.trim().length > 0; + }); + return kws; } diff --git a/package-lock.json b/package-lock.json index f6e8213..25fb7f7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.1.12", + "version": "1.1.14", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -45,6 +45,11 @@ "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" }, + "bluebird": { + "version": "3.5.3", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", + "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==" + }, "boolbase": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", @@ -117,6 +122,11 @@ "mimic-response": "^1.0.0" } }, + "commander": { + "version": "2.19.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.19.0.tgz", + "integrity": "sha512-6tvAOO+D6OENvRAh524Dh9jcfKTYDQAqvqezbCW82xj5X0pSrcpxtvRKHLG0yBY6SD7PSDrJaj+0AiOcKVd1Xg==" + }, "concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", @@ -124,7 +134,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "requires": { "buffer-from": "^1.0.0", @@ -135,7 +145,7 @@ "dependencies": { "readable-stream": { "version": "2.3.6", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "resolved": 
"http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", "requires": { "core-util-is": "~1.0.0", @@ -149,7 +159,7 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "requires": { "safe-buffer": "~5.1.0" @@ -264,13 +274,13 @@ "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" }, "es6-promise": { - "version": "4.2.5", - "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.5.tgz", - "integrity": "sha512-n6wvpdE43VFtJq+lUDYDBFUwV8TZbuGXLV4D6wKafg13ldznKsyEvatubnmUe31zcvelSzOHF+XbaT+Bl9ObDg==" + "version": "4.2.6", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.6.tgz", + "integrity": "sha512-aRVgGdnmW2OiySVPUC9e6m+plolMAJKjZnQlCwNSuK5yQ0JN61DZSO1X1Ufd1foqWRAlig0rhduTCHe7sVtK5Q==" }, "es6-promisify": { "version": "5.0.0", - "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "requires": { "es6-promise": "^4.0.3" @@ -458,12 +468,12 @@ }, "minimist": { "version": "0.0.8", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" }, "mkdirp": { "version": "0.5.1", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", "requires": { "minimist": "0.0.8" @@ -510,7 +520,7 @@ }, "path-is-absolute": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" }, "pathval": { @@ -523,6 +533,36 @@ "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" }, + "portastic": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/portastic/-/portastic-1.0.1.tgz", + "integrity": "sha1-HJgF1D+uj2pAzw28d5QJGi6dDSo=", + "requires": { + "bluebird": "^2.9.34", + "commander": "^2.8.1", + "debug": "^2.2.0" + }, + "dependencies": { + "bluebird": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz", + "integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE=" + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, "prepend-http": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz", @@ -538,6 +578,16 @@ "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", "integrity": 
"sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" }, + "proxy-chain": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/proxy-chain/-/proxy-chain-0.2.7.tgz", + "integrity": "sha512-e0s94WDfooeC3zQkvIJ/Eudiy/AywTQK4K6PMYbZdBE2m/eug54ThgCPdBE4txHvzi0A0gAVbX04Kt4RygTlRQ==", + "requires": { + "bluebird": "^3.5.1", + "portastic": "^1.0.1", + "underscore": "^1.9.1" + } + }, "proxy-from-env": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", @@ -567,6 +617,14 @@ "ws": "^6.1.0" } }, + "puppeteer-cluster": { + "version": "0.13.0", + "resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.13.0.tgz", + "integrity": "sha512-en9F6cHkj1tLucFz9q3BtrvVKxGxIR1cWZgcpKyjXJUElBbNahaUErrz7jGa6edVQJfqTrdF40mkDqIOZNJUhg==", + "requires": { + "debug": "^4.1.1" + } + }, "readable-stream": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz", @@ -621,6 +679,11 @@ "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" }, + "underscore": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz", + "integrity": "sha512-5/4etnCkd9c8gwgowi5/om/mYO5ajCaOgdzj/oW+0eQV9WxKBDZw5+ycmKmeaTXjInS/W0BzpGLo2xR2aBwZdg==" + }, "url-parse-lax": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz", @@ -640,9 +703,9 @@ "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" }, "ws": { - "version": "6.1.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-6.1.3.tgz", - "integrity": "sha512-tbSxiT+qJI223AP4iLfQbkbxkwdFcneYinM2+x46Gx2wgvbaOMO36czfdfVUBRTHvzAMRhDd98sA5d/BuWbQdg==", + "version": "6.1.4", + "resolved": "https://registry.npmjs.org/ws/-/ws-6.1.4.tgz", + "integrity": "sha512-eqZfL+NE/YQc1/ZynhojeV8q+H050oR8AZ2uIev7RU10svA9ZnJUddHcOUZTJLinZ9yEfdA2kSATS2qZK5fhJA==", "requires": { "async-limiter": "~1.0.0" } diff --git a/package.json b/package.json index 380e2f2..1980429 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.1.13", + "version": "1.2.0", "description": "A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "homepage": "https://scrapeulous.com/", "main": "index.js", @@ -11,6 +11,7 @@ "scraping", "search-engines", "google", + "bing", "web-scraping" ], "author": "Nikolai Tschacher (https://incolumitas.com/)", @@ -23,6 +24,8 @@ "chai": "^4.2.0", "cheerio": "^1.0.0-rc.2", "got": "^9.6.0", - "puppeteer": "^1.12.2" + "proxy-chain": "^0.2.7", + "puppeteer": "^1.12.2", + "puppeteer-cluster": "^0.13.0" } } diff --git a/run.js b/run.js index 9983711..5e16a2c 100644 --- a/run.js +++ b/run.js @@ -1,5 +1,4 @@ const se_scraper = require('./index.js'); -const resolve = require('path').resolve; let config = { // the user agent to scrape with @@ -18,13 +17,13 @@ let config = { // this output is informational verbose: true, // an array of keywords to scrape - keywords: ['news'], + keywords: ['news', 'abc', 'good', 'bad', 'better', 'one more', 'time', 'we are going'], // alternatively you can specify a keyword_file. 
this overwrites the keywords array keyword_file: '', // the number of pages to scrape for each keyword num_pages: 1, // whether to start the browser in headless mode - headless: true, + headless: false, // path to output file, data will be stored in JSON output_file: 'data.json', // whether to prevent images, css, fonts from being loaded @@ -40,13 +39,17 @@ let config = { // example: 'socks5://78.94.172.42:1080' // example: 'http://118.174.233.10:48400' proxy: '', + // a file with one proxy per line. Example: + // socks5://78.94.172.42:1080 + // http://118.174.233.10:48400 + proxy_file: '/home/nikolai/.proxies', // check if headless chrome escapes common detection techniques // this is a quick test and should be used for debugging test_evasion: false, // log ip address data log_ip_address: true, // log http headers - log_http_headers: true, + log_http_headers: false, }; function callback(err, response) { diff --git a/src/modules/google.js b/src/modules/google.js index a9f6c8d..02aa78e 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -3,6 +3,10 @@ const Scraper = require('./se_scraper'); class GoogleScraper extends Scraper { + constructor(...args) { + super(...args); + } + parse(html) { // load the page source into cheerio const $ = cheerio.load(html); @@ -75,7 +79,6 @@ class GoogleScraper extends Scraper { return false; } await next_page_link.click(); - await this.page.waitForNavigation(); return true; } @@ -153,13 +156,11 @@ class GoogleNewsOldScraper extends Scraper { return false; } await next_page_link.click(); - await this.page.waitForNavigation(); return true; } async wait_for_results() { - //await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT }); await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT }); await this.sleep(500); } diff --git a/src/modules/metadata.js b/src/modules/metadata.js index 1c7c5ee..d4016db 100644 --- a/src/modules/metadata.js +++ b/src/modules/metadata.js @@ -5,11 +5,10 @@ module.exports = { get_http_headers: get_http_headers, }; -async function get_ip_data(browser) { - const page = await browser.newPage(); +async function get_ip_data(page) { await page.goto('https://ipinfo.io/json', { waitLoad: true, - waitNetworkIdle: true // defaults to false + waitNetworkIdle: true }); let json = await page.content({ timeout: 20000 @@ -19,11 +18,10 @@ async function get_ip_data(browser) { return JSON.parse(ipinfo_text); } -async function get_http_headers(browser) { - const page = await browser.newPage(); +async function get_http_headers(page) { await page.goto('https://httpbin.org/get', { waitLoad: true, - waitNetworkIdle: true // defaults to false + waitNetworkIdle: true }); let headers = await page.content(); diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 0b40c61..a0b9daa 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -1,6 +1,4 @@ -const start_url = { - 'google': '' -}; +const meta = require('./metadata.js'); /* Get useful JS knowledge and get awesome... 
@@ -12,17 +10,19 @@ const start_url = { module.exports = class Scraper { constructor(options = {}) { const { - browser = null, config = {}, context = {}, pluggable = null, } = options; + this.page = null; + this.metadata = {}; this.pluggable = pluggable; - this.browser = browser; this.config = config; this.context = context; + this.keywords = config.keywords; + this.STANDARD_TIMEOUT = 8000; // longer timeout when using proxies this.PROXY_TIMEOUT = 15000; @@ -36,7 +36,9 @@ module.exports = class Scraper { this.num_keywords = 0; } - async run() { + async run({page, data}) { + + this.page = page; let do_continue = await this.load_search_engine(); @@ -58,8 +60,6 @@ module.exports = class Scraper { */ async load_search_engine() { - this.page = await this.browser.newPage(); - // prevent detection by evading common detection techniques await evadeChromeHeadlessDetection(this.page); @@ -87,6 +87,32 @@ module.exports = class Scraper { await this.page.screenshot({path: 'headless-test-result.png'}); } + if (this.config.log_http_headers === true) { + this.metadata.http_headers = await meta.get_http_headers(this.page); + console.log(this.metadata.http_headers); + } + + if (this.config.log_ip_address === true) { + this.metadata.ipinfo = await meta.get_ip_data(this.page); + console.log(this.metadata.ipinfo); + } + + // check that our proxy is working by confirming + // that ipinfo.io sees the proxy IP address + if (this.config.proxy && this.config.log_ip_address === true) { + console.log(`${this.metadata.ipinfo} vs ${this.config.proxy}`); + + try { + // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here + if (!this.config.proxy.includes(this.metadata.ipinfo.ip)) { + console.error('Proxy not working properly.'); + return false; + } + } catch (exception) { + + } + } + return await this.load_start_page(); } @@ -98,7 +124,7 @@ module.exports = class Scraper { * @returns {Promise} */ async scraping_loop() { - for (let keyword of this.config.keywords) { + for (var keyword of this.keywords) { this.num_keywords++; this.keyword = keyword; this.results[keyword] = {}; @@ -106,6 +132,7 @@ module.exports = class Scraper { if (this.pluggable.before_keyword_scraped) { await this.pluggable.before_keyword_scraped({ + results: this.results, num_keywords: this.num_keywords, num_requests: this.num_requests, keyword: keyword, diff --git a/src/node_scraper.js b/src/node_scraper.js index f737f18..0cb5494 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -1,4 +1,4 @@ -const puppeteer = require('puppeteer'); +const { Cluster } = require('./puppeteer-cluster/dist/index.js'); const zlib = require('zlib'); var fs = require('fs'); @@ -9,283 +9,274 @@ const baidu = require('./modules/baidu.js'); const infospace = require('./modules/infospace.js'); const youtube = require('./modules/youtube.js'); const ua = require('./modules/user_agents.js'); -const meta = require('./modules/metadata.js'); const duckduckgo = require('./modules/duckduckgo.js'); const tickersearch = require('./modules/ticker_search.js'); function write_results(fname, data) { - fs.writeFileSync(fname, data, (err) => { - if (err) throw err; - console.log(`Results written to file ${fname}`); - }); + fs.writeFileSync(fname, data, (err) => { + if (err) throw err; + console.log(`Results written to file ${fname}`); + }); +} + +function getScraper(searchEngine, args) { + return new { + google: google.GoogleScraper, + google_news_old: google.GoogleNewsOldScraper, + google_news: google.GoogleNewsScraper, + google_image: 
google.GoogleImageScraper, + bing: bing.BingScraper, + bing_news: bing.BingNewsScraper, + duckduckgo: duckduckgo.DuckduckgoScraper, + duckduckgo_news: duckduckgo.DuckduckgoNewsScraper, + infospace: infospace.InfospaceScraper, + webcrawler: infospace.WebcrawlerNewsScraper, + baidu: baidu.BaiduScraper, + youtube: youtube.YoutubeScraper, + yahoo_news: tickersearch.YahooFinanceScraper, + reuters: tickersearch.ReutersFinanceScraper, + cnbc: tickersearch.CnbcFinanceScraper, + marketwatch: tickersearch.MarketwatchFinanceScraper, + }[searchEngine](args); } module.exports.handler = async function handler (event, context, callback) { - config = event; - pluggable = {}; - if (config.custom_func) { - if (fs.existsSync(config.custom_func)) { - try { - Pluggable = require(config.custom_func); - pluggable = new Pluggable({config: config}); - } catch (exception) { - console.error(exception); - } - } else { - console.error(`File "${config.custom_func}" does not exist...`); - } - } + config = event; + pluggable = {}; + if (config.custom_func) { + if (fs.existsSync(config.custom_func)) { + try { + Pluggable = require(config.custom_func); + pluggable = new Pluggable({config: config}); + } catch (exception) { + console.error(exception); + } + } else { + console.error(`File "${config.custom_func}" does not exist...`); + } + } - try { - const startTime = Date.now(); - config = parseEventData(config); - if (config.debug === true) { - console.log(config); - } + try { + const startTime = Date.now(); + config = parseEventData(config); + if (config.debug === true) { + console.log(config); + } var ADDITIONAL_CHROME_FLAGS = [ - '--disable-infobars', - '--window-position=0,0', - '--ignore-certifcate-errors', - '--ignore-certifcate-errors-spki-list', - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1920x1080', + '--disable-infobars', + '--window-position=0,0', + '--ignore-certifcate-errors', + '--ignore-certifcate-errors-spki-list', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-accelerated-2d-canvas', + '--disable-gpu', + '--window-size=1920x1080', '--hide-scrollbars', ]; - let USER_AGENT = ''; + var user_agent = undefined; if (config.user_agent) { - USER_AGENT = config.user_agent; - } + user_agent = config.user_agent; + } - if (config.random_user_agent === true) { - USER_AGENT = ua.random_user_agent(); - } + if (config.random_user_agent === true) { + user_agent = ua.random_user_agent(); + } - if (USER_AGENT) { - ADDITIONAL_CHROME_FLAGS.push( - `--user-agent="${USER_AGENT}"` - ) - } + if (user_agent) { + ADDITIONAL_CHROME_FLAGS.push( + `--user-agent="${user_agent}"` + ) + } if (config.proxy) { - // check this out bubbles - // https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/ - // [://][:] - // "http", "socks", "socks4", "socks5". - ADDITIONAL_CHROME_FLAGS.push( - '--proxy-server=' + config.proxy, - ) - } + // https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/ + // [://][:] + // "http", "socks", "socks4", "socks5". 
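+        // i.e. [<proxy-scheme>://]<proxy-host>[:<proxy-port>],
+        // for example: --proxy-server=socks5://78.94.172.42:1080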
+ ADDITIONAL_CHROME_FLAGS.push( + '--proxy-server=' + config.proxy, + ) + } let launch_args = { - args: ADDITIONAL_CHROME_FLAGS, - headless: config.headless, - ignoreHTTPSErrors: true, - }; + args: ADDITIONAL_CHROME_FLAGS, + headless: config.headless, + ignoreHTTPSErrors: true, + }; - if (config.debug === true) { - console.log("Chrome Args: ", launch_args); - } + if (config.debug === true) { + console.log("Chrome Args: ", launch_args); + } if (pluggable.start_browser) { - launch_args.config = config; - browser = await pluggable.start_browser(launch_args); - } else { - browser = await puppeteer.launch(launch_args); - } + launch_args.config = config; + browser = await pluggable.start_browser(launch_args); + } else { + var numClusters = config.proxies.length + 1; - let metadata = {}; + // the first browser config with home IP + let perBrowserOptions = [launch_args, ]; - if (config.log_http_headers === true) { - metadata.http_headers = await meta.get_http_headers(browser); - } + for (var proxy of config.proxies) { + perBrowserOptions.push({ + headless: config.headless, + ignoreHTTPSErrors: true, + args: ADDITIONAL_CHROME_FLAGS.concat(`--proxy-server=${proxy}`) + }) + } - if (config.log_ip_address === true) { - metadata.ipinfo = await meta.get_ip_data(browser); - } + var cluster = await Cluster.launch({ + monitor: config.monitor, + timeout: 30 * 60 * 1000, // max timeout set to 30 minutes + concurrency: Cluster.CONCURRENCY_BROWSER, + maxConcurrency: numClusters, + puppeteerOptions: launch_args, + perBrowserOptions: perBrowserOptions + }); - // check that our proxy is working by confirming - // that ipinfo.io sees the proxy IP address - if (config.proxy && config.log_ip_address === true) { - console.log(`${metadata.ipinfo} vs ${config.proxy}`); + cluster.on('taskerror', (err, data) => { + console.log(`Error while scraping ${data}: ${err.message}`); + console.log(err) + }); + } - try { - // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here - if (!config.proxy.includes(metadata.ipinfo.ip)) { - console.error('Proxy not working properly.'); - await browser.close(); - return; - } - } catch (exception) { + let metadata = {}; - } - } + // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine. + // https://github.com/GoogleChrome/puppeteer/issues/678 + // The question is: Is it possible to set proxies per Page? Per Browser? + // as far as I can see, puppeteer cluster uses the same puppeteerOptions + // for every browser instance. We will use our custom puppeteer-cluster version. 
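+	// Worked example of the chunking below: with 3 proxies and 8 keywords on one
+	// page, numClusters = 4 browsers are launched and each browser receives
+	// 2 keywords, issuing 2 requests in total.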
+ // https://www.npmjs.com/package/proxy-chain + // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077 + let chunks = []; + for (var n = 0; n < numClusters; n++) { + chunks.push([]); + } + for (var k = 0; k < config.keywords.length; k++) { + chunks[k%numClusters].push(config.keywords[k]); + } + //console.log(`Generated ${chunks.length} chunks...`); - var results = {}; + let execPromises = []; + let scraperInstances = []; + for (var c = 0; c < chunks.length; c++) { + config.keywords = chunks[c]; + if (c>0) { + config.proxy = config.proxies[c]; + } + obj = getScraper(config.search_engine, { + config: config, + context: context, + pluggable: pluggable, + }); + var boundMethod = obj.run.bind(obj); + execPromises.push(cluster.execute({}, boundMethod)); + scraperInstances.push(obj); + } - Scraper = { - google: google.GoogleScraper, - google_news_old: google.GoogleNewsOldScraper, - google_news: google.GoogleNewsScraper, - google_image: google.GoogleImageScraper, - bing: bing.BingScraper, - bing_news: bing.BingNewsScraper, - duckduckgo: duckduckgo.DuckduckgoScraper, - duckduckgo_news: duckduckgo.DuckduckgoNewsScraper, - infospace: infospace.InfospaceScraper, - webcrawler: infospace.WebcrawlerNewsScraper, - baidu: baidu.BaiduScraper, - youtube: youtube.YoutubeScraper, - yahoo_news: tickersearch.YahooFinanceScraper, - reuters: tickersearch.ReutersFinanceScraper, - cnbc: tickersearch.CnbcFinanceScraper, - marketwatch: tickersearch.MarketwatchFinanceScraper, - }[config.search_engine]; + let results = await Promise.all(execPromises); + results = results[0]; // TODO: this is strange. fix that shit boy - if (Scraper === undefined) { - console.info('Currently not implemented search_engine: ', config.search_engine); - } else { - scraperObj = new Scraper({ - browser: browser, - config: config, - context: context, - pluggable: pluggable, - }); - results = await scraperObj.run(); - } + if (pluggable.close_browser) { + await pluggable.close_browser(); + } else { + await cluster.idle(); + await cluster.close(); + } - if (pluggable.close_browser) { - await pluggable.close_browser(); - } else { - await browser.close(); - } + // count total requests among all scraper instances + let num_requests = 0; + for (var o of scraperInstances) { + num_requests += o.num_requests; + } - let num_requests = scraperObj.num_requests; - let timeDelta = Date.now() - startTime; - let ms_per_request = timeDelta/num_requests; + let timeDelta = Date.now() - startTime; + let ms_per_request = timeDelta/num_requests; - if (config.verbose === true) { - console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); - console.log(`On average ms/request: ${ms_per_request}ms/request`); - console.dir(results, {depth: null, colors: true}); - } + if (config.verbose === true) { + console.log(`${numClusters} Scraper Workers took ${timeDelta}ms to perform ${num_requests} requests.`); + console.log(`On average ms/request: ${ms_per_request}ms/request`); + console.dir(results, {depth: null, colors: true}); + } - if (config.compress === true) { - results = JSON.stringify(results); - // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding - results = zlib.deflateSync(results).toString('base64'); - } + if (config.compress === true) { + results = JSON.stringify(results); + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding + results = zlib.deflateSync(results).toString('base64'); + } - if (pluggable.handle_results) { - await 
pluggable.handle_results({ - config: config, - results: results, - }); - } + if (pluggable.handle_results) { + await pluggable.handle_results({ + config: config, + results: results, + }); + } - metadata.id = `${config.job_name} ${config.chunk_lines}`; - metadata.chunk_lines = config.chunk_lines; - metadata.elapsed_time = timeDelta.toString(); - metadata.ms_per_keyword = ms_per_request.toString(); - metadata.num_requests = num_requests; + metadata.id = `${config.job_name} ${config.chunk_lines}`; + metadata.chunk_lines = config.chunk_lines; + metadata.elapsed_time = timeDelta.toString(); + metadata.ms_per_keyword = ms_per_request.toString(); + metadata.num_requests = num_requests; - if (config.verbose === true) { - console.log(metadata); - } + if (config.verbose === true) { + console.log(metadata); + } - if (pluggable.handle_metadata) { - await pluggable.handle_metadata({metadata: metadata, config: config}); - } + if (pluggable.handle_metadata) { + await pluggable.handle_metadata({metadata: metadata, config: config}); + } - if (config.output_file) { - write_results(config.output_file, JSON.stringify(results)); - } + if (config.output_file) { + write_results(config.output_file, JSON.stringify(results)); + } - let response = { - headers: { - 'Content-Type': 'text/json', - }, - results: results, - metadata: metadata || {}, - statusCode: 200 - }; + let response = { + headers: { + 'Content-Type': 'text/json', + }, + results: results, + metadata: metadata || {}, + statusCode: 200 + }; - callback(null, response); + callback(null, response); - } catch (e) { - callback(e, null); - } + } catch (e) { + callback(e, null); + } }; function parseEventData(config) { - function _bool(e) { - e = String(e); - if (typeof e.trim === "function") { - return e.trim().toLowerCase() == 'true'; - } else { - return e.toLowerCase() == 'true'; - } - } + function _bool(e) { + e = String(e); + if (typeof e.trim === "function") { + return e.trim().toLowerCase() === 'true'; + } else { + return e.toLowerCase() === 'true'; + } + } - if (config.debug) { - config.debug = _bool(config.debug); - } + const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent', + 'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion']; - if (config.verbose) { - config.verbose = _bool(config.verbose); - } + for (b of booleans) { + config[b] = _bool(config[b]); + } - if (config.upload_to_s3) { - config.upload_to_s3 = _bool(config.upload_to_s3); - } + if (config.sleep_range) { + // parse an array + config.sleep_range = eval(config.sleep_range); - if (config.log_ip_address) { - config.log_ip_address = _bool(config.log_ip_address); - } - - if (config.log_http_headers) { - config.log_http_headers = _bool(config.log_http_headers); - } - - if (config.random_user_agent) { - config.random_user_agent = _bool(config.random_user_agent); - } - - if (config.compress) { - config.compress = _bool(config.compress); - } - - if (config.is_local) { - config.is_local = _bool(config.is_local); - } - - if (config.max_results) { - config.max_results = parseInt(config.max_results); - } - - if (config.set_manual_settings) { - config.set_manual_settings = _bool(config.set_manual_settings); - } - - if (config.block_assets) { - config.block_assets = _bool(config.block_assets); - } - - if (config.sleep_range) { - // parse an array - config.sleep_range = eval(config.sleep_range); - - if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') { + if 
(config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
         throw "sleep_range is not a valid array of two integers.";
-        }
-    }
+        }
+    }
 
-    return config;
+    return config;
 }
\ No newline at end of file
diff --git a/src/puppeteer-cluster b/src/puppeteer-cluster
new file mode 160000
index 0000000..da9b7bc
--- /dev/null
+++ b/src/puppeteer-cluster
@@ -0,0 +1 @@
+Subproject commit da9b7bc889273e966c68c50b4ffcb45115cbb2e8