From 8cbf37eaba4a5b47c20c21d73562309c0465dc76 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Sat, 2 Mar 2019 22:32:26 +0100 Subject: [PATCH] minor improvements --- examples/pluggable.js | 5 + examples/results/data.json | 96 ++++++++------- package.json | 2 +- src/modules/se_scraper.js | 246 +++++++++++++++++++------------------ src/node_scraper.js | 34 ++--- 5 files changed, 205 insertions(+), 178 deletions(-) diff --git a/examples/pluggable.js b/examples/pluggable.js index 0f640f1..22cf5dc 100644 --- a/examples/pluggable.js +++ b/examples/pluggable.js @@ -65,4 +65,9 @@ module.exports = class Pluggable { return this.browser; } + + async do_work(page) { + // do some scraping work and return results and num_requests + + } }; \ No newline at end of file diff --git a/examples/results/data.json b/examples/results/data.json index 02393a0..49315a4 100644 --- a/examples/results/data.json +++ b/examples/results/data.json @@ -1,8 +1,8 @@ { "news": { "1": { - "time": "Thu, 28 Feb 2019 14:24:51 GMT", - "num_results": "Ungefähr 25.270.000.000 Ergebnisse (0,49 Sekunden) ", + "time": "Fri, 01 Mar 2019 15:04:34 GMT", + "num_results": "Ungefähr 13.620.000.000 Ergebnisse (0,45 Sekunden) ", "no_results": false, "effective_query": "", "results": [ @@ -14,13 +14,21 @@ "date": "", "rank": 1 }, + { + "link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html", + "title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html", + "snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.", + "visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html", + "date": "", + "rank": 2 + }, { "link": "https://www.rtl.de/cms/news.html", "title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html", "snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.", "visible_link": "https://www.rtl.de/cms/news.html", "date": "", - "rank": 2 + "rank": 3 }, { "link": "https://www.zeit.de/news/index", @@ -28,14 +36,6 @@ "snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.", "visible_link": "https://www.zeit.de/news/index", "date": "", - "rank": 3 - }, - { - "link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html", - "title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html", - "snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.", - "visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html", - "date": "", "rank": 4 }, { @@ -57,43 +57,51 @@ { "link": "https://www.t-online.de/nachrichten/", "title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/", - "snippet": "Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.", + "snippet": "Neuigkeiten aus der Welt des Wintersports · Der Bachelor: Alle Informationen zur aktuellen Staffel · GNTM Staffel 14: News zu Germany's Next Topmodel 2019 ...", "visible_link": "https://www.t-online.de/nachrichten/", "date": "", "rank": 7 }, - { - "link": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade", - "title": "Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...", - "snippet": "Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.", - "visible_link": "https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...", - "date": "", - "rank": 8 - }, { "link": "https://www.n-tv.de/", "title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/", "snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.", "visible_link": "https://www.n-tv.de/", "date": "", + "rank": 8 + }, + { + "link": "https://www.stern.de/news/", + "title": "News - Sternhttps://www.stern.de/news/Im Cache", + "snippet": "News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.", + "visible_link": "https://www.stern.de/news/", + "date": "", "rank": 9 + }, + { + "link": "https://www.stern.de/panorama/weltgeschehen/news--russland-sagt-venezuela-massive-hilfslieferungen-zu-8601942.html", + "title": "News: Russland sagt Venezuela massive Hilfslieferungen zu | STERN ...https://www.stern.de › Panorama › WeltgeschehenIm Cache", + "snippet": "vor 1 Stunde - News des TagesPutin will Venezuela massiv unter die Arme greifen. Maserninfektionen steigen weltweit an +++ 20 Jahre Haft für Magier Jan ...", + "visible_link": "https://www.stern.de › Panorama › Weltgeschehen", + "date": "vor 1 Stunde - ", + "rank": 10 } ] } }, "se-scraper": { "1": { - "time": "Thu, 28 Feb 2019 14:24:51 GMT", - "num_results": "Ungefähr 16.400.000 Ergebnisse (0,27 Sekunden) ", + "time": "Fri, 01 Mar 2019 15:04:34 GMT", + "num_results": "Ungefähr 17.500.000 Ergebnisse (0,36 Sekunden) ", "no_results": false, "effective_query": "", "results": [ { "link": "https://www.npmjs.com/package/se-scraper", "title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen", - "snippet": "07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", + "snippet": "vor 1 Tag - se-scraper will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one ...", "visible_link": "https://www.npmjs.com/package/se-scraper", - "date": "07.02.2019 - ", + "date": "vor 1 Tag - ", "rank": 1 }, { @@ -104,21 +112,13 @@ "date": "", "rank": 2 }, - { - "link": "https://github.com/nyancat18/Se-Scraper", - "title": "GitHub - nyancat18/Se-Scraper: se-scraper your siteshttps://github.com/nyancat18/Se-ScraperIm CacheDiese Seite übersetzen", - "snippet": "se-scraper your sites. Contribute to nyancat18/Se-Scraper development by creating an account on GitHub.", - "visible_link": "https://github.com/nyancat18/Se-Scraper", - "date": "", - "rank": 3 - }, { "link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.", "visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "date": "", - "rank": 4 + "rank": 3 }, { "link": "https://swedishicescraper.se/", @@ -126,7 +126,7 @@ "snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.", "visible_link": "https://swedishicescraper.se/", "date": "", - "rank": 5 + "rank": 4 }, { "link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/", @@ -134,22 +134,30 @@ "snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...", "visible_link": "https://www.blackhatworld.com › ... › Black Hat SEO Tools", "date": "10.10.2010 - ", + "rank": 5 + }, + { + "link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE", + "title": "Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache", + "snippet": "Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...", + "visible_link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE", + "date": "", "rank": 6 }, { - "link": "http://network.ubotstudio.com/forum/index.php/topic/8648-sell-free-sescraper-scrape-search-engines-with-long-lists-of-queries/", - "title": "[SELL] FREE - SEscraper - scrape search engines with long lists of ...network.ubotstudio.com › ... › Sell › Bots and ScriptsIm CacheDiese Seite übersetzen", - "snippet": "03.12.2011 - SEscraper. Scrape results from: Google Yahoo Bing AOL Enter one or more queries as well as an optional list of keywords to append to each ...", - "visible_link": "network.ubotstudio.com › ... › Sell › Bots and Scripts", - "date": "03.12.2011 - ", + "link": "https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html", + "title": "FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen", + "snippet": "FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...", + "visible_link": "https://www.friatec.de/content/friatec/en/...tools/index.html", + "date": "", "rank": 7 }, { - "link": "https://netpeaksoftware.com/blog/netpeak-checker-3-0-serp-scraping", - "title": "Netpeak Checker 3.0: SERP Scraping – Netpeak Software Bloghttps://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...Im CacheDiese Seite übersetzen", - "snippet": "19.09.2018 - With a new tool under an 'SE Scraper' nickname you can get Google, Bing, Yahoo, and Yandex search results in a structured table with a lot of ...", - "visible_link": "https://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...", - "date": "19.09.2018 - ", + "link": "https://books.google.de/books?id=IAjyQdFwh4UC&pg=PA1024&lpg=PA1024&dq=se-scraper&source=bl&ots=_3OxiKnP4G&sig=ACfU3U1kZgZPdNlnGGWHRWjU0gG2OVHY1g&hl=de&sa=X&ved=2ahUKEwjU5evLm-HgAhWw1uAKHTEUB1IQ6AEwDnoECAMQAQ", + "title": "A Dictionary of Slang and Unconventional Englishhttps://books.google.de/books?isbn=1134963653Diese Seite übersetzen", + "snippet": "1861 (OED); 1873, Rhoda Broughton, 'Happiness thinly spread over their whole lives, like bread and scrape!' Ex S.E. scrape, a thin layer.-——4. Hence, bread ...", + "visible_link": "https://books.google.de/books?isbn=1134963653", + "date": "", "rank": 8 } ] diff --git a/package.json b/package.json index 86b80ae..7aa82a9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.2.2", + "version": "1.2.6", "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 3f49b1f..4efb87e 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -13,9 +13,10 @@ module.exports = class Scraper { config = {}, context = {}, pluggable = null, + page = null, } = options; - this.page = null; + this.page = page; this.metadata = {}; this.pluggable = pluggable; this.config = config; @@ -89,13 +90,11 @@ module.exports = class Scraper { if (this.config.log_http_headers === true) { this.metadata.http_headers = await meta.get_http_headers(this.page); - console.log(this.metadata.http_headers); } if (this.config.log_ip_address === true) { let ipinfo = await meta.get_ip_data(this.page); this.metadata.ipinfo = ipinfo; - console.log(ipinfo); } // check that our proxy is working by confirming @@ -292,127 +291,136 @@ module.exports = class Scraper { // This is where we'll put the code to get around the tests. async function evadeChromeHeadlessDetection(page) { - // Pass the Webdriver Test. - await page.evaluateOnNewDocument(() => { - const newProto = navigator.__proto__; - delete newProto.webdriver; - navigator.__proto__ = newProto; - }); - // Pass the Chrome Test. - await page.evaluateOnNewDocument(() => { - // We can mock this in as much depth as we need for the test. - const mockObj = { - app: { - isInstalled: false, - }, - webstore: { - onInstallStageChanged: {}, - onDownloadProgress: {}, - }, - runtime: { - PlatformOs: { - MAC: 'mac', - WIN: 'win', - ANDROID: 'android', - CROS: 'cros', - LINUX: 'linux', - OPENBSD: 'openbsd', - }, - PlatformArch: { - ARM: 'arm', - X86_32: 'x86-32', - X86_64: 'x86-64', - }, - PlatformNaclArch: { - ARM: 'arm', - X86_32: 'x86-32', - X86_64: 'x86-64', - }, - RequestUpdateCheckStatus: { - THROTTLED: 'throttled', - NO_UPDATE: 'no_update', - UPDATE_AVAILABLE: 'update_available', - }, - OnInstalledReason: { - INSTALL: 'install', - UPDATE: 'update', - CHROME_UPDATE: 'chrome_update', - SHARED_MODULE_UPDATE: 'shared_module_update', - }, - OnRestartRequiredReason: { - APP_UPDATE: 'app_update', - OS_UPDATE: 'os_update', - PERIODIC: 'periodic', - }, - }, - }; - - window.navigator.chrome = mockObj; - window.chrome = mockObj; - }); - - // Pass the Permissions Test. - await page.evaluateOnNewDocument(() => { - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.__proto__.query = parameters => - parameters.name === 'notifications' - ? Promise.resolve({state: Notification.permission}) - : originalQuery(parameters); - - // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js - const oldCall = Function.prototype.call; - function call() { - return oldCall.apply(this, arguments); - } - Function.prototype.call = call; - - const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); - const oldToString = Function.prototype.toString; - - function functionToString() { - if (this === window.navigator.permissions.query) { - return "function query() { [native code] }"; - } - if (this === functionToString) { - return nativeToStringFunctionString; - } - return oldCall.call(oldToString, this); - } - Function.prototype.toString = functionToString; - }); - - // Pass the Plugins Length Test. - await page.evaluateOnNewDocument(() => { - // Overwrite the `plugins` property to use a custom getter. - Object.defineProperty(navigator, 'plugins', { - // This just needs to have `length > 0` for the current test, - // but we could mock the plugins too if necessary. - get: () => [1, 2, 3, 4, 5] + try { + // Pass the Webdriver Test. + await page.evaluateOnNewDocument(() => { + const newProto = navigator.__proto__; + delete newProto.webdriver; + navigator.__proto__ = newProto; }); - }); - // Pass the Languages Test. - await page.evaluateOnNewDocument(() => { - // Overwrite the `plugins` property to use a custom getter. - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'] + // Pass the Chrome Test. + await page.evaluateOnNewDocument(() => { + // We can mock this in as much depth as we need for the test. + const mockObj = { + app: { + isInstalled: false, + }, + webstore: { + onInstallStageChanged: {}, + onDownloadProgress: {}, + }, + runtime: { + PlatformOs: { + MAC: 'mac', + WIN: 'win', + ANDROID: 'android', + CROS: 'cros', + LINUX: 'linux', + OPENBSD: 'openbsd', + }, + PlatformArch: { + ARM: 'arm', + X86_32: 'x86-32', + X86_64: 'x86-64', + }, + PlatformNaclArch: { + ARM: 'arm', + X86_32: 'x86-32', + X86_64: 'x86-64', + }, + RequestUpdateCheckStatus: { + THROTTLED: 'throttled', + NO_UPDATE: 'no_update', + UPDATE_AVAILABLE: 'update_available', + }, + OnInstalledReason: { + INSTALL: 'install', + UPDATE: 'update', + CHROME_UPDATE: 'chrome_update', + SHARED_MODULE_UPDATE: 'shared_module_update', + }, + OnRestartRequiredReason: { + APP_UPDATE: 'app_update', + OS_UPDATE: 'os_update', + PERIODIC: 'periodic', + }, + }, + }; + + window.navigator.chrome = mockObj; + window.chrome = mockObj; }); - }); - // Pass the iframe Test - await page.evaluateOnNewDocument(() => { - Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { - get: function() { - return window; + // Pass the Permissions Test. + await page.evaluateOnNewDocument(() => { + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.__proto__.query = parameters => + parameters.name === 'notifications' + ? Promise.resolve({state: Notification.permission}) + : originalQuery(parameters); + + // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js + const oldCall = Function.prototype.call; + + function call() { + return oldCall.apply(this, arguments); } - }); - }); - // Pass toString test, though it breaks console.debug() from working - await page.evaluateOnNewDocument(() => { - window.console.debug = () => { - return null; - }; - }); + Function.prototype.call = call; + + const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); + const oldToString = Function.prototype.toString; + + function functionToString() { + if (this === window.navigator.permissions.query) { + return "function query() { [native code] }"; + } + if (this === functionToString) { + return nativeToStringFunctionString; + } + return oldCall.call(oldToString, this); + } + + Function.prototype.toString = functionToString; + }); + + // Pass the Plugins Length Test. + await page.evaluateOnNewDocument(() => { + // Overwrite the `plugins` property to use a custom getter. + Object.defineProperty(navigator, 'plugins', { + // This just needs to have `length > 0` for the current test, + // but we could mock the plugins too if necessary. + get: () => [1, 2, 3, 4, 5] + }); + }); + + // Pass the Languages Test. + await page.evaluateOnNewDocument(() => { + // Overwrite the `plugins` property to use a custom getter. + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + }); + + // Pass the iframe Test + await page.evaluateOnNewDocument(() => { + Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { + get: function () { + return window; + } + }); + }); + + // Pass toString test, though it breaks console.debug() from working + await page.evaluateOnNewDocument(() => { + window.console.debug = () => { + return null; + }; + }); + + } catch (e) { + console.error(e); + } } \ No newline at end of file diff --git a/src/node_scraper.js b/src/node_scraper.js index fd5f126..1f12780 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -1,8 +1,5 @@ -const { Cluster } = require('./puppeteer-cluster/dist/index.js'); const zlib = require('zlib'); var fs = require('fs'); - -// local module imports const google = require('./modules/google.js'); const bing = require('./modules/bing.js'); const baidu = require('./modules/baidu.js'); @@ -63,7 +60,9 @@ module.exports.handler = async function handler (event, context, callback) { console.log(config); } - console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`); + if (config.keywords && config.search_engine) { + console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`); + } // See here: https://peter.sh/experiments/chromium-command-line-switches/ var ADDITIONAL_CHROME_FLAGS = [ @@ -81,7 +80,7 @@ module.exports.handler = async function handler (event, context, callback) { '--disable-notifications', ]; - var user_agent = undefined; + var user_agent = null; if (config.user_agent) { user_agent = config.user_agent; @@ -120,18 +119,23 @@ module.exports.handler = async function handler (event, context, callback) { if (pluggable.start_browser) { launch_args.config = config; let browser = await pluggable.start_browser(launch_args); - const realUA = await browser.userAgent(); - if (realUA === user_agent) { - const page = await await browser.newPage(); + + const page = await browser.newPage(); + + if (config.do_work && pluggable.do_work) { + let res = await pluggable.do_work(page); + results = res.results; + num_requests = res.num_requests; + } else { let obj = getScraper(config.search_engine, { config: config, context: context, pluggable: pluggable, + page: page, }); - results = obj.run(page); + results = obj.run({page: page}); num_requests = obj.num_requests; - } else { - console.error('provided user agent does not match real user agent'); + metadata = obj.metadata; } if (pluggable.close_browser) { @@ -139,9 +143,12 @@ module.exports.handler = async function handler (event, context, callback) { } else { await browser.close(); } + } else { + // if no custom start_browser functionality was given // use puppeteer-cluster for scraping + const { Cluster } = require('./puppeteer-cluster/dist/index.js'); var numClusters = config.puppeteer_cluster_config.maxConcurrency; var perBrowserOptions = []; @@ -235,9 +242,8 @@ module.exports.handler = async function handler (event, context, callback) { let ms_per_request = timeDelta/num_requests; if (config.verbose === true) { - console.log(`se-scraper took ${timeDelta}ms to perform ${num_requests} requests.`); + console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); console.log(`On average ms/request: ${ms_per_request}ms/request`); - //console.dir(results, {depth: null, colors: true}); } if (config.compress === true) { @@ -299,7 +305,7 @@ function parseEventData(config) { } const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent', - 'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion']; + 'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work']; for (b of booleans) { config[b] = _bool(config[b]);