minor improvements

This commit is contained in:
Nikolai Tschacher 2019-03-02 22:32:26 +01:00
parent abf4458e46
commit 8cbf37eaba
5 changed files with 205 additions and 178 deletions

View File

@ -65,4 +65,9 @@ module.exports = class Pluggable {
return this.browser; return this.browser;
} }
async do_work(page) {
// do some scraping work and return results and num_requests
}
}; };

View File

@ -1,8 +1,8 @@
{ {
"news": { "news": {
"1": { "1": {
"time": "Thu, 28 Feb 2019 14:24:51 GMT", "time": "Fri, 01 Mar 2019 15:04:34 GMT",
"num_results": "Ungefähr 25.270.000.000 Ergebnisse (0,49 Sekunden) ", "num_results": "Ungefähr 13.620.000.000 Ergebnisse (0,45 Sekunden) ",
"no_results": false, "no_results": false,
"effective_query": "", "effective_query": "",
"results": [ "results": [
@ -14,13 +14,21 @@
"date": "", "date": "",
"rank": 1 "rank": 1
}, },
{
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"date": "",
"rank": 2
},
{ {
"link": "https://www.rtl.de/cms/news.html", "link": "https://www.rtl.de/cms/news.html",
"title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html", "title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html",
"snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.", "snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.",
"visible_link": "https://www.rtl.de/cms/news.html", "visible_link": "https://www.rtl.de/cms/news.html",
"date": "", "date": "",
"rank": 2 "rank": 3
}, },
{ {
"link": "https://www.zeit.de/news/index", "link": "https://www.zeit.de/news/index",
@ -28,14 +36,6 @@
"snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.", "snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.",
"visible_link": "https://www.zeit.de/news/index", "visible_link": "https://www.zeit.de/news/index",
"date": "", "date": "",
"rank": 3
},
{
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"date": "",
"rank": 4 "rank": 4
}, },
{ {
@ -57,43 +57,51 @@
{ {
"link": "https://www.t-online.de/nachrichten/", "link": "https://www.t-online.de/nachrichten/",
"title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/", "title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/",
"snippet": "Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.", "snippet": "Neuigkeiten aus der Welt des Wintersports · Der Bachelor: Alle Informationen zur aktuellen Staffel · GNTM Staffel 14: News zu Germany's Next Topmodel 2019 ...",
"visible_link": "https://www.t-online.de/nachrichten/", "visible_link": "https://www.t-online.de/nachrichten/",
"date": "", "date": "",
"rank": 7 "rank": 7
}, },
{
"link": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade",
"title": "Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
"snippet": "Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.",
"visible_link": "https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
"date": "",
"rank": 8
},
{ {
"link": "https://www.n-tv.de/", "link": "https://www.n-tv.de/",
"title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/", "title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/",
"snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.", "snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.",
"visible_link": "https://www.n-tv.de/", "visible_link": "https://www.n-tv.de/",
"date": "", "date": "",
"rank": 8
},
{
"link": "https://www.stern.de/news/",
"title": "News - Sternhttps://www.stern.de/news/Im Cache",
"snippet": "News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.",
"visible_link": "https://www.stern.de/news/",
"date": "",
"rank": 9 "rank": 9
},
{
"link": "https://www.stern.de/panorama/weltgeschehen/news--russland-sagt-venezuela-massive-hilfslieferungen-zu-8601942.html",
"title": "News: Russland sagt Venezuela massive Hilfslieferungen zu | STERN ...https://www.stern.de Panorama WeltgeschehenIm Cache",
"snippet": "vor 1 Stunde - News des TagesPutin will Venezuela massiv unter die Arme greifen. Maserninfektionen steigen weltweit an +++ 20 Jahre Haft für Magier Jan ...",
"visible_link": "https://www.stern.de Panorama Weltgeschehen",
"date": "vor 1 Stunde - ",
"rank": 10
} }
] ]
} }
}, },
"se-scraper": { "se-scraper": {
"1": { "1": {
"time": "Thu, 28 Feb 2019 14:24:51 GMT", "time": "Fri, 01 Mar 2019 15:04:34 GMT",
"num_results": "Ungefähr 16.400.000 Ergebnisse (0,27 Sekunden) ", "num_results": "Ungefähr 17.500.000 Ergebnisse (0,36 Sekunden) ",
"no_results": false, "no_results": false,
"effective_query": "", "effective_query": "",
"results": [ "results": [
{ {
"link": "https://www.npmjs.com/package/se-scraper", "link": "https://www.npmjs.com/package/se-scraper",
"title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen", "title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen",
"snippet": "07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "snippet": "vor 1 Tag - se-scraper will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one ...",
"visible_link": "https://www.npmjs.com/package/se-scraper", "visible_link": "https://www.npmjs.com/package/se-scraper",
"date": "07.02.2019 - ", "date": "vor 1 Tag - ",
"rank": 1 "rank": 1
}, },
{ {
@ -104,21 +112,13 @@
"date": "", "date": "",
"rank": 2 "rank": 2
}, },
{
"link": "https://github.com/nyancat18/Se-Scraper",
"title": "GitHub - nyancat18/Se-Scraper: se-scraper your siteshttps://github.com/nyancat18/Se-ScraperIm CacheDiese Seite übersetzen",
"snippet": "se-scraper your sites. Contribute to nyancat18/Se-Scraper development by creating an account on GitHub.",
"visible_link": "https://github.com/nyancat18/Se-Scraper",
"date": "",
"rank": 3
},
{ {
"link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.", "snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.",
"visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html", "visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"date": "", "date": "",
"rank": 4 "rank": 3
}, },
{ {
"link": "https://swedishicescraper.se/", "link": "https://swedishicescraper.se/",
@ -126,7 +126,7 @@
"snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.", "snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.",
"visible_link": "https://swedishicescraper.se/", "visible_link": "https://swedishicescraper.se/",
"date": "", "date": "",
"rank": 5 "rank": 4
}, },
{ {
"link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/", "link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/",
@ -134,22 +134,30 @@
"snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...", "snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...",
"visible_link": "https://www.blackhatworld.com ... Black Hat SEO Tools", "visible_link": "https://www.blackhatworld.com ... Black Hat SEO Tools",
"date": "10.10.2010 - ", "date": "10.10.2010 - ",
"rank": 5
},
{
"link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE",
"title": "Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache",
"snippet": "Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...",
"visible_link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE",
"date": "",
"rank": 6 "rank": 6
}, },
{ {
"link": "http://network.ubotstudio.com/forum/index.php/topic/8648-sell-free-sescraper-scrape-search-engines-with-long-lists-of-queries/", "link": "https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html",
"title": "[SELL] FREE - SEscraper - scrape search engines with long lists of ...network.ubotstudio.com ... Sell Bots and ScriptsIm CacheDiese Seite übersetzen", "title": "FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen",
"snippet": "03.12.2011 - SEscraper. Scrape results from: Google Yahoo Bing AOL Enter one or more queries as well as an optional list of keywords to append to each ...", "snippet": "FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...",
"visible_link": "network.ubotstudio.com ... Sell Bots and Scripts", "visible_link": "https://www.friatec.de/content/friatec/en/...tools/index.html",
"date": "03.12.2011 - ", "date": "",
"rank": 7 "rank": 7
}, },
{ {
"link": "https://netpeaksoftware.com/blog/netpeak-checker-3-0-serp-scraping", "link": "https://books.google.de/books?id=IAjyQdFwh4UC&pg=PA1024&lpg=PA1024&dq=se-scraper&source=bl&ots=_3OxiKnP4G&sig=ACfU3U1kZgZPdNlnGGWHRWjU0gG2OVHY1g&hl=de&sa=X&ved=2ahUKEwjU5evLm-HgAhWw1uAKHTEUB1IQ6AEwDnoECAMQAQ",
"title": "Netpeak Checker 3.0: SERP Scraping Netpeak Software Bloghttps://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...Im CacheDiese Seite übersetzen", "title": "A Dictionary of Slang and Unconventional Englishhttps://books.google.de/books?isbn=1134963653Diese Seite übersetzen",
"snippet": "19.09.2018 - With a new tool under an 'SE Scraper' nickname you can get Google, Bing, Yahoo, and Yandex search results in a structured table with a lot of ...", "snippet": "1861 (OED); 1873, Rhoda Broughton, 'Happiness thinly spread over their whole lives, like bread and scrape!' Ex S.E. scrape, a thin layer.-——4. Hence, bread ...",
"visible_link": "https://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...", "visible_link": "https://books.google.de/books?isbn=1134963653",
"date": "19.09.2018 - ", "date": "",
"rank": 8 "rank": 8
} }
] ]

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.2.2", "version": "1.2.6",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",

View File

@ -13,9 +13,10 @@ module.exports = class Scraper {
config = {}, config = {},
context = {}, context = {},
pluggable = null, pluggable = null,
page = null,
} = options; } = options;
this.page = null; this.page = page;
this.metadata = {}; this.metadata = {};
this.pluggable = pluggable; this.pluggable = pluggable;
this.config = config; this.config = config;
@ -89,13 +90,11 @@ module.exports = class Scraper {
if (this.config.log_http_headers === true) { if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page); this.metadata.http_headers = await meta.get_http_headers(this.page);
console.log(this.metadata.http_headers);
} }
if (this.config.log_ip_address === true) { if (this.config.log_ip_address === true) {
let ipinfo = await meta.get_ip_data(this.page); let ipinfo = await meta.get_ip_data(this.page);
this.metadata.ipinfo = ipinfo; this.metadata.ipinfo = ipinfo;
console.log(ipinfo);
} }
// check that our proxy is working by confirming // check that our proxy is working by confirming
@ -292,127 +291,136 @@ module.exports = class Scraper {
// This is where we'll put the code to get around the tests. // This is where we'll put the code to get around the tests.
async function evadeChromeHeadlessDetection(page) { async function evadeChromeHeadlessDetection(page) {
// Pass the Webdriver Test.
await page.evaluateOnNewDocument(() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
});
// Pass the Chrome Test. try {
await page.evaluateOnNewDocument(() => { // Pass the Webdriver Test.
// We can mock this in as much depth as we need for the test. await page.evaluateOnNewDocument(() => {
const mockObj = { const newProto = navigator.__proto__;
app: { delete newProto.webdriver;
isInstalled: false, navigator.__proto__ = newProto;
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
});
// Pass the Permissions Test.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
}
Function.prototype.call = call;
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
const oldToString = Function.prototype.toString;
function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
}); });
});
// Pass the Languages Test. // Pass the Chrome Test.
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter. // We can mock this in as much depth as we need for the test.
Object.defineProperty(navigator, 'languages', { const mockObj = {
get: () => ['en-US', 'en'] app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
}); });
});
// Pass the iframe Test // Pass the Permissions Test.
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { const originalQuery = window.navigator.permissions.query;
get: function() { window.navigator.permissions.__proto__.query = parameters =>
return window; parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
} }
});
});
// Pass toString test, though it breaks console.debug() from working Function.prototype.call = call;
await page.evaluateOnNewDocument(() => {
window.console.debug = () => { const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
return null; const oldToString = Function.prototype.toString;
};
}); function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
});
});
// Pass the Languages Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
});
// Pass the iframe Test
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
get: function () {
return window;
}
});
});
// Pass the toString test, though this stops console.debug() from working
await page.evaluateOnNewDocument(() => {
window.console.debug = () => {
return null;
};
});
} catch (e) {
console.error(e);
}
} }

View File

@ -1,8 +1,5 @@
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const zlib = require('zlib'); const zlib = require('zlib');
var fs = require('fs'); var fs = require('fs');
// local module imports
const google = require('./modules/google.js'); const google = require('./modules/google.js');
const bing = require('./modules/bing.js'); const bing = require('./modules/bing.js');
const baidu = require('./modules/baidu.js'); const baidu = require('./modules/baidu.js');
@ -63,7 +60,9 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(config); console.log(config);
} }
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`); if (config.keywords && config.search_engine) {
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
}
// See here: https://peter.sh/experiments/chromium-command-line-switches/ // See here: https://peter.sh/experiments/chromium-command-line-switches/
var ADDITIONAL_CHROME_FLAGS = [ var ADDITIONAL_CHROME_FLAGS = [
@ -81,7 +80,7 @@ module.exports.handler = async function handler (event, context, callback) {
'--disable-notifications', '--disable-notifications',
]; ];
var user_agent = undefined; var user_agent = null;
if (config.user_agent) { if (config.user_agent) {
user_agent = config.user_agent; user_agent = config.user_agent;
@ -120,18 +119,23 @@ module.exports.handler = async function handler (event, context, callback) {
if (pluggable.start_browser) { if (pluggable.start_browser) {
launch_args.config = config; launch_args.config = config;
let browser = await pluggable.start_browser(launch_args); let browser = await pluggable.start_browser(launch_args);
const realUA = await browser.userAgent();
if (realUA === user_agent) { const page = await browser.newPage();
const page = await await browser.newPage();
if (config.do_work && pluggable.do_work) {
let res = await pluggable.do_work(page);
results = res.results;
num_requests = res.num_requests;
} else {
let obj = getScraper(config.search_engine, { let obj = getScraper(config.search_engine, {
config: config, config: config,
context: context, context: context,
pluggable: pluggable, pluggable: pluggable,
page: page,
}); });
results = obj.run(page); results = obj.run({page: page});
num_requests = obj.num_requests; num_requests = obj.num_requests;
} else { metadata = obj.metadata;
console.error('provided user agent does not match real user agent');
} }
if (pluggable.close_browser) { if (pluggable.close_browser) {
@ -139,9 +143,12 @@ module.exports.handler = async function handler (event, context, callback) {
} else { } else {
await browser.close(); await browser.close();
} }
} else { } else {
// if no custom start_browser functionality was given // if no custom start_browser functionality was given
// use puppeteer-cluster for scraping // use puppeteer-cluster for scraping
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
var numClusters = config.puppeteer_cluster_config.maxConcurrency; var numClusters = config.puppeteer_cluster_config.maxConcurrency;
var perBrowserOptions = []; var perBrowserOptions = [];
@ -235,9 +242,8 @@ module.exports.handler = async function handler (event, context, callback) {
let ms_per_request = timeDelta/num_requests; let ms_per_request = timeDelta/num_requests;
if (config.verbose === true) { if (config.verbose === true) {
console.log(`se-scraper took ${timeDelta}ms to perform ${num_requests} requests.`); console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
console.log(`On average ms/request: ${ms_per_request}ms/request`); console.log(`On average ms/request: ${ms_per_request}ms/request`);
//console.dir(results, {depth: null, colors: true});
} }
if (config.compress === true) { if (config.compress === true) {
@ -299,7 +305,7 @@ function parseEventData(config) {
} }
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent', const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion']; 'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
for (b of booleans) { for (b of booleans) {
config[b] = _bool(config[b]); config[b] = _bool(config[b]);