forked from extern/se-scraper
minor improvements
This commit is contained in:
parent
abf4458e46
commit
8cbf37eaba
@ -65,4 +65,9 @@ module.exports = class Pluggable {
|
||||
|
||||
return this.browser;
|
||||
}
|
||||
|
||||
async do_work(page) {
|
||||
// do some scraping work and return results and num_requests
|
||||
|
||||
}
|
||||
};
|
@ -1,8 +1,8 @@
|
||||
{
|
||||
"news": {
|
||||
"1": {
|
||||
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
|
||||
"num_results": "Ungefähr 25.270.000.000 Ergebnisse (0,49 Sekunden) ",
|
||||
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
|
||||
"num_results": "Ungefähr 13.620.000.000 Ergebnisse (0,45 Sekunden) ",
|
||||
"no_results": false,
|
||||
"effective_query": "",
|
||||
"results": [
|
||||
@ -14,13 +14,21 @@
|
||||
"date": "",
|
||||
"rank": 1
|
||||
},
|
||||
{
|
||||
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
|
||||
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"date": "",
|
||||
"rank": 2
|
||||
},
|
||||
{
|
||||
"link": "https://www.rtl.de/cms/news.html",
|
||||
"title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html",
|
||||
"snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.",
|
||||
"visible_link": "https://www.rtl.de/cms/news.html",
|
||||
"date": "",
|
||||
"rank": 2
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "https://www.zeit.de/news/index",
|
||||
@ -28,14 +36,6 @@
|
||||
"snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.",
|
||||
"visible_link": "https://www.zeit.de/news/index",
|
||||
"date": "",
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
|
||||
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||
"date": "",
|
||||
"rank": 4
|
||||
},
|
||||
{
|
||||
@ -57,43 +57,51 @@
|
||||
{
|
||||
"link": "https://www.t-online.de/nachrichten/",
|
||||
"title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/",
|
||||
"snippet": "Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.",
|
||||
"snippet": "Neuigkeiten aus der Welt des Wintersports · Der Bachelor: Alle Informationen zur aktuellen Staffel · GNTM Staffel 14: News zu Germany's Next Topmodel 2019 ...",
|
||||
"visible_link": "https://www.t-online.de/nachrichten/",
|
||||
"date": "",
|
||||
"rank": 7
|
||||
},
|
||||
{
|
||||
"link": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade",
|
||||
"title": "Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
|
||||
"snippet": "Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.",
|
||||
"visible_link": "https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
|
||||
"date": "",
|
||||
"rank": 8
|
||||
},
|
||||
{
|
||||
"link": "https://www.n-tv.de/",
|
||||
"title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/",
|
||||
"snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.",
|
||||
"visible_link": "https://www.n-tv.de/",
|
||||
"date": "",
|
||||
"rank": 8
|
||||
},
|
||||
{
|
||||
"link": "https://www.stern.de/news/",
|
||||
"title": "News - Sternhttps://www.stern.de/news/Im Cache",
|
||||
"snippet": "News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.",
|
||||
"visible_link": "https://www.stern.de/news/",
|
||||
"date": "",
|
||||
"rank": 9
|
||||
},
|
||||
{
|
||||
"link": "https://www.stern.de/panorama/weltgeschehen/news--russland-sagt-venezuela-massive-hilfslieferungen-zu-8601942.html",
|
||||
"title": "News: Russland sagt Venezuela massive Hilfslieferungen zu | STERN ...https://www.stern.de › Panorama › WeltgeschehenIm Cache",
|
||||
"snippet": "vor 1 Stunde - News des TagesPutin will Venezuela massiv unter die Arme greifen. Maserninfektionen steigen weltweit an +++ 20 Jahre Haft für Magier Jan ...",
|
||||
"visible_link": "https://www.stern.de › Panorama › Weltgeschehen",
|
||||
"date": "vor 1 Stunde - ",
|
||||
"rank": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"se-scraper": {
|
||||
"1": {
|
||||
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
|
||||
"num_results": "Ungefähr 16.400.000 Ergebnisse (0,27 Sekunden) ",
|
||||
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
|
||||
"num_results": "Ungefähr 17.500.000 Ergebnisse (0,36 Sekunden) ",
|
||||
"no_results": false,
|
||||
"effective_query": "",
|
||||
"results": [
|
||||
{
|
||||
"link": "https://www.npmjs.com/package/se-scraper",
|
||||
"title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen",
|
||||
"snippet": "07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||
"snippet": "vor 1 Tag - se-scraper will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one ...",
|
||||
"visible_link": "https://www.npmjs.com/package/se-scraper",
|
||||
"date": "07.02.2019 - ",
|
||||
"date": "vor 1 Tag - ",
|
||||
"rank": 1
|
||||
},
|
||||
{
|
||||
@ -104,21 +112,13 @@
|
||||
"date": "",
|
||||
"rank": 2
|
||||
},
|
||||
{
|
||||
"link": "https://github.com/nyancat18/Se-Scraper",
|
||||
"title": "GitHub - nyancat18/Se-Scraper: se-scraper your siteshttps://github.com/nyancat18/Se-ScraperIm CacheDiese Seite übersetzen",
|
||||
"snippet": "se-scraper your sites. Contribute to nyancat18/Se-Scraper development by creating an account on GitHub.",
|
||||
"visible_link": "https://github.com/nyancat18/Se-Scraper",
|
||||
"date": "",
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||
"title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||
"snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.",
|
||||
"visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||
"date": "",
|
||||
"rank": 4
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "https://swedishicescraper.se/",
|
||||
@ -126,7 +126,7 @@
|
||||
"snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.",
|
||||
"visible_link": "https://swedishicescraper.se/",
|
||||
"date": "",
|
||||
"rank": 5
|
||||
"rank": 4
|
||||
},
|
||||
{
|
||||
"link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/",
|
||||
@ -134,22 +134,30 @@
|
||||
"snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...",
|
||||
"visible_link": "https://www.blackhatworld.com › ... › Black Hat SEO Tools",
|
||||
"date": "10.10.2010 - ",
|
||||
"rank": 5
|
||||
},
|
||||
{
|
||||
"link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE",
|
||||
"title": "Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache",
|
||||
"snippet": "Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...",
|
||||
"visible_link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE",
|
||||
"date": "",
|
||||
"rank": 6
|
||||
},
|
||||
{
|
||||
"link": "http://network.ubotstudio.com/forum/index.php/topic/8648-sell-free-sescraper-scrape-search-engines-with-long-lists-of-queries/",
|
||||
"title": "[SELL] FREE - SEscraper - scrape search engines with long lists of ...network.ubotstudio.com › ... › Sell › Bots and ScriptsIm CacheDiese Seite übersetzen",
|
||||
"snippet": "03.12.2011 - SEscraper. Scrape results from: Google Yahoo Bing AOL Enter one or more queries as well as an optional list of keywords to append to each ...",
|
||||
"visible_link": "network.ubotstudio.com › ... › Sell › Bots and Scripts",
|
||||
"date": "03.12.2011 - ",
|
||||
"link": "https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html",
|
||||
"title": "FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen",
|
||||
"snippet": "FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...",
|
||||
"visible_link": "https://www.friatec.de/content/friatec/en/...tools/index.html",
|
||||
"date": "",
|
||||
"rank": 7
|
||||
},
|
||||
{
|
||||
"link": "https://netpeaksoftware.com/blog/netpeak-checker-3-0-serp-scraping",
|
||||
"title": "Netpeak Checker 3.0: SERP Scraping – Netpeak Software Bloghttps://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...Im CacheDiese Seite übersetzen",
|
||||
"snippet": "19.09.2018 - With a new tool under an 'SE Scraper' nickname you can get Google, Bing, Yahoo, and Yandex search results in a structured table with a lot of ...",
|
||||
"visible_link": "https://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...",
|
||||
"date": "19.09.2018 - ",
|
||||
"link": "https://books.google.de/books?id=IAjyQdFwh4UC&pg=PA1024&lpg=PA1024&dq=se-scraper&source=bl&ots=_3OxiKnP4G&sig=ACfU3U1kZgZPdNlnGGWHRWjU0gG2OVHY1g&hl=de&sa=X&ved=2ahUKEwjU5evLm-HgAhWw1uAKHTEUB1IQ6AEwDnoECAMQAQ",
|
||||
"title": "A Dictionary of Slang and Unconventional Englishhttps://books.google.de/books?isbn=1134963653Diese Seite übersetzen",
|
||||
"snippet": "1861 (OED); 1873, Rhoda Broughton, 'Happiness thinly spread over their whole lives, like bread and scrape!' Ex S.E. scrape, a thin layer.-——4. Hence, bread ...",
|
||||
"visible_link": "https://books.google.de/books?isbn=1134963653",
|
||||
"date": "",
|
||||
"rank": 8
|
||||
}
|
||||
]
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.2.2",
|
||||
"version": "1.2.6",
|
||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -13,9 +13,10 @@ module.exports = class Scraper {
|
||||
config = {},
|
||||
context = {},
|
||||
pluggable = null,
|
||||
page = null,
|
||||
} = options;
|
||||
|
||||
this.page = null;
|
||||
this.page = page;
|
||||
this.metadata = {};
|
||||
this.pluggable = pluggable;
|
||||
this.config = config;
|
||||
@ -89,13 +90,11 @@ module.exports = class Scraper {
|
||||
|
||||
if (this.config.log_http_headers === true) {
|
||||
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
||||
console.log(this.metadata.http_headers);
|
||||
}
|
||||
|
||||
if (this.config.log_ip_address === true) {
|
||||
let ipinfo = await meta.get_ip_data(this.page);
|
||||
this.metadata.ipinfo = ipinfo;
|
||||
console.log(ipinfo);
|
||||
}
|
||||
|
||||
// check that our proxy is working by confirming
|
||||
@ -292,127 +291,136 @@ module.exports = class Scraper {
|
||||
|
||||
// This is where we'll put the code to get around the tests.
|
||||
async function evadeChromeHeadlessDetection(page) {
|
||||
// Pass the Webdriver Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const newProto = navigator.__proto__;
|
||||
delete newProto.webdriver;
|
||||
navigator.__proto__ = newProto;
|
||||
});
|
||||
|
||||
// Pass the Chrome Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// We can mock this in as much depth as we need for the test.
|
||||
const mockObj = {
|
||||
app: {
|
||||
isInstalled: false,
|
||||
},
|
||||
webstore: {
|
||||
onInstallStageChanged: {},
|
||||
onDownloadProgress: {},
|
||||
},
|
||||
runtime: {
|
||||
PlatformOs: {
|
||||
MAC: 'mac',
|
||||
WIN: 'win',
|
||||
ANDROID: 'android',
|
||||
CROS: 'cros',
|
||||
LINUX: 'linux',
|
||||
OPENBSD: 'openbsd',
|
||||
},
|
||||
PlatformArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
PlatformNaclArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
RequestUpdateCheckStatus: {
|
||||
THROTTLED: 'throttled',
|
||||
NO_UPDATE: 'no_update',
|
||||
UPDATE_AVAILABLE: 'update_available',
|
||||
},
|
||||
OnInstalledReason: {
|
||||
INSTALL: 'install',
|
||||
UPDATE: 'update',
|
||||
CHROME_UPDATE: 'chrome_update',
|
||||
SHARED_MODULE_UPDATE: 'shared_module_update',
|
||||
},
|
||||
OnRestartRequiredReason: {
|
||||
APP_UPDATE: 'app_update',
|
||||
OS_UPDATE: 'os_update',
|
||||
PERIODIC: 'periodic',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
window.navigator.chrome = mockObj;
|
||||
window.chrome = mockObj;
|
||||
});
|
||||
|
||||
// Pass the Permissions Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.__proto__.query = parameters =>
|
||||
parameters.name === 'notifications'
|
||||
? Promise.resolve({state: Notification.permission})
|
||||
: originalQuery(parameters);
|
||||
|
||||
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
||||
const oldCall = Function.prototype.call;
|
||||
function call() {
|
||||
return oldCall.apply(this, arguments);
|
||||
}
|
||||
Function.prototype.call = call;
|
||||
|
||||
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
||||
const oldToString = Function.prototype.toString;
|
||||
|
||||
function functionToString() {
|
||||
if (this === window.navigator.permissions.query) {
|
||||
return "function query() { [native code] }";
|
||||
}
|
||||
if (this === functionToString) {
|
||||
return nativeToStringFunctionString;
|
||||
}
|
||||
return oldCall.call(oldToString, this);
|
||||
}
|
||||
Function.prototype.toString = functionToString;
|
||||
});
|
||||
|
||||
// Pass the Plugins Length Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
// This just needs to have `length > 0` for the current test,
|
||||
// but we could mock the plugins too if necessary.
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
try {
|
||||
// Pass the Webdriver Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const newProto = navigator.__proto__;
|
||||
delete newProto.webdriver;
|
||||
navigator.__proto__ = newProto;
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the Languages Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `languages` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
// Pass the Chrome Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// We can mock this in as much depth as we need for the test.
|
||||
const mockObj = {
|
||||
app: {
|
||||
isInstalled: false,
|
||||
},
|
||||
webstore: {
|
||||
onInstallStageChanged: {},
|
||||
onDownloadProgress: {},
|
||||
},
|
||||
runtime: {
|
||||
PlatformOs: {
|
||||
MAC: 'mac',
|
||||
WIN: 'win',
|
||||
ANDROID: 'android',
|
||||
CROS: 'cros',
|
||||
LINUX: 'linux',
|
||||
OPENBSD: 'openbsd',
|
||||
},
|
||||
PlatformArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
PlatformNaclArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
RequestUpdateCheckStatus: {
|
||||
THROTTLED: 'throttled',
|
||||
NO_UPDATE: 'no_update',
|
||||
UPDATE_AVAILABLE: 'update_available',
|
||||
},
|
||||
OnInstalledReason: {
|
||||
INSTALL: 'install',
|
||||
UPDATE: 'update',
|
||||
CHROME_UPDATE: 'chrome_update',
|
||||
SHARED_MODULE_UPDATE: 'shared_module_update',
|
||||
},
|
||||
OnRestartRequiredReason: {
|
||||
APP_UPDATE: 'app_update',
|
||||
OS_UPDATE: 'os_update',
|
||||
PERIODIC: 'periodic',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
window.navigator.chrome = mockObj;
|
||||
window.chrome = mockObj;
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the iframe Test
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
||||
get: function() {
|
||||
return window;
|
||||
// Pass the Permissions Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.__proto__.query = parameters =>
|
||||
parameters.name === 'notifications'
|
||||
? Promise.resolve({state: Notification.permission})
|
||||
: originalQuery(parameters);
|
||||
|
||||
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
||||
const oldCall = Function.prototype.call;
|
||||
|
||||
function call() {
|
||||
return oldCall.apply(this, arguments);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the toString test, though this stops console.debug() from working
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
window.console.debug = () => {
|
||||
return null;
|
||||
};
|
||||
});
|
||||
Function.prototype.call = call;
|
||||
|
||||
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
||||
const oldToString = Function.prototype.toString;
|
||||
|
||||
function functionToString() {
|
||||
if (this === window.navigator.permissions.query) {
|
||||
return "function query() { [native code] }";
|
||||
}
|
||||
if (this === functionToString) {
|
||||
return nativeToStringFunctionString;
|
||||
}
|
||||
return oldCall.call(oldToString, this);
|
||||
}
|
||||
|
||||
Function.prototype.toString = functionToString;
|
||||
});
|
||||
|
||||
// Pass the Plugins Length Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
// This just needs to have `length > 0` for the current test,
|
||||
// but we could mock the plugins too if necessary.
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the Languages Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `languages` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the iframe Test
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
||||
get: function () {
|
||||
return window;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the toString test, though this stops console.debug() from working
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
window.console.debug = () => {
|
||||
return null;
|
||||
};
|
||||
});
|
||||
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
}
|
||||
}
|
@ -1,8 +1,5 @@
|
||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
||||
const zlib = require('zlib');
|
||||
var fs = require('fs');
|
||||
|
||||
// local module imports
|
||||
const google = require('./modules/google.js');
|
||||
const bing = require('./modules/bing.js');
|
||||
const baidu = require('./modules/baidu.js');
|
||||
@ -63,7 +60,9 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
console.log(config);
|
||||
}
|
||||
|
||||
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
||||
if (config.keywords && config.search_engine) {
|
||||
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
||||
}
|
||||
|
||||
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
||||
var ADDITIONAL_CHROME_FLAGS = [
|
||||
@ -81,7 +80,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
'--disable-notifications',
|
||||
];
|
||||
|
||||
var user_agent = undefined;
|
||||
var user_agent = null;
|
||||
|
||||
if (config.user_agent) {
|
||||
user_agent = config.user_agent;
|
||||
@ -120,18 +119,23 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
if (pluggable.start_browser) {
|
||||
launch_args.config = config;
|
||||
let browser = await pluggable.start_browser(launch_args);
|
||||
const realUA = await browser.userAgent();
|
||||
if (realUA === user_agent) {
|
||||
const page = await await browser.newPage();
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
if (config.do_work && pluggable.do_work) {
|
||||
let res = await pluggable.do_work(page);
|
||||
results = res.results;
|
||||
num_requests = res.num_requests;
|
||||
} else {
|
||||
let obj = getScraper(config.search_engine, {
|
||||
config: config,
|
||||
context: context,
|
||||
pluggable: pluggable,
|
||||
page: page,
|
||||
});
|
||||
results = obj.run(page);
|
||||
results = obj.run({page: page});
|
||||
num_requests = obj.num_requests;
|
||||
} else {
|
||||
console.error('provided user agent does not match real user agent');
|
||||
metadata = obj.metadata;
|
||||
}
|
||||
|
||||
if (pluggable.close_browser) {
|
||||
@ -139,9 +143,12 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
} else {
|
||||
await browser.close();
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// if no custom start_browser functionality was given
|
||||
// use puppeteer-cluster for scraping
|
||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
||||
|
||||
var numClusters = config.puppeteer_cluster_config.maxConcurrency;
|
||||
var perBrowserOptions = [];
|
||||
@ -235,9 +242,8 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
let ms_per_request = timeDelta/num_requests;
|
||||
|
||||
if (config.verbose === true) {
|
||||
console.log(`se-scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||
console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||
console.log(`On average ms/request: ${ms_per_request}ms/request`);
|
||||
//console.dir(results, {depth: null, colors: true});
|
||||
}
|
||||
|
||||
if (config.compress === true) {
|
||||
@ -299,7 +305,7 @@ function parseEventData(config) {
|
||||
}
|
||||
|
||||
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
|
||||
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion'];
|
||||
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
|
||||
|
||||
for (b of booleans) {
|
||||
config[b] = _bool(config[b]);
|
||||
|
Loading…
Reference in New Issue
Block a user