forked from extern/se-scraper
minor improvements
This commit is contained in:
parent
abf4458e46
commit
8cbf37eaba
@ -65,4 +65,9 @@ module.exports = class Pluggable {
|
|||||||
|
|
||||||
return this.browser;
|
return this.browser;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async do_work(page) {
|
||||||
|
// do some scraping work and return results and num_requests
|
||||||
|
|
||||||
|
}
|
||||||
};
|
};
|
@ -1,8 +1,8 @@
|
|||||||
{
|
{
|
||||||
"news": {
|
"news": {
|
||||||
"1": {
|
"1": {
|
||||||
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
|
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
|
||||||
"num_results": "Ungefähr 25.270.000.000 Ergebnisse (0,49 Sekunden) ",
|
"num_results": "Ungefähr 13.620.000.000 Ergebnisse (0,45 Sekunden) ",
|
||||||
"no_results": false,
|
"no_results": false,
|
||||||
"effective_query": "",
|
"effective_query": "",
|
||||||
"results": [
|
"results": [
|
||||||
@ -14,13 +14,21 @@
|
|||||||
"date": "",
|
"date": "",
|
||||||
"rank": 1
|
"rank": 1
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||||
|
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||||
|
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
|
||||||
|
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
||||||
|
"date": "",
|
||||||
|
"rank": 2
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"link": "https://www.rtl.de/cms/news.html",
|
"link": "https://www.rtl.de/cms/news.html",
|
||||||
"title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html",
|
"title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html",
|
||||||
"snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.",
|
"snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.",
|
||||||
"visible_link": "https://www.rtl.de/cms/news.html",
|
"visible_link": "https://www.rtl.de/cms/news.html",
|
||||||
"date": "",
|
"date": "",
|
||||||
"rank": 2
|
"rank": 3
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"link": "https://www.zeit.de/news/index",
|
"link": "https://www.zeit.de/news/index",
|
||||||
@ -28,14 +36,6 @@
|
|||||||
"snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.",
|
"snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.",
|
||||||
"visible_link": "https://www.zeit.de/news/index",
|
"visible_link": "https://www.zeit.de/news/index",
|
||||||
"date": "",
|
"date": "",
|
||||||
"rank": 3
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
|
||||||
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
|
||||||
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
|
|
||||||
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
|
|
||||||
"date": "",
|
|
||||||
"rank": 4
|
"rank": 4
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -57,43 +57,51 @@
|
|||||||
{
|
{
|
||||||
"link": "https://www.t-online.de/nachrichten/",
|
"link": "https://www.t-online.de/nachrichten/",
|
||||||
"title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/",
|
"title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/",
|
||||||
"snippet": "Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.",
|
"snippet": "Neuigkeiten aus der Welt des Wintersports · Der Bachelor: Alle Informationen zur aktuellen Staffel · GNTM Staffel 14: News zu Germany's Next Topmodel 2019 ...",
|
||||||
"visible_link": "https://www.t-online.de/nachrichten/",
|
"visible_link": "https://www.t-online.de/nachrichten/",
|
||||||
"date": "",
|
"date": "",
|
||||||
"rank": 7
|
"rank": 7
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"link": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade",
|
|
||||||
"title": "Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
|
|
||||||
"snippet": "Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.",
|
|
||||||
"visible_link": "https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
|
|
||||||
"date": "",
|
|
||||||
"rank": 8
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"link": "https://www.n-tv.de/",
|
"link": "https://www.n-tv.de/",
|
||||||
"title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/",
|
"title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/",
|
||||||
"snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.",
|
"snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.",
|
||||||
"visible_link": "https://www.n-tv.de/",
|
"visible_link": "https://www.n-tv.de/",
|
||||||
"date": "",
|
"date": "",
|
||||||
|
"rank": 8
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"link": "https://www.stern.de/news/",
|
||||||
|
"title": "News - Sternhttps://www.stern.de/news/Im Cache",
|
||||||
|
"snippet": "News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.",
|
||||||
|
"visible_link": "https://www.stern.de/news/",
|
||||||
|
"date": "",
|
||||||
"rank": 9
|
"rank": 9
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"link": "https://www.stern.de/panorama/weltgeschehen/news--russland-sagt-venezuela-massive-hilfslieferungen-zu-8601942.html",
|
||||||
|
"title": "News: Russland sagt Venezuela massive Hilfslieferungen zu | STERN ...https://www.stern.de › Panorama › WeltgeschehenIm Cache",
|
||||||
|
"snippet": "vor 1 Stunde - News des TagesPutin will Venezuela massiv unter die Arme greifen. Maserninfektionen steigen weltweit an +++ 20 Jahre Haft für Magier Jan ...",
|
||||||
|
"visible_link": "https://www.stern.de › Panorama › Weltgeschehen",
|
||||||
|
"date": "vor 1 Stunde - ",
|
||||||
|
"rank": 10
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"se-scraper": {
|
"se-scraper": {
|
||||||
"1": {
|
"1": {
|
||||||
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
|
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
|
||||||
"num_results": "Ungefähr 16.400.000 Ergebnisse (0,27 Sekunden) ",
|
"num_results": "Ungefähr 17.500.000 Ergebnisse (0,36 Sekunden) ",
|
||||||
"no_results": false,
|
"no_results": false,
|
||||||
"effective_query": "",
|
"effective_query": "",
|
||||||
"results": [
|
"results": [
|
||||||
{
|
{
|
||||||
"link": "https://www.npmjs.com/package/se-scraper",
|
"link": "https://www.npmjs.com/package/se-scraper",
|
||||||
"title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen",
|
"title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen",
|
||||||
"snippet": "07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
"snippet": "vor 1 Tag - se-scraper will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one ...",
|
||||||
"visible_link": "https://www.npmjs.com/package/se-scraper",
|
"visible_link": "https://www.npmjs.com/package/se-scraper",
|
||||||
"date": "07.02.2019 - ",
|
"date": "vor 1 Tag - ",
|
||||||
"rank": 1
|
"rank": 1
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -104,21 +112,13 @@
|
|||||||
"date": "",
|
"date": "",
|
||||||
"rank": 2
|
"rank": 2
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"link": "https://github.com/nyancat18/Se-Scraper",
|
|
||||||
"title": "GitHub - nyancat18/Se-Scraper: se-scraper your siteshttps://github.com/nyancat18/Se-ScraperIm CacheDiese Seite übersetzen",
|
|
||||||
"snippet": "se-scraper your sites. Contribute to nyancat18/Se-Scraper development by creating an account on GitHub.",
|
|
||||||
"visible_link": "https://github.com/nyancat18/Se-Scraper",
|
|
||||||
"date": "",
|
|
||||||
"rank": 3
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
"link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||||
"title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
"title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||||
"snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.",
|
"snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.",
|
||||||
"visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
"visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
|
||||||
"date": "",
|
"date": "",
|
||||||
"rank": 4
|
"rank": 3
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"link": "https://swedishicescraper.se/",
|
"link": "https://swedishicescraper.se/",
|
||||||
@ -126,7 +126,7 @@
|
|||||||
"snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.",
|
"snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.",
|
||||||
"visible_link": "https://swedishicescraper.se/",
|
"visible_link": "https://swedishicescraper.se/",
|
||||||
"date": "",
|
"date": "",
|
||||||
"rank": 5
|
"rank": 4
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/",
|
"link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/",
|
||||||
@ -134,22 +134,30 @@
|
|||||||
"snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...",
|
"snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...",
|
||||||
"visible_link": "https://www.blackhatworld.com › ... › Black Hat SEO Tools",
|
"visible_link": "https://www.blackhatworld.com › ... › Black Hat SEO Tools",
|
||||||
"date": "10.10.2010 - ",
|
"date": "10.10.2010 - ",
|
||||||
|
"rank": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE",
|
||||||
|
"title": "Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache",
|
||||||
|
"snippet": "Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...",
|
||||||
|
"visible_link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE",
|
||||||
|
"date": "",
|
||||||
"rank": 6
|
"rank": 6
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"link": "http://network.ubotstudio.com/forum/index.php/topic/8648-sell-free-sescraper-scrape-search-engines-with-long-lists-of-queries/",
|
"link": "https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html",
|
||||||
"title": "[SELL] FREE - SEscraper - scrape search engines with long lists of ...network.ubotstudio.com › ... › Sell › Bots and ScriptsIm CacheDiese Seite übersetzen",
|
"title": "FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen",
|
||||||
"snippet": "03.12.2011 - SEscraper. Scrape results from: Google Yahoo Bing AOL Enter one or more queries as well as an optional list of keywords to append to each ...",
|
"snippet": "FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...",
|
||||||
"visible_link": "network.ubotstudio.com › ... › Sell › Bots and Scripts",
|
"visible_link": "https://www.friatec.de/content/friatec/en/...tools/index.html",
|
||||||
"date": "03.12.2011 - ",
|
"date": "",
|
||||||
"rank": 7
|
"rank": 7
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"link": "https://netpeaksoftware.com/blog/netpeak-checker-3-0-serp-scraping",
|
"link": "https://books.google.de/books?id=IAjyQdFwh4UC&pg=PA1024&lpg=PA1024&dq=se-scraper&source=bl&ots=_3OxiKnP4G&sig=ACfU3U1kZgZPdNlnGGWHRWjU0gG2OVHY1g&hl=de&sa=X&ved=2ahUKEwjU5evLm-HgAhWw1uAKHTEUB1IQ6AEwDnoECAMQAQ",
|
||||||
"title": "Netpeak Checker 3.0: SERP Scraping – Netpeak Software Bloghttps://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...Im CacheDiese Seite übersetzen",
|
"title": "A Dictionary of Slang and Unconventional Englishhttps://books.google.de/books?isbn=1134963653Diese Seite übersetzen",
|
||||||
"snippet": "19.09.2018 - With a new tool under an 'SE Scraper' nickname you can get Google, Bing, Yahoo, and Yandex search results in a structured table with a lot of ...",
|
"snippet": "1861 (OED); 1873, Rhoda Broughton, 'Happiness thinly spread over their whole lives, like bread and scrape!' Ex S.E. scrape, a thin layer.-——4. Hence, bread ...",
|
||||||
"visible_link": "https://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...",
|
"visible_link": "https://books.google.de/books?isbn=1134963653",
|
||||||
"date": "19.09.2018 - ",
|
"date": "",
|
||||||
"rank": 8
|
"rank": 8
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.2.2",
|
"version": "1.2.6",
|
||||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -13,9 +13,10 @@ module.exports = class Scraper {
|
|||||||
config = {},
|
config = {},
|
||||||
context = {},
|
context = {},
|
||||||
pluggable = null,
|
pluggable = null,
|
||||||
|
page = null,
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
this.page = null;
|
this.page = page;
|
||||||
this.metadata = {};
|
this.metadata = {};
|
||||||
this.pluggable = pluggable;
|
this.pluggable = pluggable;
|
||||||
this.config = config;
|
this.config = config;
|
||||||
@ -89,13 +90,11 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
if (this.config.log_http_headers === true) {
|
if (this.config.log_http_headers === true) {
|
||||||
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
||||||
console.log(this.metadata.http_headers);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.log_ip_address === true) {
|
if (this.config.log_ip_address === true) {
|
||||||
let ipinfo = await meta.get_ip_data(this.page);
|
let ipinfo = await meta.get_ip_data(this.page);
|
||||||
this.metadata.ipinfo = ipinfo;
|
this.metadata.ipinfo = ipinfo;
|
||||||
console.log(ipinfo);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// check that our proxy is working by confirming
|
// check that our proxy is working by confirming
|
||||||
@ -292,127 +291,136 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
// This is where we'll put the code to get around the tests.
|
// This is where we'll put the code to get around the tests.
|
||||||
async function evadeChromeHeadlessDetection(page) {
|
async function evadeChromeHeadlessDetection(page) {
|
||||||
// Pass the Webdriver Test.
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
|
||||||
const newProto = navigator.__proto__;
|
|
||||||
delete newProto.webdriver;
|
|
||||||
navigator.__proto__ = newProto;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Pass the Chrome Test.
|
try {
|
||||||
await page.evaluateOnNewDocument(() => {
|
// Pass the Webdriver Test.
|
||||||
// We can mock this in as much depth as we need for the test.
|
await page.evaluateOnNewDocument(() => {
|
||||||
const mockObj = {
|
const newProto = navigator.__proto__;
|
||||||
app: {
|
delete newProto.webdriver;
|
||||||
isInstalled: false,
|
navigator.__proto__ = newProto;
|
||||||
},
|
|
||||||
webstore: {
|
|
||||||
onInstallStageChanged: {},
|
|
||||||
onDownloadProgress: {},
|
|
||||||
},
|
|
||||||
runtime: {
|
|
||||||
PlatformOs: {
|
|
||||||
MAC: 'mac',
|
|
||||||
WIN: 'win',
|
|
||||||
ANDROID: 'android',
|
|
||||||
CROS: 'cros',
|
|
||||||
LINUX: 'linux',
|
|
||||||
OPENBSD: 'openbsd',
|
|
||||||
},
|
|
||||||
PlatformArch: {
|
|
||||||
ARM: 'arm',
|
|
||||||
X86_32: 'x86-32',
|
|
||||||
X86_64: 'x86-64',
|
|
||||||
},
|
|
||||||
PlatformNaclArch: {
|
|
||||||
ARM: 'arm',
|
|
||||||
X86_32: 'x86-32',
|
|
||||||
X86_64: 'x86-64',
|
|
||||||
},
|
|
||||||
RequestUpdateCheckStatus: {
|
|
||||||
THROTTLED: 'throttled',
|
|
||||||
NO_UPDATE: 'no_update',
|
|
||||||
UPDATE_AVAILABLE: 'update_available',
|
|
||||||
},
|
|
||||||
OnInstalledReason: {
|
|
||||||
INSTALL: 'install',
|
|
||||||
UPDATE: 'update',
|
|
||||||
CHROME_UPDATE: 'chrome_update',
|
|
||||||
SHARED_MODULE_UPDATE: 'shared_module_update',
|
|
||||||
},
|
|
||||||
OnRestartRequiredReason: {
|
|
||||||
APP_UPDATE: 'app_update',
|
|
||||||
OS_UPDATE: 'os_update',
|
|
||||||
PERIODIC: 'periodic',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
window.navigator.chrome = mockObj;
|
|
||||||
window.chrome = mockObj;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Pass the Permissions Test.
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
|
||||||
const originalQuery = window.navigator.permissions.query;
|
|
||||||
window.navigator.permissions.__proto__.query = parameters =>
|
|
||||||
parameters.name === 'notifications'
|
|
||||||
? Promise.resolve({state: Notification.permission})
|
|
||||||
: originalQuery(parameters);
|
|
||||||
|
|
||||||
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
|
||||||
const oldCall = Function.prototype.call;
|
|
||||||
function call() {
|
|
||||||
return oldCall.apply(this, arguments);
|
|
||||||
}
|
|
||||||
Function.prototype.call = call;
|
|
||||||
|
|
||||||
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
|
||||||
const oldToString = Function.prototype.toString;
|
|
||||||
|
|
||||||
function functionToString() {
|
|
||||||
if (this === window.navigator.permissions.query) {
|
|
||||||
return "function query() { [native code] }";
|
|
||||||
}
|
|
||||||
if (this === functionToString) {
|
|
||||||
return nativeToStringFunctionString;
|
|
||||||
}
|
|
||||||
return oldCall.call(oldToString, this);
|
|
||||||
}
|
|
||||||
Function.prototype.toString = functionToString;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Pass the Plugins Length Test.
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
|
||||||
// Overwrite the `plugins` property to use a custom getter.
|
|
||||||
Object.defineProperty(navigator, 'plugins', {
|
|
||||||
// This just needs to have `length > 0` for the current test,
|
|
||||||
// but we could mock the plugins too if necessary.
|
|
||||||
get: () => [1, 2, 3, 4, 5]
|
|
||||||
});
|
});
|
||||||
});
|
|
||||||
|
|
||||||
// Pass the Languages Test.
|
// Pass the Chrome Test.
|
||||||
await page.evaluateOnNewDocument(() => {
|
await page.evaluateOnNewDocument(() => {
|
||||||
// Overwrite the `plugins` property to use a custom getter.
|
// We can mock this in as much depth as we need for the test.
|
||||||
Object.defineProperty(navigator, 'languages', {
|
const mockObj = {
|
||||||
get: () => ['en-US', 'en']
|
app: {
|
||||||
|
isInstalled: false,
|
||||||
|
},
|
||||||
|
webstore: {
|
||||||
|
onInstallStageChanged: {},
|
||||||
|
onDownloadProgress: {},
|
||||||
|
},
|
||||||
|
runtime: {
|
||||||
|
PlatformOs: {
|
||||||
|
MAC: 'mac',
|
||||||
|
WIN: 'win',
|
||||||
|
ANDROID: 'android',
|
||||||
|
CROS: 'cros',
|
||||||
|
LINUX: 'linux',
|
||||||
|
OPENBSD: 'openbsd',
|
||||||
|
},
|
||||||
|
PlatformArch: {
|
||||||
|
ARM: 'arm',
|
||||||
|
X86_32: 'x86-32',
|
||||||
|
X86_64: 'x86-64',
|
||||||
|
},
|
||||||
|
PlatformNaclArch: {
|
||||||
|
ARM: 'arm',
|
||||||
|
X86_32: 'x86-32',
|
||||||
|
X86_64: 'x86-64',
|
||||||
|
},
|
||||||
|
RequestUpdateCheckStatus: {
|
||||||
|
THROTTLED: 'throttled',
|
||||||
|
NO_UPDATE: 'no_update',
|
||||||
|
UPDATE_AVAILABLE: 'update_available',
|
||||||
|
},
|
||||||
|
OnInstalledReason: {
|
||||||
|
INSTALL: 'install',
|
||||||
|
UPDATE: 'update',
|
||||||
|
CHROME_UPDATE: 'chrome_update',
|
||||||
|
SHARED_MODULE_UPDATE: 'shared_module_update',
|
||||||
|
},
|
||||||
|
OnRestartRequiredReason: {
|
||||||
|
APP_UPDATE: 'app_update',
|
||||||
|
OS_UPDATE: 'os_update',
|
||||||
|
PERIODIC: 'periodic',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
window.navigator.chrome = mockObj;
|
||||||
|
window.chrome = mockObj;
|
||||||
});
|
});
|
||||||
});
|
|
||||||
|
|
||||||
// Pass the iframe Test
|
// Pass the Permissions Test.
|
||||||
await page.evaluateOnNewDocument(() => {
|
await page.evaluateOnNewDocument(() => {
|
||||||
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
const originalQuery = window.navigator.permissions.query;
|
||||||
get: function() {
|
window.navigator.permissions.__proto__.query = parameters =>
|
||||||
return window;
|
parameters.name === 'notifications'
|
||||||
|
? Promise.resolve({state: Notification.permission})
|
||||||
|
: originalQuery(parameters);
|
||||||
|
|
||||||
|
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
||||||
|
const oldCall = Function.prototype.call;
|
||||||
|
|
||||||
|
function call() {
|
||||||
|
return oldCall.apply(this, arguments);
|
||||||
}
|
}
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// Pass toString test, though it breaks console.debug() from working
|
Function.prototype.call = call;
|
||||||
await page.evaluateOnNewDocument(() => {
|
|
||||||
window.console.debug = () => {
|
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
||||||
return null;
|
const oldToString = Function.prototype.toString;
|
||||||
};
|
|
||||||
});
|
function functionToString() {
|
||||||
|
if (this === window.navigator.permissions.query) {
|
||||||
|
return "function query() { [native code] }";
|
||||||
|
}
|
||||||
|
if (this === functionToString) {
|
||||||
|
return nativeToStringFunctionString;
|
||||||
|
}
|
||||||
|
return oldCall.call(oldToString, this);
|
||||||
|
}
|
||||||
|
|
||||||
|
Function.prototype.toString = functionToString;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Pass the Plugins Length Test.
|
||||||
|
await page.evaluateOnNewDocument(() => {
|
||||||
|
// Overwrite the `plugins` property to use a custom getter.
|
||||||
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
|
// This just needs to have `length > 0` for the current test,
|
||||||
|
// but we could mock the plugins too if necessary.
|
||||||
|
get: () => [1, 2, 3, 4, 5]
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Pass the Languages Test.
|
||||||
|
await page.evaluateOnNewDocument(() => {
|
||||||
|
// Overwrite the `languages` property to use a custom getter.
|
||||||
|
Object.defineProperty(navigator, 'languages', {
|
||||||
|
get: () => ['en-US', 'en']
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Pass the iframe Test
|
||||||
|
await page.evaluateOnNewDocument(() => {
|
||||||
|
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
||||||
|
get: function () {
|
||||||
|
return window;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Pass the toString test, though this stops console.debug() from working
|
||||||
|
await page.evaluateOnNewDocument(() => {
|
||||||
|
window.console.debug = () => {
|
||||||
|
return null;
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
} catch (e) {
|
||||||
|
console.error(e);
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,8 +1,5 @@
|
|||||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
|
||||||
const zlib = require('zlib');
|
const zlib = require('zlib');
|
||||||
var fs = require('fs');
|
var fs = require('fs');
|
||||||
|
|
||||||
// local module imports
|
|
||||||
const google = require('./modules/google.js');
|
const google = require('./modules/google.js');
|
||||||
const bing = require('./modules/bing.js');
|
const bing = require('./modules/bing.js');
|
||||||
const baidu = require('./modules/baidu.js');
|
const baidu = require('./modules/baidu.js');
|
||||||
@ -63,7 +60,9 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
console.log(config);
|
console.log(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
if (config.keywords && config.search_engine) {
|
||||||
|
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
||||||
|
}
|
||||||
|
|
||||||
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
||||||
var ADDITIONAL_CHROME_FLAGS = [
|
var ADDITIONAL_CHROME_FLAGS = [
|
||||||
@ -81,7 +80,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
'--disable-notifications',
|
'--disable-notifications',
|
||||||
];
|
];
|
||||||
|
|
||||||
var user_agent = undefined;
|
var user_agent = null;
|
||||||
|
|
||||||
if (config.user_agent) {
|
if (config.user_agent) {
|
||||||
user_agent = config.user_agent;
|
user_agent = config.user_agent;
|
||||||
@ -120,18 +119,23 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
if (pluggable.start_browser) {
|
if (pluggable.start_browser) {
|
||||||
launch_args.config = config;
|
launch_args.config = config;
|
||||||
let browser = await pluggable.start_browser(launch_args);
|
let browser = await pluggable.start_browser(launch_args);
|
||||||
const realUA = await browser.userAgent();
|
|
||||||
if (realUA === user_agent) {
|
const page = await browser.newPage();
|
||||||
const page = await await browser.newPage();
|
|
||||||
|
if (config.do_work && pluggable.do_work) {
|
||||||
|
let res = await pluggable.do_work(page);
|
||||||
|
results = res.results;
|
||||||
|
num_requests = res.num_requests;
|
||||||
|
} else {
|
||||||
let obj = getScraper(config.search_engine, {
|
let obj = getScraper(config.search_engine, {
|
||||||
config: config,
|
config: config,
|
||||||
context: context,
|
context: context,
|
||||||
pluggable: pluggable,
|
pluggable: pluggable,
|
||||||
|
page: page,
|
||||||
});
|
});
|
||||||
results = obj.run(page);
|
results = obj.run({page: page});
|
||||||
num_requests = obj.num_requests;
|
num_requests = obj.num_requests;
|
||||||
} else {
|
metadata = obj.metadata;
|
||||||
console.error('provided user agent does not match real user agent');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pluggable.close_browser) {
|
if (pluggable.close_browser) {
|
||||||
@ -139,9 +143,12 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
} else {
|
} else {
|
||||||
await browser.close();
|
await browser.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// if no custom start_browser functionality was given
|
// if no custom start_browser functionality was given
|
||||||
// use puppeteer-cluster for scraping
|
// use puppeteer-cluster for scraping
|
||||||
|
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
||||||
|
|
||||||
var numClusters = config.puppeteer_cluster_config.maxConcurrency;
|
var numClusters = config.puppeteer_cluster_config.maxConcurrency;
|
||||||
var perBrowserOptions = [];
|
var perBrowserOptions = [];
|
||||||
@ -235,9 +242,8 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
let ms_per_request = timeDelta/num_requests;
|
let ms_per_request = timeDelta/num_requests;
|
||||||
|
|
||||||
if (config.verbose === true) {
|
if (config.verbose === true) {
|
||||||
console.log(`se-scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||||
console.log(`On average ms/request: ${ms_per_request}ms/request`);
|
console.log(`On average ms/request: ${ms_per_request}ms/request`);
|
||||||
//console.dir(results, {depth: null, colors: true});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (config.compress === true) {
|
if (config.compress === true) {
|
||||||
@ -299,7 +305,7 @@ function parseEventData(config) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
|
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
|
||||||
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion'];
|
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
|
||||||
|
|
||||||
for (b of booleans) {
|
for (b of booleans) {
|
||||||
config[b] = _bool(config[b]);
|
config[b] = _bool(config[b]);
|
||||||
|
Loading…
Reference in New Issue
Block a user