minor improvements

This commit is contained in:
Nikolai Tschacher 2019-03-02 22:32:26 +01:00
parent abf4458e46
commit 8cbf37eaba
5 changed files with 205 additions and 178 deletions

View File

@ -65,4 +65,9 @@ module.exports = class Pluggable {
return this.browser;
}
/**
 * Hook for custom scraping logic. This base implementation is an empty
 * stub and is meant to be filled in by a concrete pluggable.
 *
 * @param {object} page - page object supplied by the caller (presumably the
 *     result of `browser.newPage()` - confirm against the calling code).
 * @returns {Promise<object>} implementations are expected to resolve to an
 *     object carrying `results` and `num_requests`; the stub as written
 *     resolves to undefined.
 */
async do_work(page) {
// do some scraping work and return results and num_requests
}
};

View File

@ -1,8 +1,8 @@
{
"news": {
"1": {
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
"num_results": "Ungefähr 25.270.000.000 Ergebnisse (0,49 Sekunden) ",
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
"num_results": "Ungefähr 13.620.000.000 Ergebnisse (0,45 Sekunden) ",
"no_results": false,
"effective_query": "",
"results": [
@ -14,13 +14,21 @@
"date": "",
"rank": 1
},
{
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"date": "",
"rank": 2
},
{
"link": "https://www.rtl.de/cms/news.html",
"title": "News: Aktuelle Nachrichten, Schlagzeilen und Videos | RTL.dehttps://www.rtl.de/cms/news.html",
"snippet": "Aktuelle Nachrichten aus Deutschland und der Welt auf einen Blick: Bei RTL.de finden Sie die News von heute, spannende Hintergründe und Videos.",
"visible_link": "https://www.rtl.de/cms/news.html",
"date": "",
"rank": 2
"rank": 3
},
{
"link": "https://www.zeit.de/news/index",
@ -28,14 +36,6 @@
"snippet": "Aktuelle News und Schlagzeilen im Newsticker von ZEIT ONLINE. Lesen Sie hier die neuesten Nachrichten.",
"visible_link": "https://www.zeit.de/news/index",
"date": "",
"rank": 3
},
{
"link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"title": "News aktuell aus Deutschland und der Welt - Bild.dehttps://www.bild.de/news/startseite/news/news-16804530.bild.html",
"snippet": "Aktuelle News aus Deutschland, Europa und der Welt. Alle Informationen, Bilder und Videos zu Skandalen, Krisen und Sensationen bei BILD.de.",
"visible_link": "https://www.bild.de/news/startseite/news/news-16804530.bild.html",
"date": "",
"rank": 4
},
{
@ -57,43 +57,51 @@
{
"link": "https://www.t-online.de/nachrichten/",
"title": "Politik aktuell: Nachrichten aus Deutschland, Europa und der Welthttps://www.t-online.de/nachrichten/",
"snippet": "Trump trifft Kim: Der Nordkorea-Gipfel in Vietnam im News-Blog · Krise in Venezuela: Aktuelle Entwicklungen, ... E-Mails und News unterwegs immer dabei.",
"snippet": "Neuigkeiten aus der Welt des Wintersports · Der Bachelor: Alle Informationen zur aktuellen Staffel · GNTM Staffel 14: News zu Germany's Next Topmodel 2019 ...",
"visible_link": "https://www.t-online.de/nachrichten/",
"date": "",
"rank": 7
},
{
"link": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUmxHZ0pFUlNnQVAB?hl=de&gl=DE&ceid=DE%3Ade",
"title": "Google News - Schlagzeilen - Neuestehttps://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
"snippet": "Mit Google News kannst du zum Thema Schlagzeilen vollständige Artikel lesen, Videos ansehen und in Tausenden von Titeln stöbern.",
"visible_link": "https://news.google.com/.../CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtUm...",
"date": "",
"rank": 8
},
{
"link": "https://www.n-tv.de/",
"title": "Nachrichten, aktuelle Schlagzeilen und Videos - n-tv.dehttps://www.n-tv.de/",
"snippet": "Nachrichten seriös, schnell und kompetent. Artikel und Videos aus Politik, Wirtschaft, Börse, Sport und News aus aller Welt.",
"visible_link": "https://www.n-tv.de/",
"date": "",
"rank": 8
},
{
"link": "https://www.stern.de/news/",
"title": "News - Sternhttps://www.stern.de/news/Im Cache",
"snippet": "News und aktuelle Schlagzeilen im Nachrichten-Ticker von STERN.de. Alle Informationen, Reportagen und Hintergründe im Überblick.",
"visible_link": "https://www.stern.de/news/",
"date": "",
"rank": 9
},
{
"link": "https://www.stern.de/panorama/weltgeschehen/news--russland-sagt-venezuela-massive-hilfslieferungen-zu-8601942.html",
"title": "News: Russland sagt Venezuela massive Hilfslieferungen zu | STERN ...https://www.stern.de Panorama WeltgeschehenIm Cache",
"snippet": "vor 1 Stunde - News des TagesPutin will Venezuela massiv unter die Arme greifen. Maserninfektionen steigen weltweit an +++ 20 Jahre Haft für Magier Jan ...",
"visible_link": "https://www.stern.de Panorama Weltgeschehen",
"date": "vor 1 Stunde - ",
"rank": 10
}
]
}
},
"se-scraper": {
"1": {
"time": "Thu, 28 Feb 2019 14:24:51 GMT",
"num_results": "Ungefähr 16.400.000 Ergebnisse (0,27 Sekunden) ",
"time": "Fri, 01 Mar 2019 15:04:34 GMT",
"num_results": "Ungefähr 17.500.000 Ergebnisse (0,36 Sekunden) ",
"no_results": false,
"effective_query": "",
"results": [
{
"link": "https://www.npmjs.com/package/se-scraper",
"title": "se-scraper - npmhttps://www.npmjs.com/package/se-scraperIm CacheDiese Seite übersetzen",
"snippet": "07.02.2019 - A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"snippet": "vor 1 Tag - se-scraper will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one ...",
"visible_link": "https://www.npmjs.com/package/se-scraper",
"date": "07.02.2019 - ",
"date": "vor 1 Tag - ",
"rank": 1
},
{
@ -104,21 +112,13 @@
"date": "",
"rank": 2
},
{
"link": "https://github.com/nyancat18/Se-Scraper",
"title": "GitHub - nyancat18/Se-Scraper: se-scraper your siteshttps://github.com/nyancat18/Se-ScraperIm CacheDiese Seite übersetzen",
"snippet": "se-scraper your sites. Contribute to nyancat18/Se-Scraper development by creating an account on GitHub.",
"visible_link": "https://github.com/nyancat18/Se-Scraper",
"date": "",
"rank": 3
},
{
"link": "http://konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"title": "Konjugation se scraper | Konjugieren verb se scraper Französisch ...konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"snippet": "Reverso-Konjugation: Konjugation des französischen Verbs se scraper, Konjugator für französische Verben, unregelmäßige Verben, Übersetzung,Grammatik.",
"visible_link": "konjugator.reverso.net/konjugation-franzosisch-verb-se%20scraper.html",
"date": "",
"rank": 4
"rank": 3
},
{
"link": "https://swedishicescraper.se/",
@ -126,7 +126,7 @@
"snippet": "The original Swedish Ice Scraper - best in test. ... solid Acrylic Glass and use diamond polishing to sharpen the scraping edges. ... info@swedishicescraper.se.",
"visible_link": "https://swedishicescraper.se/",
"date": "",
"rank": 5
"rank": 4
},
{
"link": "https://www.blackhatworld.com/seo/any-yandex-scrapers-available-or-universal-se-scraper.243421/",
@ -134,22 +134,30 @@
"snippet": "10.10.2010 - Mostly blogs & stuff like that. Is Hrefer for yandex only or there are other SEs? How much is it? Advertise on BHW ...",
"visible_link": "https://www.blackhatworld.com ... Black Hat SEO Tools",
"date": "10.10.2010 - ",
"rank": 5
},
{
"link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug-Werkzeug/dp/B01JJ96DJE",
"title": "Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug ...https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJEIm Cache",
"snippet": "Amazon.de: Küchen- und Haushaltsartikel online - Calli Edelstahl Käse Scraper Schokolade reiben Messer Flugzeug Cutter. Beschreibung: Edelstahl Käse ...",
"visible_link": "https://www.amazon.de/Calli-Edelstahl-Schokolade-Flugzeug.../dp/B01JJ96DJE",
"date": "",
"rank": 6
},
{
"link": "http://network.ubotstudio.com/forum/index.php/topic/8648-sell-free-sescraper-scrape-search-engines-with-long-lists-of-queries/",
"title": "[SELL] FREE - SEscraper - scrape search engines with long lists of ...network.ubotstudio.com ... Sell Bots and ScriptsIm CacheDiese Seite übersetzen",
"snippet": "03.12.2011 - SEscraper. Scrape results from: Google Yahoo Bing AOL Enter one or more queries as well as an optional list of keywords to append to each ...",
"visible_link": "network.ubotstudio.com ... Sell Bots and Scripts",
"date": "03.12.2011 - ",
"link": "https://www.friatec.de/content/friatec/en/Technical-Plastics/FRIATOOLS-Technical-Equipment/Mechanical-tools/index.html",
"title": "FRIATOOLS Scraper tools and mechanical tooling - Friatec AGhttps://www.friatec.de/content/friatec/en/...tools/index.htmlIm CacheDiese Seite übersetzen",
"snippet": "FRIATOOLS Scraper tools and mechanical tooling. ... FWSG SE 63 - 315, 613562 - 613574, saddle area, pipe ends, d 63 - d 315, SDR 11 - SDR 33. FWSG 710 ...",
"visible_link": "https://www.friatec.de/content/friatec/en/...tools/index.html",
"date": "",
"rank": 7
},
{
"link": "https://netpeaksoftware.com/blog/netpeak-checker-3-0-serp-scraping",
"title": "Netpeak Checker 3.0: SERP Scraping Netpeak Software Bloghttps://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...Im CacheDiese Seite übersetzen",
"snippet": "19.09.2018 - With a new tool under an 'SE Scraper' nickname you can get Google, Bing, Yahoo, and Yandex search results in a structured table with a lot of ...",
"visible_link": "https://netpeaksoftware.com/.../netpeak-checker-3-0-serp-scrapin...",
"date": "19.09.2018 - ",
"link": "https://books.google.de/books?id=IAjyQdFwh4UC&pg=PA1024&lpg=PA1024&dq=se-scraper&source=bl&ots=_3OxiKnP4G&sig=ACfU3U1kZgZPdNlnGGWHRWjU0gG2OVHY1g&hl=de&sa=X&ved=2ahUKEwjU5evLm-HgAhWw1uAKHTEUB1IQ6AEwDnoECAMQAQ",
"title": "A Dictionary of Slang and Unconventional Englishhttps://books.google.de/books?isbn=1134963653Diese Seite übersetzen",
"snippet": "1861 (OED); 1873, Rhoda Broughton, 'Happiness thinly spread over their whole lives, like bread and scrape!' Ex S.E. scrape, a thin layer.-——4. Hence, bread ...",
"visible_link": "https://books.google.de/books?isbn=1134963653",
"date": "",
"rank": 8
}
]

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.2.2",
"version": "1.2.6",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

View File

@ -13,9 +13,10 @@ module.exports = class Scraper {
config = {},
context = {},
pluggable = null,
page = null,
} = options;
this.page = null;
this.page = page;
this.metadata = {};
this.pluggable = pluggable;
this.config = config;
@ -89,13 +90,11 @@ module.exports = class Scraper {
if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page);
console.log(this.metadata.http_headers);
}
if (this.config.log_ip_address === true) {
let ipinfo = await meta.get_ip_data(this.page);
this.metadata.ipinfo = ipinfo;
console.log(ipinfo);
}
// check that our proxy is working by confirming
@ -292,127 +291,136 @@ module.exports = class Scraper {
// This is where we'll put the code to get around the tests.
async function evadeChromeHeadlessDetection(page) {
// Pass the Webdriver Test.
await page.evaluateOnNewDocument(() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
});
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
const mockObj = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
});
// Pass the Permissions Test.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
}
Function.prototype.call = call;
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
const oldToString = Function.prototype.toString;
function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
try {
// Pass the Webdriver Test.
await page.evaluateOnNewDocument(() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
});
});
// Pass the Languages Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
const mockObj = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
});
});
// Pass the iframe Test
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
get: function() {
return window;
// Pass the Permissions Test.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
}
});
});
// Pass toString test, though it breaks console.debug() from working
await page.evaluateOnNewDocument(() => {
window.console.debug = () => {
return null;
};
});
Function.prototype.call = call;
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
const oldToString = Function.prototype.toString;
function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
});
});
// Pass the Languages Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
});
// Pass the iframe Test
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
get: function () {
return window;
}
});
});
// Pass toString test, though it breaks console.debug() from working
await page.evaluateOnNewDocument(() => {
window.console.debug = () => {
return null;
};
});
} catch (e) {
console.error(e);
}
}

View File

@ -1,8 +1,5 @@
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const zlib = require('zlib');
var fs = require('fs');
// local module imports
const google = require('./modules/google.js');
const bing = require('./modules/bing.js');
const baidu = require('./modules/baidu.js');
@ -63,7 +60,9 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(config);
}
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
if (config.keywords && config.search_engine) {
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
}
// See here: https://peter.sh/experiments/chromium-command-line-switches/
var ADDITIONAL_CHROME_FLAGS = [
@ -81,7 +80,7 @@ module.exports.handler = async function handler (event, context, callback) {
'--disable-notifications',
];
var user_agent = undefined;
var user_agent = null;
if (config.user_agent) {
user_agent = config.user_agent;
@ -120,18 +119,23 @@ module.exports.handler = async function handler (event, context, callback) {
if (pluggable.start_browser) {
launch_args.config = config;
let browser = await pluggable.start_browser(launch_args);
const realUA = await browser.userAgent();
if (realUA === user_agent) {
const page = await await browser.newPage();
const page = await browser.newPage();
if (config.do_work && pluggable.do_work) {
let res = await pluggable.do_work(page);
results = res.results;
num_requests = res.num_requests;
} else {
let obj = getScraper(config.search_engine, {
config: config,
context: context,
pluggable: pluggable,
page: page,
});
results = obj.run(page);
results = obj.run({page: page});
num_requests = obj.num_requests;
} else {
console.error('provided user agent does not match real user agent');
metadata = obj.metadata;
}
if (pluggable.close_browser) {
@ -139,9 +143,12 @@ module.exports.handler = async function handler (event, context, callback) {
} else {
await browser.close();
}
} else {
// if no custom start_browser functionality was given
// use puppeteer-cluster for scraping
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
var numClusters = config.puppeteer_cluster_config.maxConcurrency;
var perBrowserOptions = [];
@ -235,9 +242,8 @@ module.exports.handler = async function handler (event, context, callback) {
let ms_per_request = timeDelta/num_requests;
if (config.verbose === true) {
console.log(`se-scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
console.log(`On average ms/request: ${ms_per_request}ms/request`);
//console.dir(results, {depth: null, colors: true});
}
if (config.compress === true) {
@ -299,7 +305,7 @@ function parseEventData(config) {
}
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion'];
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
for (b of booleans) {
config[b] = _bool(config[b]);