Cleaned up the Google scrapers: all scrapers are classes now, shrinking the module from 600 LOC to 400 LOC. HIGH IQ MOVE

2025-06-20 09:38:06 +02:00 · 2019-01-30 20:24:03 +01:00 · 2019-01-30 20:24:03 +01:00 · 581568ff18
commit 581568ff18
parent 4306848657
5 changed files with 252 additions and 509 deletions
--- a/data.json
+++ b/data.json
--- a/run.js
+++ b/run.js
@ -12,7 +12,7 @@ let config = {
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,2]',
    // which search engine to scrape
-    search_engine: 'google',
+    search_engine: 'google_news',
    // whether debug information should be printed
    // debug info is useful for developers when debugging
    debug: true,
@ -20,7 +20,7 @@ let config = {
    // this output is informational
    verbose: true,
    // an array of keywords to scrape
-    keywords: ['trump', ],
+    keywords: ['hacking', 'trump'],
    // alternatively you can specify a keyword_file. this overwrites the keywords array
    keyword_file: '',
    // the number of pages to scrape for each keyword
@ -54,7 +54,7 @@ function callback(err, response) {
        response.statusCode - status code of the scraping process
     */

-    console.dir(response.results, {depth: null, colors: true});
+    // console.dir(response.results, {depth: null, colors: true});
 }

 se_scraper.scrape(config, callback);
--- a/src/modules/google.js
+++ b/src/modules/google.js
@ -92,398 +92,285 @@ class GoogleScraper extends Scraper {
 	}
 }

+class GoogleNewsOldScraper extends Scraper {

-async function scrape_google_pup_dr(page, event, context, pluggable) {
-    let keywords = event.keywords;
-    first = keywords[0];
-    var year = first.slice(-5);
-    var remaining = first.slice(0,-5);
-    year = parseInt(year.trim());
-    let dr_from = `1/1/${year-1}`;
-    let dr_to = `1/1/${year+1}`;
-    var url = `https://www.google.com/search?lr=&hl=en&tbs=cdr:1,cd_min:${dr_from},cd_max:${dr_to}&q=${remaining}&oq=${remaining}`;
+	parse(html) {
+		const $ = cheerio.load(html);
+		// perform queries
+		const results = [];

-    await page.goto(url);
+		$('.g').each((i, result) => {
+			results.push({
+				link: $(result).find('h3 a').attr('href'),
+				title: $(result).find('h3 a').text(),
+				snippet: $(result).find('.st').text(),
+				date: $(result).find('.nsa').text(),
+			})
+		});

-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
-    } catch (e) {
-        return results;
-    }
+		let no_results = sfunctions.no_results(
+			['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
+				'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
+			$('#main').text()
+		);

-    var results = {};
-
-    for (var i = 1; i < keywords.length; i++) {
-
-        // strip the year at the end plus whitespace
-        keyword = keywords[i].slice(0,-5);
-
-		if (pluggable.before_keyword_scraped) {
-			await pluggable.before_keyword_scraped({
-				keyword: keyword,
-				page: page,
-				event: event,
-				context: context,
-			});
+		let effective_query = $('#fprsl').text() || '';
+		if (!effective_query) {
+			effective_query = $('#fprs a').text()
 		}

-        if (event.verbose === true) {
-            console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
-        }
+		const cleaned = [];
+		for (var i=0; i < results.length; i++) {
+			let res = results[i];
+			if (res.link && res.link.trim()) {
+				res.rank = i+1;
+				cleaned.push(res);
+			}
+		}

-        try {
-            const input = await page.$('input[name="q"]');
-            // overwrites last text in input
-            // await input.click({ clickCount: 3 });
-            // await sfunctions.sleep(50);
-            // await input.type(keyword);
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            if (event.debug === true && event.is_local === true) {
-                console.log(`[${i}] Scraping ${keyword}`);
-            }
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-            await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
-            await sfunctions.sleep(100);
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}: ${e}`);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-				}
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
-        }
-
-        let html = await page.content();
-        results[keyword] = parse_google_results(html);
-        results[keyword].daterange = dr_from + '-' + dr_to;
-        results[keyword].year = year;
-    }
-
-    return results;
-}
-
-function parse_google_results(html) {
-	// load the page source into cheerio
-	const $ = cheerio.load(html);
-
-	// perform queries
-	const results = [];
-	$('#center_col .g').each((i, link) => {
-		results.push({
-		  link: $(link).find('.r a').attr('href'),
-		  title: $(link).find('.r a').text(),
-		  snippet: $(link).find('span.st').text(),
-		  visible_link: $(link).find('.r cite').text(),
-		  date: $(link).find('span.f').text() || '',
-		})
-	});
-
-	let no_results = sfunctions.no_results(
-		['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-			'No results found for', 'Ergebnisse für', 'Showing results for'],
-		$('#main').text()
-	);
-
-	let effective_query = $('#fprsl').text() || '';
-	if (!effective_query) {
-		effective_query = $('#fprs a').text()
-	}
-
-	const cleaned = [];
-	for (var i=0; i < results.length; i++) {
-		let res = results[i];
-		if (res.link && res.link.trim() && res.title && res.title.trim()) {
-			res.rank = i+1;
-			cleaned.push(res);
+		return {
+			time: (new Date()).toUTCString(),
+			results: cleaned,
+			no_results: no_results,
+			effective_query: effective_query,
 		}
 	}

-	return {
-		time: (new Date()).toUTCString(),
-		num_results: $('#resultStats').text(),
-		no_results: no_results,
-		effective_query: effective_query,
-		results: cleaned
-	}
-}
-
-async function scraping_detected(page) {
-    const title = await page.title();
-    let html = await page.content();
-	return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
-}
-
-async function scrape_google_news_old_pup(page, event, context, pluggable) {
-	let keywords = event.keywords;
-	var results = {};
-
-	for (var i = 0; i < keywords.length; i++) {
-
-		keyword = keywords[i];
-
-		if (pluggable.before_keyword_scraped) {
-			await pluggable.before_keyword_scraped({
-				keyword: keyword,
-				page: page,
-				event: event,
-				context: context,
-			});
-		}
-
-        if (event.verbose === true) {
-            console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
-        }
-
-		try {
-			await page.goto(`https://www.google.com/search?q=${keyword}&hl=en&source=lnms&tbm=nws`, {
-	  			referer: 'https://www.google.com/'
-			});
-			await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
-			const input = await page.$('input[name="q"]');
-			// overwrites last text in input
-			// await input.click({ clickCount: 3 });
-			// await input.type(keyword);
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-			await input.focus();
-			await page.keyboard.press("Enter");
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-			await page.waitForNavigation({ timeout: STANDARD_TIMEOUT });
-			await page.waitForSelector('#main', { timeout: STANDARD_TIMEOUT });
-
-			await sfunctions.sleep(200);
-
-		} catch(e) {
-			console.error(`Problem with scraping ${keyword}: ${e}`);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-                }
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
-		}
-
-        let html = await page.content();
-        results[keyword] = parse_google_news_results_se_format(html);
-
+	async load_start_page() { // no start page is loaded here: search_keyword() navigates directly to the news result URL
+		return true; // unconditionally returns true
 	}

-	return results;
-}
-
-function parse_google_news_results_se_format(html) {
-	const $ = cheerio.load(html);
-	// perform queries
-	const results = [];
-
-	$('.g').each((i, result) => {
-		results.push({
-		  link: $(result).find('h3 a').attr('href'),
-		  title: $(result).find('h3 a').text(),
-		  snippet: $(result).find('.st').text(),
-		  date: $(result).find('.nsa').text(),
-		})
-	});
-
-	let no_results = sfunctions.no_results(
-		['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-			'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
-		$('#main').text()
-	);
-
-	let effective_query = $('#fprsl').text() || '';
-	if (!effective_query) {
-		effective_query = $('#fprs a').text()
-	}
-
-	const cleaned = [];
-	for (var i=0; i < results.length; i++) {
-		let res = results[i];
-		if (res.link && res.link.trim()) {
-			res.rank = i+1;
-			cleaned.push(res);
-		}
-	}
-
-  return {
-      time: (new Date()).toUTCString(),
-      results: cleaned,
-      no_results: no_results,
-      effective_query: effective_query,
-  }
-}
-
-async function scrape_google_image_pup(page, event, context, pluggable) {
-	let keywords = event.keywords;
-	var results = {};
-
-	await page.goto(`https://www.google.com/imghp?tbm=isch`, {
+	async search_keyword(keyword) {
+		await this.page.goto(`https://www.google.com/search?q=${keyword}&hl=en&source=lnms&tbm=nws`, {
 			referer: 'https://www.google.com/'
-	});
-
-	try {
-		await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
-	} catch (e) {
-		return results;
+		});
+		await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
 	}

-	for (var i = 0; i < keywords.length; i++) {
+	async next_page() {
+		let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
+		if (!next_page_link) {
+			return false;
+		}
+		await next_page_link.click();
+		await this.page.waitForNavigation();

-		keyword = keywords[i];
+		return true;
+	}

-		if (pluggable.before_keyword_scraped) {
-			await pluggable.before_keyword_scraped({
-				keyword: keyword,
-				page: page,
-				event: event,
-				context: context,
-			});
+	async wait_for_results() { // waits for the '#main' element (bounded by this.STANDARD_TIMEOUT), then sleeps briefly
+		//await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
+		await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
+		await this.sleep(200); // presumably lets the page settle; this.sleep is inherited, unit assumed to be ms — TODO confirm
+	}
+
+	async detected() { // resolves to true when the current page carries either of the two markers checked below
+		const title = await this.page.title();
+		let html = await this.page.content();
+		return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; // NOTE(review): '/sorry/' is searched in the page title, not the URL — confirm this is intended
+	}
+}
+
+
+class GoogleImageScraper extends Scraper {
+
+	parse(html) {
+		// load the page source into cheerio
+		const $ = cheerio.load(html);
+
+		// perform queries
+		const results = [];
+		$('.rg_bx').each((i, link) => {
+			let link_element = $(link).find('a.rg_l').attr('href');
+			let clean_link = clean_image_url(link_element);
+			results.push({
+				link: link_element,
+				clean_link: clean_link,
+				snippet: $(link).find('.a-no-hover-decoration').text(),
+			})
+		});
+
+		let no_results = sfunctions.no_results(
+			['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
+				'Showing results for', 'Ergebnisse für'],
+			$('#main').text()
+		);
+
+		let effective_query = $('#fprsl').text() || '';
+		if (!effective_query) {
+			effective_query = $('#fprs a').text();
 		}

-        if (event.verbose === true) {
-            console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
-        }
+		const cleaned = [];
+		for (var i=0; i < results.length; i++) {
+			let res = results[i];
+			if (res.link && res.link.trim() && res.link.trim().length > 10) {
+				res.link = res.link.trim();
+				res.rank = i+1;
+				cleaned.push(res);
+			}
+		}

+		return {
+			time: (new Date()).toUTCString(),
+			no_results: no_results,
+			results: cleaned,
+			effective_query: effective_query
+		}
+	}
+
+	async load_start_page() {
 		try {
-			const input = await page.$('input[name="q"]');
-			// overwrites last text in input
-			// await input.click({ clickCount: 3 });
-			// await input.type(keyword);
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-
-			await input.focus();
-			await page.keyboard.press("Enter");
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-			await page.waitForNavigation({ timeout: STANDARD_TIMEOUT});
-			await page.waitForSelector('#main', { timeout: STANDARD_TIMEOUT });
-
-			let html = await page.content();
-			results[keyword] = parse_google_image_results(html);
+			await this.page.goto(`https://www.google.com/imghp?tbm=isch`, {
+				referer: 'https://www.google.com/'
+			});
+			await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
 		} catch (e) {
-			console.error(`Problem with scraping ${keyword}: ${e}`);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-                }
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
+			return false;
 		}
-
+		return true;
 	}

-	return results;
+	async search_keyword(keyword) { // submits `keyword` through the search box of the page that is currently open
+		const input = await this.page.$('input[name="q"]'); // null when the box is missing, in which case input.focus() below throws
+		await this.set_input_value(`input[name="q"]`, keyword); // inherited helper, not visible here; presumably replaces the field's current text — TODO confirm
+		await this.sleep(50);
+		await input.focus(); // focus the box so the Enter key press below is sent to it
+		await this.page.keyboard.press("Enter");
+	}
+
+	async next_page() {
+		let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
+		if (!next_page_link) {
+			return false;
+		}
+		await next_page_link.click();
+		await this.page.waitForNavigation();
+
+		return true;
+	}
+
+	async wait_for_results() { // waits for the '#main' element (bounded by this.STANDARD_TIMEOUT), then sleeps briefly
+		await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
+		await this.sleep(100); // presumably lets the page settle; this.sleep is inherited, unit assumed to be ms — TODO confirm
+	}
+
+	async detected() { // resolves to true when the current page carries either of the two markers checked below
+		const title = await this.page.title();
+		let html = await this.page.content();
+		return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; // NOTE(review): '/sorry/' is searched in the page title, not the URL — confirm this is intended
+	}
 }

-function parse_google_image_results(html) {
-	// load the page source into cheerio
-	const $ = cheerio.load(html);

-	// perform queries
-	const results = [];
-	$('.rg_bx').each((i, link) => {
-		let link_element = $(link).find('a.rg_l').attr('href');
-		let clean_link = clean_image_url(link_element);
-		results.push({
-		  link: link_element,
-		  clean_link: clean_link,
-		  snippet: $(link).find('.a-no-hover-decoration').text(),
-		})
-	});
+class GoogleNewsScraper extends Scraper {

-	let no_results = sfunctions.no_results(
-		['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
-		'Showing results for', 'Ergebnisse für'],
-		$('#main').text()
-	);
+	parse(html) {
+		const $ = cheerio.load(html);
+		// perform queries
+		const results = [];

-	let effective_query = $('#fprsl').text() || '';
-	if (!effective_query) {
-		effective_query = $('#fprs a').text();
-	}
+		$('article h3').each((i, headline) => {

-	const cleaned = [];
-	for (var i=0; i < results.length; i++) {
-		let res = results[i];
-		if (res.link && res.link.trim() && res.link.trim().length > 10) {
-            res.link = res.link.trim();
-			res.rank = i+1;
-			cleaned.push(res);
+			let title = $(headline).find('a span').text();
+
+			try {
+				var snippet = $(headline).parent().find('p').text();
+				var link = $(headline).find('a').attr('href');
+				var date = $(headline).parent().parent().parent().find('time').text();
+				var ts = $(headline).parent().parent().parent().find('time').attr('datetime');
+			} catch(e) {
+
+			}
+
+			if (!this.all_results.has(title)) {
+				results.push({
+					rank: i+1,
+					title: title,
+					snippet: snippet,
+					link: link,
+					date: date,
+					ts: ts,
+				});
+			}
+			this.all_results.add(title);
+		});
+
+		let no_results = sfunctions.no_results(
+			['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
+				'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
+			$('body').text()
+		);
+
+		let effective_query = $('#fprsl').text() || '';
+
+		const cleaned = [];
+		for (var i=0; i < results.length; i++) {
+			let res = results[i];
+			if (res.title && res.title.trim()) {
+				res.rank = i+1;
+				cleaned.push(res);
+			}
+		}
+
+		return {
+			time: (new Date()).toUTCString(),
+			results: cleaned,
+			no_results: no_results,
+			effective_query: effective_query,
 		}
 	}

-	return {
-		time: (new Date()).toUTCString(),
-		no_results: no_results,
-		results: cleaned,
-		effective_query: effective_query
+	async load_start_page() { // opens the Google News front page and stores its parsed headlines under this.results['frontpage']; returns false if any step throws
+		try {
+			this.all_results = new Set(); // titles already emitted; parse() consults this set to drop repeated headlines
+			await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
+				referer: 'https://news.google.com'
+			});
+			await this.page.waitForSelector('div input:nth-child(2)', {timeout: this.STANDARD_TIMEOUT}); // same selector search_keyword() uses to find the search input
+			await this.sleep(1000);
+
+			// parse the front page results before any keyword is searched
+			let html = await this.page.content();
+			this.results['frontpage'] = this.parse(html);
+		} catch(e) {
+			return false; // NOTE(review): the caught error is discarded, so the reason for the failure is not reported
+		}
+		return true;
+	}
+
+	async search_keyword(keyword) { // types `keyword` into the search input and submits it with Enter
+		await this.page.waitForSelector('div input:nth-child(2)', { timeout: this.STANDARD_TIMEOUT });
+		const input = await this.page.$('div input:nth-child(2)');
+		// overwrites last text in input
+		await input.click({ clickCount: 3 }); // triple click, presumably selecting the existing text so typing replaces it
+		await input.type(keyword);
+		await this.sleep(50);
+		await input.focus();
+		await this.page.keyboard.press("Enter");
+	}
+
+	async next_page() { // this scraper does not paginate, so it always returns false
+		// google news app does not have next pages
+		return false;
+	}
+
+	async wait_for_results() {
+		await this.page.waitForSelector(`[data-n-q="${this.keyword}"]`, { timeout: this.STANDARD_TIMEOUT });
+		await this.sleep(2000);
+	}
+
+	async detected() { // resolves to true when the current page carries either of the two markers checked below
+		const title = await this.page.title();
+		let html = await this.page.content();
+		return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; // NOTE(review): '/sorry/' is searched in the page title, not the URL — confirm this is intended
 	}
 }

+
 function clean_image_url(url) {
 	// Example:
- 	// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
+	// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
 	const regex = /imgurl=(.*?)&/gm;
 	let match = regex.exec(url);
 	if (match !== null) {
@ -504,153 +391,9 @@ function clean_google_url(url) {
 	}
 }

-const all_results = new Set();
-
-async function scrape_google_news_pup(page, event, context, pluggable) {
-	let keywords = event.keywords;
-	var results = {};
-
-	await page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
-			referer: 'https://news.google.com'
-	});
-	await page.waitForSelector('div input:nth-child(2)', { timeout: STANDARD_TIMEOUT });
-	await sfunctions.sleep(1000);
-
-	// parse here front page results
-	let html = await page.content();
-	results['frontpage'] = parse_google_news_results(html);
-
-	for (var i = 0; i < keywords.length; i++) {
-
-		keyword = keywords[i];
-
-		if (pluggable.before_keyword_scraped) {
-			await pluggable.before_keyword_scraped({
-				keyword: keyword,
-				page: page,
-				event: event,
-				context: context,
-			});
-		}
-
-        if (event.verbose === true) {
-            console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
-        }
-
-		try {
-			await page.waitForSelector('div input:nth-child(2)', { timeout: STANDARD_TIMEOUT });
-
-			const input = await page.$('div input:nth-child(2)');
-			// overwrites last text in input
-			await input.click({ clickCount: 3 });
-			await input.type(keyword);
-			// TODO: setting the input in https://news.google.com/
-			// TODO: doesn't work. Fall back to use clicking and typing
-            // await setTextInputValue(page, `input[aria-label="Search"]`, keyword);
-            await sfunctions.sleep(50);
-			await input.focus();
-			await page.keyboard.press("Enter");
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-			//await page.waitForSelector('#main', { timeout: 5000 });
-
-			await sfunctions.sleep(2500);
-
-			html = await page.content();
-			results[keyword] = parse_google_news_results(html);
-
-		} catch(e) {
-			console.error(`Problem with scraping ${keyword}: ${e}`);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-                }
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
-		}
-	}
-
-	return results;
-}
-
-function parse_google_news_results(html) {
-	const $ = cheerio.load(html);
-	// perform queries
-	const results = [];
-
-	$('article h3').each((i, headline) => {
-
-		title = $(headline).find('a span').text();
-
-		try {
-			snippet = $(headline).parent().find('p').text();
-            link = $(headline).find('a').attr('href');
-			date = $(headline).parent().parent().parent().find('time').text();
-			ts = $(headline).parent().parent().parent().find('time').attr('datetime');
-		} catch(e) {
-
-		}
-
-		if (!all_results.has(title)) {
-		    results.push({
-		      rank: i+1,
-		      title: title,
-		      snippet: snippet,
-              link: link,
-		      date: date,
-		      ts: ts,
-		    })
-		}
-		all_results.add(title);
-	});
-
-	let no_results = sfunctions.no_results(
-		['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-			'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
-		$('body').text()
-	);
-
-	let effective_query = $('#fprsl').text() || '';
-
-	const cleaned = [];
-	for (var i=0; i < results.length; i++) {
-		let res = results[i];
-		if (res.title && res.title.trim()) {
-			res.rank = i+1;
-			cleaned.push(res);
-		}
-	}
-
-  return {
-      time: (new Date()).toUTCString(),
-      results: cleaned,
-      no_results: no_results,
-      effective_query: effective_query,
-  }
-}
-
-
 module.exports = {
-	scrape_google_news_old_pup: scrape_google_news_old_pup,
+	GoogleNewsOldScraper: GoogleNewsOldScraper,
 	GoogleScraper: GoogleScraper,
-	scrape_google_image_pup: scrape_google_image_pup,
-	scrape_google_news_pup: scrape_google_news_pup,
-	scrape_google_pup_dr: scrape_google_pup_dr,
+	GoogleImageScraper: GoogleImageScraper,
+	GoogleNewsScraper: GoogleNewsScraper,
 };
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@ -82,7 +82,7 @@ module.exports = class Scraper {
    async scraping_loop() {

        for (let keyword of this.config.keywords) {
-
+            this.keyword = keyword;
            this.results[keyword] = {};

            if (this.pluggable.before_keyword_scraped) {
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@ -131,11 +131,12 @@ module.exports.handler = async function handler (event, context, callback) {

 		Scraper = {
 			google: google.GoogleScraper,
-			google_news_old: google.scrape_google_news_old_pup,
-			google_news: google.scrape_google_news_pup,
-			google_image: google.scrape_google_image_pup,
+			google_news_old: google.GoogleNewsOldScraper,
+			google_news: google.GoogleNewsScraper,
+			google_image: google.GoogleImageScraper,
 			bing: bing.BingScraper,
 			bing_news: bing.BingNewsScraper,
+
 			infospace: infospace.scrape_infospace_pup,
 			webcrawler: infospace.scrape_webcrawler_news_pup,
 			baidu: baidu.scrape_baidu_pup,
@ -158,7 +159,6 @@ module.exports.handler = async function handler (event, context, callback) {

 		let results = await scraper.run();

-
 		if (pluggable.close_browser) {
 			await pluggable.close_browser();
 		} else {