mirror of https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 09:38:06 +02:00

implemented generic scraping class

This commit is contained in:
parent 9e62f23451
commit 4306848657

TODO.txt | 6
@@ -25,6 +25,12 @@
 - implement duckduckgo scraping
 
+30.1.2019
+
+- modify all scrapers to use the generic class where it makes sense
+    - Bing, Baidu, Google, Duckduckgo
+
 TODO:
 - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
 - add proxy support
data.json
@@ -1 +1 @@
-{"scraping scrapeulous.com":{"1":{"time":"Tue, 29 Jan 2019 21:46:30 GMT","num_results":"Ungefähr 139 Ergebnisse (0,29 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":1},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":2},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"24.12.2018 - ","rank":3},{"link":"https://github.com/NikolaiT/GoogleScraper/blob/master/README.md","title":"GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/blob/.../README.md","date":"","rank":4},{"link":"https://googlescraper.readthedocs.io/","title":"Welcome to GoogleScraper's documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen","snippet":"Welcome to GoogleScraper's documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...","visible_link":"https://googlescraper.readthedocs.io/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.","visible_link":"https://incolumitas.com/","date":"","rank":7},{"link":"https://en.wikipedia.org/wiki/Search_engine_scraping","title":"Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen","snippet":"Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...","visible_link":"https://en.wikipedia.org/wiki/Search_engine_scraping","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}}
+{"trump":{"1":{"time":"Wed, 30 Jan 2019 15:03:46 GMT","num_results":"Ungefähr 1.450.000.000 Ergebnisse (0,49 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://de.wikipedia.org/wiki/Donald_Trump","title":"Donald Trump – Wikipediahttps://de.wikipedia.org/wiki/Donald_TrumpIm CacheÄhnliche Seiten","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New York City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"https://de.wikipedia.org/wiki/Donald_Trump","date":"","rank":1},{"link":"https://www.merkur.de/politik/milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jagen-news-zr-11469011.html","title":"Milliardär will Trump mit unfassbarer Summe aus dem Amt jagen ...https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...Im Cache","snippet":"vor 1 Stunde - Donald Trump: Der längste Shutdown in der Geschichte der USA ist beendet. Die Rede zur Lage der Nation steht bevor und ein Milliardär fährt ...","visible_link":"https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...","date":"vor 1 Stunde - ","rank":2},{"link":"http://www.spiegel.de/wirtschaft/impeachment-tom-steyer-wirbt-fuer-amtsenthebung-von-donald-trump-a-1250683.html","title":"Impeachment: Tom Steyer wirbt für Amtsenthebung von Donald Trump ...www.spiegel.de › Wirtschaft › Donald Trump","snippet":"vor 5 Stunden - US-Milliardär Tom Steyer 100 Millionen Dollar, um Trump aus dem Amt zu jagen. Der Milliardär und Ex-Fondsmanager Tom Steyer ist ...","visible_link":"www.spiegel.de › Wirtschaft › Donald Trump","date":"vor 5 Stunden - ","rank":3},{"link":"http://www.spiegel.de/thema/donald_trump/","title":"Donald Trump - SPIEGEL ONLINEwww.spiegel.de › Politik › AuslandÄhnliche Seiten","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als republikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend gewann der ...","visible_link":"www.spiegel.de › Politik › Ausland","date":"","rank":4},{"link":"https://www.faz.net/aktuell/politik/ausland/gefahren-fuer-amerika-geheimdienste-widersprechen-trump-16015734.html","title":"Gefahren für Amerika: Geheimdienste widersprechen Trump - Fazhttps://www.faz.net › Politik › Ausland","snippet":"vor 1 Stunde - Nordkorea rüstet ab, Iran auf und der „IS“ ist besiegt – so sieht es Donald Trump. Ein Bericht der amerikanischen Geheimdienste über ...","visible_link":"https://www.faz.net › Politik › Ausland","date":"vor 1 Stunde - ","rank":5},{"link":"https://www.faz.net/aktuell/politik/thema/donald-trump","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidentenhttps://www.faz.net/aktuell/politik/thema/donald-trump","snippet":"Donald Trump ist der 45. US-Präsident. ▷ Lesen Sie hier alle Nachrichten der FAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"https://www.faz.net/aktuell/politik/thema/donald-trump","date":"","rank":6},{"link":"https://www.donaldjtrump.com/","title":"Donald J. Trump for President: Homehttps://www.donaldjtrump.com/Im CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"Help continue our promise to Make America Great Again!","visible_link":"https://www.donaldjtrump.com/","date":"","rank":7},{"link":"https://www.zeit.de/thema/donald-trump","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeithttps://www.zeit.de › Politik","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und außenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"https://www.zeit.de › Politik","date":"","rank":8}]}}}
run.js | 8

@@ -15,18 +15,18 @@ let config = {
     search_engine: 'google',
     // whether debug information should be printed
     // debug info is useful for developers when debugging
-    debug: false,
+    debug: true,
     // whether verbose program output should be printed
     // this output is informational
-    verbose: false,
+    verbose: true,
     // an array of keywords to scrape
-    keywords: ['scraping scrapeulous.com'],
+    keywords: ['trump', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // the number of pages to scrape for each keyword
     num_pages: 1,
     // whether to start the browser in headless mode
-    headless: true,
+    headless: false,
     // path to output file, data will be stored in JSON
     output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
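Note: a minimal sketch of how run.js typically consumes this config block; the scrape() entry point and callback signature follow the package's documented usage, not this diff:

    // sketch, assuming the module root exports scrape(config, callback)
    const se_scraper = require('./index.js');

    se_scraper.scrape(config, (err, response) => {
        if (err) {
            console.error(err);
        }
        // results are keyed by keyword, then by page number,
        // and also written to config.output_file ('data.json')
        console.log(JSON.stringify(response, null, 2));
    });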
bing.js

@@ -1,78 +1,9 @@
 const cheerio = require('cheerio');
-const sfunctions = require('./functions.js');
+const Scraper = require('./se_scraper');
 
-module.exports = {
-    scrape_bing_pup: scrape_bing_pup,
-    scrape_bing_news_pup: scrape_bing_news_pup,
-};
+class BingScraper extends Scraper {
 
-async function scrape_bing_pup(page, event, context, pluggable) {
-    await page.goto('https://www.bing.com/');
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-
-        keyword = keywords[i];
-        results[keyword] = {};
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            let page_num = 1;
-
-            do {
-                if (event.verbose === true) {
-                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
-                }
-                if (event.sleep_range) {
-                    await sfunctions.random_sleep(event);
-                }
-                await page.waitForSelector('#b_content', { timeout: 5000 });
-                await sfunctions.sleep(500);
-                let html = await page.content();
-                results[keyword][page_num] = parse(html);
-
-                page_num += 1;
-
-                let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
-                if (!next_page_link) {
-                    break;
-                }
-                await next_page_link.click();
-                await page.waitForNavigation();
-
-            } while (page_num <= event.num_pages)
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}: ${e}`);
-        }
-    }
-
-    return results;
-}
-
-function parse(html) {
+    parse(html) {
         // load the page source into cheerio
         const $ = cheerio.load(html);
 
@@ -87,7 +18,7 @@ function parse(html) {
             })
         });
 
-    let no_results = sfunctions.no_results(
+        let no_results = this.no_results(
             ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
             $('#b_results').text()
         );
@@ -110,69 +41,51 @@ function parse(html) {
             num_results: $('#b_content .sb_count').text(),
             results: cleaned,
         }
     }
 
-async function scrape_bing_news_pup(page, event, context, pluggable) {
-    await page.goto('https://www.bing.com/news/search?');
-
-    if (event.set_manual_settings === true) {
-        console.log('Sleeping 30 seconds. Set your settings now.');
-        await sfunctions.sleep(30000);
-    }
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-
-        keyword = keywords[i];
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            // overwrites last text in input
-            await input.click({ clickCount: 3 });
-            await input.type(keyword);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-            await page.waitForSelector('#news', { timeout: 5000 });
-            await sfunctions.sleep(2000);
-
-            if (event.debug === true && event.is_local === true) {
-                await page.screenshot({path: `debug/${keyword}.png`});
-            }
-
-            let html = await page.content();
-            results[keyword] = parse_bing_news(html);
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}: ${e}`);
-        }
-    }
-
-    return results;
-}
+    async load_start_page() {
+        try {
+            await this.page.goto('https://www.bing.com/');
+            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+        } catch (e) {
+            return false;
+        }
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('.sb_pagN');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#b_content', { timeout: 5000 });
+        await this.sleep(500);
+    }
+
+    async detected() {
+        // TODO: I was never actually detected by Bing. Those are good guys.
+    }
+}
 
-function parse_bing_news(html) {
+class BingNewsScraper extends Scraper {
 
+    parse(html) {
         // load the page source into cheerio
         const $ = cheerio.load(html);
 
@@ -200,4 +113,52 @@ function parse_bing_news(html) {
             time: (new Date()).toUTCString(),
             results: cleaned,
         }
     }
+
+    async load_start_page() {
+        try {
+            await this.page.goto('https://www.bing.com/news/search?');
+            if (this.config.set_manual_settings === true) {
+                console.log('Sleeping 30 seconds. Set your settings now.');
+                await this.sleep(30000);
+            }
+            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+        } catch (e) {
+            return false;
+        }
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('.sb_pagN');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#news', { timeout: 5000 });
+        await this.sleep(2000);
+    }
+
+    async detected() {
+        // TODO: I was never actually detected by Bing News.
+    }
+}
+
+module.exports = {
+    BingNewsScraper: BingNewsScraper,
+    BingScraper: BingScraper,
+};
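Note: the new classes can also be driven standalone. A minimal sketch (this wiring is normally done by the handler, not manually; the config fields mirror run.js):

    const puppeteer = require('puppeteer');
    const { BingScraper } = require('./bing');

    (async () => {
        const browser = await puppeteer.launch({ headless: true });
        const scraper = new BingScraper({
            browser: browser,
            config: {
                search_engine: 'bing',
                keywords: ['scraping'],
                num_pages: 1,
                verbose: true,
            },
        });
        // run() opens a page, loads bing.com, searches every keyword
        // and parses each result page
        const results = await scraper.run();
        console.log(JSON.stringify(results, null, 2));
        await browser.close();
    })();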
google.js

@@ -1,104 +1,98 @@
 const cheerio = require('cheerio');
 const sfunctions = require('./functions.js');
+const Scraper = require('./se_scraper');
 
-module.exports = {
-    scrape_google_news_old_pup: scrape_google_news_old_pup,
-    scrape_google_pup: scrape_google_pup,
-    scrape_google_image_pup: scrape_google_image_pup,
-    scrape_google_news_pup: scrape_google_news_pup,
-    scrape_google_pup_dr: scrape_google_pup_dr,
-};
+class GoogleScraper extends Scraper {
 
-const STANDARD_TIMEOUT = 8000;
-const SOLVE_CAPTCHA_TIME = 45000;
-
-async function scrape_google_pup(page, event, context, pluggable) {
-    await page.goto('https://www.google.com/');
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-        keyword = keywords[i];
-        results[keyword] = {};
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            let page_num = 1;
-
-            do {
-                if (event.verbose === true) {
-                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
-                }
-                if (event.sleep_range) {
-                    await sfunctions.random_sleep(event);
-                }
-                await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
-                await sfunctions.sleep(500);
-                let html = await page.content();
-                results[keyword][page_num] = parse_google_results(html);
-
-                page_num += 1;
-
-                let next_page_link = await page.$('#pnnext', {timeout: 1000});
-                if (!next_page_link) {
-                    break;
-                }
-                await next_page_link.click();
-                await page.waitForNavigation();
-
-            } while (page_num <= event.num_pages)
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}.`);
-            console.error(e);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-                }
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
-        }
-    }
-
-    return results;
-}
+    parse(html) {
+        // load the page source into cheerio
+        const $ = cheerio.load(html);
+
+        // perform queries
+        const results = [];
+        $('#center_col .g').each((i, link) => {
+            results.push({
+                link: $(link).find('.r a').attr('href'),
+                title: $(link).find('.r a').text(),
+                snippet: $(link).find('span.st').text(),
+                visible_link: $(link).find('.r cite').text(),
+                date: $(link).find('span.f').text() || '',
+            })
+        });
+
+        let no_results = sfunctions.no_results(
+            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
+             'No results found for', 'Ergebnisse für', 'Showing results for'],
+            $('#main').text()
+        );
+
+        let effective_query = $('#fprsl').text() || '';
+        if (!effective_query) {
+            effective_query = $('#fprs a').text();
+        }
+
+        const cleaned = [];
+        for (var i=0; i < results.length; i++) {
+            let res = results[i];
+            if (res.link && res.link.trim() && res.title && res.title.trim()) {
+                res.rank = i+1;
+                cleaned.push(res);
+            }
+        }
+
+        return {
+            time: (new Date()).toUTCString(),
+            num_results: $('#resultStats').text(),
+            no_results: no_results,
+            effective_query: effective_query,
+            results: cleaned
+        }
+    }
+
+    async load_start_page() {
+        await this.page.goto('https://www.google.com/');
+
+        try {
+            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+        } catch (e) {
+            return false;
+        }
+
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('#pnnext');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
+        await this.sleep(500);
+    }
+
+    async detected() {
+        const title = await this.page.title();
+        let html = await this.page.content();
+        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
+    }
+}
 
 
 async function scrape_google_pup_dr(page, event, context, pluggable) {
     let keywords = event.keywords;
     first = keywords[0];
@@ -651,3 +645,12 @@ function parse_google_news_results(html) {
         effective_query: effective_query,
     }
 }
+
+
+module.exports = {
+    scrape_google_news_old_pup: scrape_google_news_old_pup,
+    GoogleScraper: GoogleScraper,
+    scrape_google_image_pup: scrape_google_image_pup,
+    scrape_google_news_pup: scrape_google_news_pup,
+    scrape_google_pup_dr: scrape_google_pup_dr,
+};
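Note: because parse() is a pure function of the page HTML, it can be exercised offline. A sketch (the saved-HTML path is a placeholder, not part of the repo):

    const fs = require('fs');
    const { GoogleScraper } = require('./google');

    const scraper = new GoogleScraper({ config: {} });
    const html = fs.readFileSync('google_serp.html', 'utf8');
    const parsed = scraper.parse(html);
    // parsed has the shape:
    // { time, num_results, no_results, effective_query,
    //   results: [{ link, title, snippet, visible_link, date, rank }, ...] }
    console.log(parsed.num_results, parsed.results.length);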
se_scraper.js

@@ -3,34 +3,214 @@ const start_url = {
 };
 
+/*
+    Useful JS background reading:
+
+    Class inheritance: https://javascript.info/class-inheritance
+    New ES2016-2018 features: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e
+*/
+
+module.exports = class Scraper {
+    constructor(options = {}) {
+        const {
+            searchEngine = 'google',
+            numPages = 1,
+            browser = null,
+            config = {},
+            context = {},
+            pluggable = null,
+        } = options;
+
+        this.pluggable = pluggable;
+        this.searchEngine = searchEngine;
+        this.numPages = numPages;
+        this.browser = browser;
+        this.config = config;
+        this.context = context;
+
+        this.STANDARD_TIMEOUT = 8000;
+        // longer timeout when using proxies
+        this.PROXY_TIMEOUT = 15000;
+        this.SOLVE_CAPTCHA_TIME = 45000;
+
+        this.results = {};
+    }
+
+    async run() {
+        let do_continue = await this.load_search_engine();
+
+        if (!do_continue) {
+            console.error('Failed to load the search engine: load_search_engine()');
+            return this.results;
+        }
+
+        await this.scraping_loop();
+
+        return this.results;
+    }
+
+    /**
+     * Action that runs only once at the beginning of the
+     * scraping procedure.
+     *
+     * @returns {Promise<boolean>} true if everything went well.
+     */
+    async load_search_engine() {
+        this.page = await this.browser.newPage();
+
+        // block some assets to speed up scraping
+        if (this.config.block_assets === true) {
+            await this.page.setRequestInterception(true);
+            this.page.on('request', (req) => {
+                let type = req.resourceType();
+                const block = ['stylesheet', 'font', 'image', 'media'];
+                if (block.includes(type)) {
+                    req.abort();
+                } else {
+                    req.continue();
+                }
+            });
+        }
+
-    async search_keyword() {
+        return await this.load_start_page();
     }
 
-    parse() {
+    /**
+     * Each scraper basically iterates over a list of
+     * keywords and a list of pages. This is the generic
+     * method for that.
+     *
+     * @returns {Promise<void>}
+     */
+    async scraping_loop() {
+        for (let keyword of this.config.keywords) {
+            this.results[keyword] = {};
+
+            if (this.pluggable && this.pluggable.before_keyword_scraped) {
+                await this.pluggable.before_keyword_scraped({
+                    keyword: keyword,
+                    page: this.page,
+                    event: this.config,
+                    context: this.context,
+                });
+            }
+
+            let page_num = 1;
+
+            try {
+                await this.search_keyword(keyword);
+
+                do {
+                    if (this.config.verbose === true) {
+                        console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
+                    }
+
+                    await this.wait_for_results();
+
+                    if (this.config.sleep_range) {
+                        await this.random_sleep();
+                    }
+
+                    let html = await this.page.content();
+                    this.results[keyword][page_num] = this.parse(html);
+
+                    page_num += 1;
+
+                    if (await this.next_page() === false) {
+                        break;
+                    }
+
+                } while (page_num <= this.config.num_pages);
+
+            } catch (e) {
+                console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
+
+                if (await this.detected() === true) {
+                    console.error(`${this.config.search_engine} DETECTED the scraping!`);
+
+                    if (this.config.is_local === true) {
+                        console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
+                        await this.sleep(this.SOLVE_CAPTCHA_TIME);
+                        // expect that the user filled out the necessary captcha
+                    } else {
+                        break;
+                    }
+                } else {
+                    // some other error; quit the scraping process if stuff is broken
+                    if (this.config.is_local === true) {
+                        console.error('You have 30 seconds to fix this.');
+                        await this.sleep(30000);
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    sleep(ms) {
+        return new Promise(resolve => {
+            setTimeout(resolve, ms);
+        });
+    }
+
+    async random_sleep() {
+        const [min, max] = this.config.sleep_range;
+        // generate a random number of seconds in [min, max]
+        let rand = Math.floor(Math.random() * (max - min + 1) + min);
+        if (this.config.debug === true) {
+            console.log(`Sleeping for ${rand}s`);
+        }
+        await this.sleep(rand * 1000);
+    }
+
+    async set_input_value(selector, value) {
+        await this.page.waitFor(selector);
+        await this.page.evaluate((value, selector) => {
+            return document.querySelector(selector).value = value;
+        }, value, selector);
+    }
+
+    no_results(needles, html) {
+        return !needles.map((needle) => { return html.indexOf(needle); })
+            .every((res) => { return res === -1; });
+    }
+
+    parse(html) {
+
+    }
+
+    /**
+     * @returns true if the start page was loaded correctly.
+     */
+    async load_start_page() {
+
+    }
+
+    /**
+     * Searches the keyword by inputting it into the form and hitting enter
+     * or something similar.
+     *
+     * @param keyword
+     * @returns {Promise<void>}
+     */
+    async search_keyword(keyword) {
+
+    }
+
+    /**
+     * @returns true if the next page was loaded correctly.
+     */
+    async next_page() {
+
+    }
+
+    async wait_for_results() {
+
+    }
+
+    async detected() {
+
+    }
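Note: with this base class, the TODO item "implement duckduckgo scraping" reduces to overriding the five hooks. A hypothetical sketch; the DuckDuckGo URL, selectors and pagination mechanism are assumptions, untested:

    const cheerio = require('cheerio');
    const Scraper = require('./se_scraper');

    class DuckduckgoScraper extends Scraper {

        parse(html) {
            const $ = cheerio.load(html);
            const results = [];
            // selectors are guesses; verify against the live page
            $('.result').each((i, el) => {
                results.push({
                    link: $(el).find('.result__a').attr('href'),
                    title: $(el).find('.result__a').text(),
                    snippet: $(el).find('.result__snippet').text(),
                    rank: i + 1,
                });
            });
            return {
                time: (new Date()).toUTCString(),
                results: results,
            };
        }

        async load_start_page() {
            try {
                await this.page.goto('https://duckduckgo.com/html/');
                await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
            } catch (e) {
                return false;
            }
            return true;
        }

        async search_keyword(keyword) {
            const input = await this.page.$('input[name="q"]');
            await this.set_input_value(`input[name="q"]`, keyword);
            await this.sleep(50);
            await input.focus();
            await this.page.keyboard.press("Enter");
        }

        async next_page() {
            // the HTML version paginates with a submit button (assumed selector)
            let next = await this.page.$('input[type="submit"][value="Next"]');
            if (!next) {
                return false;
            }
            await next.click();
            await this.page.waitForNavigation();
            return true;
        }

        async wait_for_results() {
            await this.page.waitForSelector('.results', { timeout: this.STANDARD_TIMEOUT });
            await this.sleep(500);
        }

        async detected() {
            // no known detection page for DuckDuckGo; assumed not applicable
            return false;
        }
    }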
@@ -129,29 +129,13 @@ module.exports.handler = async function handler (event, context, callback) {
         }
     }
 
-    const page = await browser.newPage();
-
-    // block some assets to speed up scraping
-    if (config.block_assets === true) {
-        await page.setRequestInterception(true);
-        page.on('request', (req) => {
-            let type = req.resourceType();
-            const block = ['stylesheet', 'font', 'image', 'media'];
-            if (block.includes(type)) {
-                req.abort();
-            } else {
-                req.continue();
-            }
-        });
-    }
-
-    results = await {
-        google: google.scrape_google_pup,
+    let Scraper = {
+        google: google.GoogleScraper,
         google_news_old: google.scrape_google_news_old_pup,
         google_news: google.scrape_google_news_pup,
         google_image: google.scrape_google_image_pup,
-        bing: bing.scrape_bing_pup,
-        bing_news: bing.scrape_bing_news_pup,
+        bing: bing.BingScraper,
+        bing_news: bing.BingNewsScraper,
         infospace: infospace.scrape_infospace_pup,
         webcrawler: infospace.scrape_webcrawler_news_pup,
         baidu: baidu.scrape_baidu_pup,
@@ -163,7 +147,16 @@ module.exports.handler = async function handler (event, context, callback) {
         reuters: tickersearch.scrape_reuters_finance_pup,
         cnbc: tickersearch.scrape_cnbc_finance_pup,
         marketwatch: tickersearch.scrape_marketwatch_finance_pup,
-    }[config.search_engine](page, config, context, pluggable);
+    }[config.search_engine];
+
+    let scraper = new Scraper({
+        browser: browser,
+        config: config,
+        context: context,
+        pluggable: pluggable,
+    });
+
+    let results = await scraper.run();
 
     if (pluggable.close_browser) {