Implemented a generic scraping class

This commit is contained in:
Nikolai Tschacher
2019-01-30 16:05:08 +01:00
parent 9e62f23451
commit 4306848657
7 changed files with 463 additions and 320 deletions

View File

@@ -1,203 +1,164 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
/**
 * Scraper for regular Bing web search (SERP pages).
 * Relies on the generic Scraper base class for the page lifecycle
 * (this.page is a puppeteer Page; this.no_results / this.sleep /
 * this.set_input_value come from the base class).
 */
class BingScraper extends Scraper {
    /**
     * Parse a Bing results page into structured data.
     * @param {string} html - raw page source of a Bing SERP
     * @returns {object} { time, no_results, effective_query, num_results, results }
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // collect every organic result entry
        const results = [];
        $('#b_content #b_results .b_algo').each((i, link) => {
            results.push({
                link: $(link).find('h2 a').attr('href'),
                title: $(link).find('h2').text(),
                snippet: $(link).find('.b_caption p').text(),
                visible_link: $(link).find('cite').text(),
            });
        });

        // detect "no results" / spell-corrected pages (English and German UI)
        let no_results = this.no_results(
            ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
            $('#b_results').text()
        );

        // query Bing actually searched for after spelling correction, if any
        let effective_query = $('#sp_requery a').first().text() || '';

        // drop entries without a usable link/title; rank is the 1-based
        // position in the original (uncleaned) list
        const cleaned = [];
        for (let i = 0; i < results.length; i++) {
            const res = results[i];
            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = i + 1;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            no_results: no_results,
            effective_query: effective_query,
            num_results: $('#b_content .sb_count').text(),
            results: cleaned,
        };
    }

    /**
     * Navigate to the Bing start page and wait for the search box.
     * @returns {Promise<boolean>} false when the search box never appeared
     */
    async load_start_page() {
        try {
            await this.page.goto('https://www.bing.com/');
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
        }
        return true;
    }

    /**
     * Type a keyword into the search box and submit with Enter.
     * @param {string} keyword
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        // set_input_value overwrites whatever the box currently contains
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click through to the next SERP page, if a "next" control exists.
     * @returns {Promise<boolean>} false when there is no next page
     */
    async next_page() {
        // NOTE: page.$ takes no options object; the old {timeout: 1000} was ignored
        let next_page_link = await this.page.$('.sb_pagN');
        if (!next_page_link) {
            return false;
        }
        // install the navigation wait BEFORE clicking; clicking first risks
        // the navigation finishing before waitForNavigation is registered
        await Promise.all([
            this.page.waitForNavigation(),
            next_page_link.click(),
        ]);
        return true;
    }

    /** Wait until the results container is rendered, plus a small settle delay. */
    async wait_for_results() {
        await this.page.waitForSelector('#b_content', { timeout: 5000 });
        await this.sleep(500);
    }

    async detected() {
        // TODO: I was actually never detected by bing. those are good guys.
    }
}
/**
 * Scraper for Bing News search.
 * Uses the generic Scraper base class for the page lifecycle
 * (this.page is a puppeteer Page; this.config / this.sleep /
 * this.set_input_value come from the base class).
 */
class BingNewsScraper extends Scraper {
    /**
     * Parse a Bing News results page into structured data.
     * @param {string} html - raw page source of a Bing News SERP
     * @returns {object} { time, results }
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // collect every news item entry
        const results = [];
        $('#algocore .newsitem').each((i, link) => {
            results.push({
                // news items carry their target in a `url` attribute, not an <a href>
                link: $(link).attr('url'),
                title: $(link).find('a.title').text(),
                snippet: $(link).find('.snippet').text(),
                date: $(link).find('.source span').last().text(),
            });
        });

        // drop entries without a usable link/title; rank is the 1-based
        // position in the original (uncleaned) list
        const cleaned = [];
        for (let i = 0; i < results.length; i++) {
            const res = results[i];
            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = i + 1;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
        };
    }

    /**
     * Navigate to the Bing News search page and wait for the search box.
     * Optionally pauses so the operator can adjust settings manually.
     * @returns {Promise<boolean>} false when the search box never appeared
     */
    async load_start_page() {
        try {
            await this.page.goto('https://www.bing.com/news/search?');
            if (this.config.set_manual_settings === true) {
                console.log('Sleeping 30 seconds. Set your settings now.');
                await this.sleep(30000);
            }
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
        }
        return true;
    }

    /**
     * Type a keyword into the search box and submit with Enter.
     * @param {string} keyword
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        // set_input_value overwrites whatever the box currently contains
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click through to the next news results page, if a "next" control exists.
     * @returns {Promise<boolean>} false when there is no next page
     */
    async next_page() {
        // NOTE: page.$ takes no options object; the old {timeout: 1000} was ignored
        let next_page_link = await this.page.$('.sb_pagN');
        if (!next_page_link) {
            return false;
        }
        // install the navigation wait BEFORE clicking to avoid losing the
        // navigation event in a race
        await Promise.all([
            this.page.waitForNavigation(),
            next_page_link.click(),
        ]);
        return true;
    }

    /** Wait until the news container is rendered, plus a settle delay. */
    async wait_for_results() {
        await this.page.waitForSelector('#news', { timeout: 5000 });
        await this.sleep(2000);
    }

    async detected() {
        // TODO: I was actually never detected by bing news.
    }
}
module.exports = {
scrape_bing_pup: scrape_bing_pup,
scrape_bing_news_pup: scrape_bing_news_pup,
};
/**
 * Legacy entry point: scrape Bing web search for every keyword in the event.
 *
 * @param {object} page - puppeteer Page, already launched
 * @param {object} event - config: keywords, num_pages, verbose, sleep_range, search_engine
 * @param {object} context - opaque context forwarded to pluggable hooks
 * @param {object} pluggable - optional hooks (before_keyword_scraped)
 * @returns {Promise<object>} results keyed by keyword, then by page number
 */
async function scrape_bing_pup(page, event, context, pluggable) {
    // declare up front so the early return below yields an empty object,
    // not `undefined` (the old hoisted `var` returned undefined here)
    const results = {};

    await page.goto('https://www.bing.com/');

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        // search box never appeared; nothing was scraped
        return results;
    }

    const keywords = event.keywords;

    for (let i = 0; i < keywords.length; i++) {
        // `const` here fixes the old implicit-global `keyword`
        const keyword = keywords[i];
        results[keyword] = {};

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[name="q"]');
            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
            await sfunctions.sleep(50);
            await input.focus();
            await page.keyboard.press("Enter");

            let page_num = 1;

            do {
                if (event.verbose === true) {
                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
                }

                if (event.sleep_range) {
                    await sfunctions.random_sleep(event);
                }

                await page.waitForSelector('#b_content', { timeout: 5000 });
                await sfunctions.sleep(500);

                const html = await page.content();
                results[keyword][page_num] = parse(html);
                page_num += 1;

                // NOTE: page.$ takes no options; the old {timeout: 1000} was ignored
                const next_page_link = await page.$('.sb_pagN');
                if (!next_page_link) {
                    break;
                }
                // install the navigation wait before clicking to avoid a race
                await Promise.all([
                    page.waitForNavigation(),
                    next_page_link.click(),
                ]);

            } while (page_num <= event.num_pages);

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Parse a Bing SERP into a structured result object.
 * @param {string} html - raw page source
 * @returns {object} { time, no_results, effective_query, num_results, results }
 */
function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // gather every organic result entry
    const raw = [];
    $('#b_content #b_results .b_algo').each((i, el) => {
        const node = $(el);
        raw.push({
            link: node.find('h2 a').attr('href'),
            title: node.find('h2').text(),
            snippet: node.find('.b_caption p').text(),
            visible_link: node.find('cite').text(),
        });
    });

    // detect empty / spell-corrected result pages (English and German UI)
    const no_results = sfunctions.no_results(
        ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
        $('#b_results').text()
    );

    // query Bing actually searched after spelling correction, if any
    const effective_query = $('#sp_requery a').first().text() || '';

    // keep only entries with a non-empty link and title; rank is the
    // 1-based position in the original (unfiltered) list
    const cleaned = [];
    raw.forEach((res, idx) => {
        if (res.link && res.link.trim() && res.title && res.title.trim()) {
            res.rank = idx + 1;
            cleaned.push(res);
        }
    });

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        effective_query: effective_query,
        num_results: $('#b_content .sb_count').text(),
        results: cleaned,
    };
}
/**
 * Legacy entry point: scrape Bing News for every keyword in the event.
 *
 * @param {object} page - puppeteer Page, already launched
 * @param {object} event - config: keywords, sleep_range, set_manual_settings, debug, is_local
 * @param {object} context - opaque context forwarded to pluggable hooks
 * @param {object} pluggable - optional hooks (before_keyword_scraped)
 * @returns {Promise<object>} parsed news results keyed by keyword
 */
async function scrape_bing_news_pup(page, event, context, pluggable) {
    // declare up front so the early return below yields an empty object,
    // not `undefined` (the old hoisted `var` returned undefined here)
    const results = {};

    await page.goto('https://www.bing.com/news/search?');

    if (event.set_manual_settings === true) {
        console.log('Sleeping 30 seconds. Set your settings now.');
        await sfunctions.sleep(30000);
    }

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        // search box never appeared; nothing was scraped
        return results;
    }

    const keywords = event.keywords;

    for (let i = 0; i < keywords.length; i++) {
        // `const` here fixes the old implicit-global `keyword`
        const keyword = keywords[i];

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[name="q"]');
            // triple-click selects the current text so typing overwrites it
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('#news', { timeout: 5000 });
            await sfunctions.sleep(2000);

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            const html = await page.content();
            results[keyword] = parse_bing_news(html);

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Parse a Bing News SERP into a structured result object.
 * @param {string} html - raw page source
 * @returns {object} { time, results }
 */
function parse_bing_news(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // gather every news item entry
    const raw = [];
    $('#algocore .newsitem').each((i, el) => {
        const node = $(el);
        raw.push({
            // news items carry their target in a `url` attribute
            link: node.attr('url'),
            title: node.find('a.title').text(),
            snippet: node.find('.snippet').text(),
            date: node.find('.source span').last().text(),
        });
    });

    // keep only entries with a non-empty link and title; rank is the
    // 1-based position in the original (unfiltered) list
    const cleaned = [];
    raw.forEach((res, idx) => {
        if (res.link && res.link.trim() && res.title && res.title.trim()) {
            res.rank = idx + 1;
            cleaned.push(res);
        }
    });

    return {
        time: (new Date()).toUTCString(),
        results: cleaned,
    };
}
// The original lines here were a dangling object-literal fragment (diff
// residue) and a syntax error. Attach the new class-based API to the
// existing exports without clobbering the legacy function exports above.
module.exports.BingNewsScraper = BingNewsScraper;
module.exports.BingScraper = BingScraper;