.

2019-01-26 20:15:19 +01:00 · 2019-01-26 20:15:19 +01:00 · b354e6918d
commit b354e6918d
parent bab902e80a
10 changed files with 83 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -47,22 +47,24 @@ let config = {
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: false,
    // get meta data of scraping in return object
-    write_meta_data: 'true',
+    write_meta_data: true,
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,1]',
    // which search engine to scrape
    search_engine: 'yahoo_news',
    // whether debug information should be printed
-    debug: 'true',
+    debug: true,
    // whether verbose program output should be printed
-    verbose: 'false',
+    verbose: false,
    // an array of keywords to scrape
    keywords: ['GOOGL', ],
    // alternatively you can specify a keyword_file. this overwrites the keywords array
    keyword_file: './keywords.txt',
    // whether to start the browser in headless mode
    headless: false,
+    // path to output file, data will be stored in JSON
+    output_file: 'results.json',
 };

 se_scraper.scrape(config, (err, response) => {
--- a/data.json
+++ b/data.json
@ -0,0 +1 @@
+{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. 
Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}
--- a/examples/multiple_search_engines.js
+++ b/examples/multiple_search_engines.js
@ -0,0 +1,35 @@
+const se_scraper = require('../index.js');
+
+/**
+ * Scrapes the same keyword list with several search engines, one after
+ * another, and prints the results of each run to the console.
+ *
+ * Every engine gets its own config object and its own output file
+ * (`<engine>.json`).
+ *
+ * @returns {Promise<void>} settles once the last scrape call has been awaited
+ */
+async function multiple_search_engines() {
+    const searchEnginesList = ['google', 'bing'];
+
+    // Sequential on purpose: each scrape is awaited before the next one starts.
+    for (const searchEngine of searchEnginesList) {
+        const config = {
+            random_user_agent: true,
+            write_meta_data: true,
+            sleep_range: '[1,1]',
+            search_engine: searchEngine,
+            debug: false,
+            verbose: false,
+            // the list of keywords to scrape
+            keywords: ['scrapeulous.com',],
+            // whether to start the browser in headless mode
+            headless: true,
+            output_file: `${searchEngine}.json`
+        };
+
+        await se_scraper.scrape(config, (err, response) => {
+            if (err) {
+                console.error(err);
+                // Stop here: `response` may not be set when the scrape failed,
+                // and reading `.results` from it would throw.
+                return;
+            }
+            console.dir(response.results, {
+                depth: null,
+                colors: true
+            });
+        });
+    }
+}
+
+multiple_search_engines();
--- a/index.js
+++ b/index.js
@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js');
 var fs = require('fs');
 var os = require("os");

-exports.scrape = function(config, callback) {
+exports.scrape = async function(config, callback) {
 	// options for scraping
 	event = {
 		// the user agent to scrape with
@ -10,19 +10,23 @@ exports.scrape = function(config, callback) {
 		// if random_user_agent is set to True, a random user agent is chosen
 		random_user_agent: false,
 		// whether to select manual settings in visible mode
-		set_manual_settings: 'false',
+		set_manual_settings: false,
 		// get meta data of scraping in return object
-		write_meta_data: 'true',
-		log_http_headers: 'false',
+		write_meta_data: true,
+		log_http_headers: false,
 		// how long to sleep between requests. a random sleep interval within the range [a,b]
 		// is drawn before every request. empty string for no sleeping.
 		sleep_range: '[1,1]',
 		// which search engine to scrape
 		search_engine: 'google',
-		compress: 'false', // compress
-		debug: 'false',
-		verbose: 'false',
+		compress: false, // compress
+		debug: false,
+		verbose: false,
 		keywords: ['test'],
+		// whether to start the browser in headless mode
+		headless: true,
+		// path to output file, data will be stored in JSON
+		output_file: '',
 	};

 	for (var key in config) {
@ -44,7 +48,7 @@ exports.scrape = function(config, callback) {
 		}
 	}

-	handler.handler(event, undefined, callback );
+	await handler.handler(event, undefined, callback );
 };

 function read_keywords_from_file(fname) {
@ -55,10 +59,3 @@ function read_keywords_from_file(fname) {
 	});
 	return kws;
 }
-
-function write_results(fname, data) {
-	fs.writeFile(fname || 'results.json', data, (err) => {
-		if (err) throw err;
-		console.log('Results written to file');
-	});
-}
--- a/jformat.py
+++ b/jformat.py
@ -0,0 +1,6 @@
+import pprint
+import sys
+import json
+
+# Pretty-print the JSON file named by the single command-line argument.
+# Any other argument count is a silent no-op.
+if len(sys.argv) == 2:
+	# The context manager closes the file handle even if json.load raises.
+	with open(sys.argv[1]) as json_file:
+		data = json.load(json_file)
+	print(pprint.pformat(data))
--- a/keywords.txt
+++ b/keywords.txt
@ -1,2 +1,2 @@
-GOOGL
-AAPL
+test
+water is blue
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "se-scraper",
-  "version": "1.0.5",
+  "version": "1.0.6",
  "description": "A simple module which uses puppeteer to scrape several search engines.",
  "main": "index.js",
  "scripts": {
--- a/results.json
+++ b/results.json
--- a/run.js
+++ b/run.js
@ -6,22 +6,24 @@ let config = {
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: false,
    // get meta data of scraping in return object
-    write_meta_data: 'true',
+    write_meta_data: false,
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,1]',
    // which search engine to scrape
-    search_engine: 'yahoo_news',
+    search_engine: 'google',
    // whether debug information should be printed
-    debug: 'true',
+    debug: true,
    // whether verbose program output should be printed
-    verbose: 'false',
+    verbose: false,
    // an array of keywords to scrape
-    keywords: ['GOOGL', ],
+    keywords: ['scrapeulous.com', ],
    // alternatively you can specify a keyword_file. this overwrites the keywords array
-    keyword_file: './keywords.txt',
+    keyword_file: '',
    // whether to start the browser in headless mode
    headless: false,
+    // path to output file, data will be stored in JSON
+    output_file: 'data.json',
 };

 se_scraper.scrape(config, (err, response) => {
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@ -1,5 +1,6 @@
 const puppeteer = require('puppeteer');
 const zlib = require('zlib');
+var fs = require('fs');

 // local module imports
 const google = require('./modules/google.js');
@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
 const tickersearch = require('./modules/ticker_search.js');

+
+/**
+ * Synchronously writes `data` to the file `fname` and logs a confirmation.
+ *
+ * fs.writeFileSync takes no callback (its third argument is an options
+ * value), so a failed write surfaces as a thrown exception rather than as an
+ * `err` argument, and the confirmation is logged right after the call returns.
+ *
+ * @param {string} fname - path of the file to write
+ * @param {string} data - content to write (the caller passes a JSON string)
+ * @throws {Error} whatever fs.writeFileSync throws when the write fails
+ */
+function write_results(fname, data) {
+	fs.writeFileSync(fname, data);
+	console.log(`Results written to file ${fname}`);
+}
+
 module.exports.handler = async function handler (event, context, callback) {

 	try {
@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) {
 			}
 		}

+		if (event.output_file) {
+			write_results(event.output_file, JSON.stringify(results));
+		}
+
 		let response = {
 		  headers: {
 		  	'Content-Type': 'text/json',
				`@ -0,0 +1 @@`
				{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. 
Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}