This commit is contained in:
Nikolai Tschacher 2019-01-26 20:15:19 +01:00
parent bab902e80a
commit b354e6918d
10 changed files with 83 additions and 28 deletions

View File

@ -47,22 +47,24 @@ let config = {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: true,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'yahoo_news', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: true,
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['GOOGL', ], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt', keyword_file: './keywords.txt',
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: false, headless: false,
// path to output file, data will be stored in JSON
output_file: 'results.json',
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

1
data.json Normal file
View File

@ -0,0 +1 @@
{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. 
Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}

View File

@ -0,0 +1,35 @@
const se_scraper = require('../index.js');
/**
 * Scrape the same keyword list with several search engines in sequence,
 * writing each engine's results to `<engine>.json`.
 */
async function multiple_search_engines() {
    const searchEnginesList = ['google', 'bing'];

    for (const searchEngine of searchEnginesList) {
        const config = {
            random_user_agent: true,
            write_meta_data: true,
            sleep_range: '[1,1]',
            search_engine: searchEngine,
            debug: false,
            verbose: false,
            // the list of keywords to scrape
            keywords: ['scrapeulous.com',],
            // whether to start the browser in headless mode
            headless: true,
            // each engine gets its own JSON output file
            output_file: `${searchEngine}.json`,
        };

        await se_scraper.scrape(config, (err, response) => {
            if (err) {
                // Bail out: `response` is not usable after an error —
                // the original fell through and crashed on response.results.
                console.error(err);
                return;
            }
            console.dir(response.results, {
                depth: null,
                colors: true,
            });
        });
    }
}

// Kick off the run; surface an unhandled rejection instead of swallowing it.
multiple_search_engines().catch(console.error);

View File

@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js');
var fs = require('fs'); var fs = require('fs');
var os = require("os"); var os = require("os");
exports.scrape = function(config, callback) { exports.scrape = async function(config, callback) {
// options for scraping // options for scraping
event = { event = {
// the user agent to scrape with // the user agent to scrape with
@ -10,19 +10,23 @@ exports.scrape = function(config, callback) {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// whether to select manual settings in visible mode // whether to select manual settings in visible mode
set_manual_settings: 'false', set_manual_settings: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: true,
log_http_headers: 'false', log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
compress: 'false', // compress compress: false, // compress
debug: 'false', debug: false,
verbose: 'false', verbose: false,
keywords: ['test'], keywords: ['test'],
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
output_file: '',
}; };
for (var key in config) { for (var key in config) {
@ -44,7 +48,7 @@ exports.scrape = function(config, callback) {
} }
} }
handler.handler(event, undefined, callback ); await handler.handler(event, undefined, callback );
}; };
function read_keywords_from_file(fname) { function read_keywords_from_file(fname) {
@ -55,10 +59,3 @@ function read_keywords_from_file(fname) {
}); });
return kws; return kws;
} }
/**
 * Asynchronously persist scrape results to disk.
 * Falls back to 'results.json' when no filename is supplied.
 *
 * @param {string} fname - destination path (optional)
 * @param {string} data - serialized results payload
 */
function write_results(fname, data) {
	const target = fname || 'results.json';
	fs.writeFile(target, data, (err) => {
		if (err) throw err;
		console.log('Results written to file');
	});
}

6
jformat.py Executable file
View File

@ -0,0 +1,6 @@
import pprint
import sys
import json
if len(sys.argv) == 2:
print(pprint.pformat(json.load(open(sys.argv[1]))))

View File

@ -1,2 +1,2 @@
GOOGL test
AAPL water is blue

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.5", "version": "1.0.6",
"description": "A simple module which uses puppeteer to scrape several search engines.", "description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {

File diff suppressed because one or more lines are too long

14
run.js
View File

@ -6,22 +6,24 @@ let config = {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'yahoo_news', search_engine: 'google',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: true,
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['GOOGL', ], keywords: ['scrapeulous.com', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt', keyword_file: '',
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: false, headless: false,
// path to output file, data will be stored in JSON
output_file: 'data.json',
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

View File

@ -1,5 +1,6 @@
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
const zlib = require('zlib'); const zlib = require('zlib');
var fs = require('fs');
// local module imports // local module imports
const google = require('./modules/google.js'); const google = require('./modules/google.js');
@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js'); const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js'); const tickersearch = require('./modules/ticker_search.js');
/**
 * Synchronously write scrape results to disk.
 *
 * Bug fix: fs.writeFileSync does not accept a callback — the original passed
 * one as the options argument, so the error check and success log were dead
 * code (and modern Node throws a TypeError on a function options value).
 * writeFileSync already throws on failure, so errors propagate to the caller.
 *
 * @param {string} fname - destination path
 * @param {string} data - serialized JSON payload
 */
function write_results(fname, data) {
    fs.writeFileSync(fname, data);
    console.log(`Results written to file ${fname}`);
}
module.exports.handler = async function handler (event, context, callback) { module.exports.handler = async function handler (event, context, callback) {
try { try {
@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) {
} }
} }
if (event.output_file) {
write_results(event.output_file, JSON.stringify(results));
}
let response = { let response = {
headers: { headers: {
'Content-Type': 'text/json', 'Content-Type': 'text/json',