diff --git a/README.md b/README.md index 062877f..6fd2047 100644 --- a/README.md +++ b/README.md @@ -47,22 +47,24 @@ let config = { // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, // get meta data of scraping in return object - write_meta_data: 'true', + write_meta_data: true, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape search_engine: 'yahoo_news', // whether debug information should be printed - debug: 'true', + debug: true, // whether verbose program output should be printed - verbose: 'false', + verbose: false, // an array of keywords to scrape keywords: ['GOOGL', ], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: './keywords.txt', // whether to start the browser in headless mode headless: false, + // path to output file, data will be stored in JSON + output_file: 'results.json', }; se_scraper.scrape(config, (err, response) => { diff --git a/data.json b/data.json new file mode 100644 index 0000000..6a6cce5 --- /dev/null +++ b/data.json @@ -0,0 +1 @@ +{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. 
The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. 
If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}} \ No newline at end of file diff --git a/examples/multiple_search_engines.js b/examples/multiple_search_engines.js new file mode 100644 index 0000000..1e3ff78 --- /dev/null +++ b/examples/multiple_search_engines.js @@ -0,0 +1,35 @@ +const se_scraper = require('../index.js'); + +async function multiple_search_engines() { + + var searchEnginesList = ['google', 'bing']; + + for (let index = 0; index < searchEnginesList.length; index++) { + const searchEngine = searchEnginesList[index]; + let config = { + random_user_agent: true, + write_meta_data: true, + sleep_range: '[1,1]', + search_engine: searchEngine, + debug: false, + verbose: false, + // the list of keywords to scrape + keywords: ['scrapeulous.com',], + // whether to start the browser in headless mode + headless: true, + output_file: `${searchEngine}.json` + }; + + await se_scraper.scrape(config, (err, 
response) => { + if (err) { + console.error(err) + } + console.dir(response.results, { + depth: null, + colors: true + }); + }); + } +} + +multiple_search_engines(); \ No newline at end of file diff --git a/index.js b/index.js index bb3b587..920c5e9 100644 --- a/index.js +++ b/index.js @@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js'); var fs = require('fs'); var os = require("os"); -exports.scrape = function(config, callback) { +exports.scrape = async function(config, callback) { // options for scraping event = { // the user agent to scrape with @@ -10,19 +10,23 @@ exports.scrape = function(config, callback) { // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, // whether to select manual settings in visible mode - set_manual_settings: 'false', + set_manual_settings: false, // get meta data of scraping in return object - write_meta_data: 'true', - log_http_headers: 'false', + write_meta_data: true, + log_http_headers: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. 
sleep_range: '[1,1]', // which search engine to scrape search_engine: 'google', - compress: 'false', // compress - debug: 'false', - verbose: 'false', + compress: false, // compress + debug: false, + verbose: false, keywords: ['test'], + // whether to start the browser in headless mode + headless: true, + // path to output file, data will be stored in JSON + output_file: '', }; for (var key in config) { @@ -44,7 +48,7 @@ exports.scrape = function(config, callback) { } } - handler.handler(event, undefined, callback ); + await handler.handler(event, undefined, callback ); }; function read_keywords_from_file(fname) { @@ -55,10 +59,3 @@ function read_keywords_from_file(fname) { }); return kws; } - -function write_results(fname, data) { - fs.writeFile(fname || 'results.json', data, (err) => { - if (err) throw err; - console.log('Results written to file'); - }); -} \ No newline at end of file diff --git a/jformat.py b/jformat.py new file mode 100755 index 0000000..8414dd0 --- /dev/null +++ b/jformat.py @@ -0,0 +1,6 @@ +import pprint +import sys +import json + +if len(sys.argv) == 2: + print(pprint.pformat(json.load(open(sys.argv[1])))) \ No newline at end of file diff --git a/keywords.txt b/keywords.txt index 6dc8051..f48eb1e 100644 --- a/keywords.txt +++ b/keywords.txt @@ -1,2 +1,2 @@ -GOOGL -AAPL \ No newline at end of file +test +water is blue \ No newline at end of file diff --git a/package.json b/package.json index 221d869..44342c7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.5", + "version": "1.0.6", "description": "A simple module which uses puppeteer to scrape several search engines.", "main": "index.js", "scripts": { diff --git a/results.json b/results.json deleted file mode 100644 index 851b5e7..0000000 --- a/results.json +++ /dev/null @@ -1 +0,0 @@ -{"google scraper nikolait":{"time":"Sun, 23 Dec 2018 18:10:32 GMT","no_results":false,"effective_query":"","num_results":"7'510'000 
Ergebnisse","results":[{"link":"https://github.com/NikolaiT/GoogleScraper","title":"GitHub - NikolaiT/GoogleScraper: A …","snippet":"A Python module to scrape several search engines (like Google, Yandex, Bing, Duckduckgo, Baidu and others) by using proxies (socks4/5, http proxy) and with many ...","visible_link":"https://github.com/NikolaiT/GoogleScraper","rank":1},{"link":"https://github.com/NikolaiT/GoogleScraper/tree/master/GoogleScraper","title":"NikolaiT/GoogleScraper - GitHub","snippet":"A Python module to scrape several search engines (like Google, Yandex, Bing, Duckduckgo, Baidu and others) by using proxies (socks4/5, http proxy) and with many ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/tree/master/GoogleScraper","rank":2},{"link":"https://www.google.ch/","title":"Google","snippet":"Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.","visible_link":"https://www.google.ch","rank":3},{"link":"https://dataforseo.com/solutions/scraping-services","title":"Google scraping services, scrape …","snippet":"Need to scrape google search results? 
Get your TOP-100 results for any keyword with data for SEO.","visible_link":"https://dataforseo.com/solutions/scraping-services","rank":4},{"link":"https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn","title":"Web Scraper - Chrome Web Store - …","snippet":"Web site data extraction tool","visible_link":"https://chrome.google.com/webstore/detail/web-scraper/...","rank":5},{"link":"https://www.google.ch/?hl=fr&gws_rd=ssl","title":"Google","snippet":"Paramètres de recherche; Historique Web : Recherche avancée Outils linguistiques","visible_link":"https://www.google.ch/?hl=fr&gws_rd=ssl","rank":6},{"link":"https://chrome.google.com/webstore/detail/scraper/mbigbapnjcgaffohmbkdlecaccepngjd","title":"Scraper - Chrome Web Store - Google","snippet":"Scraper gets data out of web pages and into spreadsheets.","visible_link":"https://chrome.google.com/webstore/detail/scraper/...","rank":7},{"link":"https://maps.google.com/maps/contrib/102527402329858671234/photos","title":"Google Maps","snippet":"Find local businesses, view maps and get driving directions in Google Maps.","visible_link":"https://maps.google.com/maps/contrib/102527402329858671234/photos","rank":8},{"link":"https://news.google.com/?taa=1&hl=de&gl=CH&ceid=CH:de","title":"Google News","snippet":"Ausführliche und aktuelle Beiträge - von Google News aus verschiedenen Nachrichtenquellen aus aller Welt zusammengetragen","visible_link":"https://news.google.com/?taa=1&hl=de&gl=CH&ceid=CH:de","rank":9},{"link":"https://images.google.de/","title":"Google Images","snippet":"Google Images. 
The most comprehensive image search on the web.","visible_link":"https://images.google.de","rank":10}]},"mount everest":{"time":"Sun, 23 Dec 2018 18:10:33 GMT","no_results":false,"effective_query":"","num_results":"4'280'000 Ergebnisse","results":[{"link":"https://de.wikipedia.org/wiki/Mount_Everest","title":"Mount Everest – Wikipedia","snippet":"Der Mount Everest ist ein Berg im Himalaya und mit einer Höhe von 8848 m der höchste Berg der Erde. Er gehört zu den 14 Achttausendern und zu den Seven Summits.","visible_link":"https://de.wikipedia.org/wiki/Mount_Everest","rank":1},{"link":"http://www.mounteverest.ch/","title":"MountEverest.ch - Take the Challenge","snippet":"Steigen Sie jetzt symbolisch aufs Dach der Welt - mitten in der Schweiz. Leisten Sie diese 8848 Höhenmeter des Mount Everest jetzt bergwärts auf definierten ...","visible_link":"www.mounteverest.ch","rank":2},{"link":"https://en.wikipedia.org/wiki/Mount_Everest","title":"Mount Everest - Wikipedia","snippet":"","visible_link":"https://en.wikipedia.org/wiki/Mount_Everest","rank":3},{"link":"https://www.summitclimb.ch/de/highlights/mount-everest","title":"Mount Everest - Expeditionen, Besteigung! - SummitClimb","snippet":"Mount Everest, der Berg der Berge, ein attraktives Ziel: Trekking und Expeditionen. Begleite unsere Experten nach Nepal und Tibet zum höchsten Gipfel der Welt.","visible_link":"https://www.summitclimb.ch/de/highlights/mount-everest","rank":4},{"link":"https://de.wikipedia.org/wiki/Besteigungsgeschichte_des_Mount_Everest","title":"Besteigungsgeschichte des Mount Everest – Wikipedia","snippet":"Der Mount Everest ist als höchster Berg der Erde stets ein attraktives Ziel. 
Die ersten Besteigungsversuche wurden in den 1920er Jahren unternommen, jedoch dauerte ...","visible_link":"https://de.wikipedia.org/wiki/Besteigungsgeschichte_des_Mount_Everest","rank":5},{"link":"https://www.blick.ch/dossiers/mount-everest/","title":"Mount Everest - Blick","snippet":"Mount Everest im Überblick - Alle Schlagzeile, letzte Nachrichten, Archiv-Material, die besten Fotos und Videos. Blick.ch bietet Ihnen aktuelle Nachrichten und ...","visible_link":"https://www.blick.ch/dossiers/mount-everest","rank":6},{"link":"https://www.blick.ch/news/ausland/fast-300-bergsteiger-starben-am-mount-everest-die-leichen-sind-fuer-uns-wie-wegweiser-zum-gipfel-id7359570.html","title":"Tote Bergsteiger am Mount Everest als Wegweise - …","snippet":"30.09.2018 · Fast 300 Menschen fanden am Mount Everest, dem höchsten Berg der Welt, den Tod. So manche Leiche wurde nie geborgen. Das Vorhaben, alle Kletterer vom ...","visible_link":"https://www.blick.ch/news/ausland/fast-300-bergsteiger-starben-am...","rank":7},{"link":"http://www.spiegel.de/thema/mount_everest/","title":"Mount Everest - SPIEGEL ONLINE","snippet":"Deutschlands führende Nachrichtenseite. Alles Wichtige aus Politik, Wirtschaft, Sport, Kultur, Wissenschaft, Technik und mehr.","visible_link":"www.spiegel.de › Themen","rank":8}]},"incolumitas.com":{"time":"Sun, 23 Dec 2018 18:10:34 GMT","no_results":false,"effective_query":"","num_results":"9'230 Ergebnisse","results":[{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"Nikolai Tschacher's ideas and projects around IT security and computer science","visible_link":"https://incolumitas.com","rank":1},{"link":"https://incolumitas.com/2018/08/20/cryptographic-properties-mac-and-hmac/","title":"Coding, Learning and Business Ideas …","snippet":"20.08.2018 · Introduction Similarly as digital signatures, Message Authentication Codes provide message integrity and message authentication. 
When Alice generates a MAC ...","visible_link":"https://incolumitas.com/2018/08/20/cryptographic-properties-mac...","rank":2},{"link":"https://www.easycounter.com/report/incolumitas.com","title":"Incolumitas.com: Coding, Learning …","snippet":"Incolumitas.com is tracked by us since February, 2014. Over the time it has been ranked as high as 478 199 in the world, while most of its traffic comes from India ...","visible_link":"https://www.easycounter.com/report/incolumitas.com","rank":3},{"link":"http://www.incolumitas.es/","title":"www.incolumitas.es","snippet":"Nos estamos preparando… Síguenos en info @ incolumitas.es. 656 69 49 19","visible_link":"www.incolumitas.es","rank":4},{"link":"https://updates.easycounter.com/incolumitas.com","title":"Incolumitas (Incolumitas.com) - …","snippet":"We collected all of metadata history records for Incolumitas.com. Incolumitas has a medium sized description which rather positively influences the efficiency of ...","visible_link":"https://updates.easycounter.com/incolumitas.com","rank":5},{"link":"https://whois.easycounter.com/incolumitas.com","title":"Incolumitas.com whois history records","snippet":"The current Incolumitas.com owner and other personalities/entities that used to own this domain in the past are listed below.","visible_link":"https://whois.easycounter.com/incolumitas.com","rank":6},{"link":"https://legal-dictionary.thefreedictionary.com/incolumitas","title":"Incolumitas legal definition of …","snippet":"Disclaimer. All content on this website, including dictionary, thesaurus, literature, geography, and other reference data is for informational purposes only.","visible_link":"https://legal-dictionary.thefreedictionary.com/incolumitas","rank":7},{"link":"https://review.easycounter.com/incolumitas.com","title":"Incolumitas reviews and fraud and …","snippet":"We checked Incolumitas for scam and fraud. 
Our comprehensive Incolumitas.com review will show you if Incolumitas is legit and whether it is safe.","visible_link":"https://review.easycounter.com/incolumitas.com","rank":8}]}} \ No newline at end of file diff --git a/run.js b/run.js index 786ef5c..5a63796 100644 --- a/run.js +++ b/run.js @@ -6,22 +6,24 @@ let config = { // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, // get meta data of scraping in return object - write_meta_data: 'true', + write_meta_data: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'yahoo_news', + search_engine: 'google', // whether debug information should be printed - debug: 'true', + debug: true, // whether verbose program output should be printed - verbose: 'false', + verbose: false, // an array of keywords to scrape - keywords: ['GOOGL', ], + keywords: ['scrapeulous.com', ], // alternatively you can specify a keyword_file. 
this overwrites the keywords array - keyword_file: './keywords.txt', + keyword_file: '', // whether to start the browser in headless mode headless: false, + // path to output file, data will be stored in JSON + output_file: 'data.json', }; se_scraper.scrape(config, (err, response) => { diff --git a/src/node_scraper.js b/src/node_scraper.js index ddcc84a..5fc4534 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -1,5 +1,6 @@ const puppeteer = require('puppeteer'); const zlib = require('zlib'); +var fs = require('fs'); // local module imports const google = require('./modules/google.js'); @@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js'); const duckduckgo = require('./modules/duckduckgo.js'); const tickersearch = require('./modules/ticker_search.js'); + +/* Write data to the file fname synchronously; fs.writeFileSync throws if the write fails. */ +function write_results(fname, data) { + /* writeFileSync has no callback parameter, so the success message is logged after it returns. */ + fs.writeFileSync(fname, data); + console.log(`Results written to file ${fname}`); +} + module.exports.handler = async function handler (event, context, callback) { try { @@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) { } } + if (event.output_file) { + write_results(event.output_file, JSON.stringify(results)); + } + let response = { headers: { 'Content-Type': 'text/json',