diff --git a/README.md b/README.md
index 10fde80..30702f7 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 This node module supports scraping several search engines.

-Right now scraping for
+Right now scraping the following search engines

 * Google
 * Google News
@@ -17,6 +17,13 @@ Right now scraping for

 is supported.

+Additionally, **se-scraper** supports investment ticker search on the following sites:
+
+* Bloomberg
+* Reuters
+* CNBC
+* MarketWatch
+
 This module uses puppeteer. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 stars on GitHub.

 ### Technical Notes
@@ -38,8 +45,9 @@ npm install se-scraper

 Use se-scraper by calling it with a script such as the one below.

-```javascript
+```js
 const se_scraper = require('se-scraper');
+const resolve = require('path').resolve;

 let config = {
     // the user agent to scrape with
@@ -47,27 +55,35 @@
     // if random_user_agent is set to true, a random user agent is chosen
     random_user_agent: false,
     // get meta data of scraping in return object
-    write_meta_data: true,
+    write_meta_data: false,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
-    sleep_range: '[1,1]',
+    sleep_range: '',
     // which search engine to scrape
-    search_engine: 'yahoo_news',
+    search_engine: 'google',
     // whether debug information should be printed
-    debug: true,
+    // debug info is useful during development
+    debug: false,
     // whether verbose program output should be printed
+    // this output is informational
    verbose: false,
     // an array of keywords to scrape
-    keywords: ['GOOGL', ],
+    keywords: ['scrapeulous.com', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
-    keyword_file: './keywords.txt',
+    keyword_file: '',
     // whether to start the browser in headless mode
-    headless: false,
+    headless: true,
     // path to output file, data will be stored in JSON
-    output_file: 'results.json',
+    output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
     // will speed up scraping a great deal
-    block_assets: true
+    block_assets: true,
+    // path to js module that extends functionality
+    // this module should export the functions:
+    // get_browser, handle_metadata, close_browser
+    // must be an absolute path to the module
+    //custom_func: resolve('examples/pluggable.js'),
+    custom_func: '',
 };

 se_scraper.scrape(config, (err, response) => {
@@ -100,6 +116,11 @@ Supported options for the `search_engine` config key:
 'duckduckgo_news'
 'google_dr'
 'yahoo_news'
+// ticker search
+'bloomberg'
+'reuters'
+'cnbc'
+'marketwatch'
 ```

 Output for the above script on my laptop:
diff --git a/TODO.txt b/TODO.txt
index 55661b0..7899adb 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -12,6 +12,7 @@

 - Add functionality to block images and CSS from loading as described here:
   https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
+  https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/

 TODO:
 - add proxy support
diff --git a/data.json b/data.json
new file mode 100644
index 0000000..d96185f
--- /dev/null
+++ b/data.json
@@ -0,0 +1 @@
+{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 14:51:54 GMT","num_results":"Ungefähr 169 Ergebnisse (0,23 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":7},{"link":"https://www.youtube.com/channel/UCJs1Xei5LRefg9GwFYdYhOw","title":"Scrapeulous Scrapeulous - YouTube","snippet":"How to use scrapeulous.com - Duration: 3 minutes, 42 seconds. 32 minutes ago; 4 views. Introduction for https://scrapeulous.com. Show more. This item has ...","visible_link":"https://www.youtube.com/.../UCJs1Xei5LRefg9GwFYdYhOw","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docs","snippet":"23.12.2018 - 1.1 Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open source tool in the future. Some people ...","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9}]}}
\ No newline at end of file
diff --git a/examples/pluggable.js b/examples/pluggable.js
new file mode 100644
index 0000000..90c634a
--- /dev/null
+++ b/examples/pluggable.js
@@ -0,0 +1,41 @@
+module.exports = {
+    get_browser: get_browser,
+    handle_metadata: handle_metadata,
+    close_browser: close_browser
+};
+
+async function close_browser(browser) {
+    await browser.close();
+}
+
+async function handle_metadata() {
+    // silence
+}
+
+async function get_browser(launch_args) {
+    const puppeteer = require('puppeteer');
+
+    const ADDITIONAL_CHROME_FLAGS = [
+        '--no-sandbox',
+        '--disable-setuid-sandbox',
+        '--disable-dev-shm-usage',
+        '--disable-accelerated-2d-canvas',
+        '--disable-gpu',
+        '--window-size=1920x1080',
+        '--hide-scrollbars',
+        '--user-agent=Chrome',
+    ];
+
+    let custom_args = {
+        args: ADDITIONAL_CHROME_FLAGS,
+        headless: true,
+    };
+
+    // launch with the custom args defined above; the original draft
+    // launched with launch_args, which left custom_args unused
+    let browser = await puppeteer.launch(custom_args);
+
+    console.log('Loaded custom function get_browser()');
+
+    return browser;
+}
\ No newline at end of file
diff --git a/index.js b/index.js
index c881bc3..cb85a76 100644
--- a/index.js
+++ b/index.js
@@ -29,7 +29,11 @@ exports.scrape = async function(config, callback) {
     output_file: '',
     // whether to prevent images, css, fonts from being loaded
     // will speed up scraping a great deal
-    block_assets: true
+    block_assets: true,
+    // path to js module that extends functionality
+    // this module should export the functions:
+    // get_browser, handle_metadata, close_browser
+    custom_func: 'examples/pluggable.js',
 };

 for (var key in config) {
diff --git a/package.json b/package.json
index 45671ee..5a9942f 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,8 @@
 {
   "name": "se-scraper",
-  "version": "1.1.0",
+  "version": "1.1.1",
   "description": "A simple module which uses puppeteer to scrape several search engines.",
+  "homepage": "https://scrapeulous.com/",
   "main": "index.js",
   "scripts": {
     "test": "mocha"
diff --git a/run.js b/run.js
index 140fca4..0c71fc1 100644
--- a/run.js
+++ b/run.js
@@ -1,4 +1,5 @@
 const se_scraper = require('./index.js');
+const resolve = require('path').resolve;

 let config = {
     // the user agent to scrape with
@@ -9,24 +10,32 @@
     write_meta_data: false,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
-    sleep_range: '[1,1]',
+    sleep_range: '',
     // which search engine to scrape
-    search_engine: 'bing',
+    search_engine: 'google',
     // whether debug information should be printed
-    debug: true,
+    // debug info is useful during development
+    debug: false,
     // whether verbose program output should be printed
+    // this output is informational
     verbose: false,
     // an array of keywords to scrape
-    keywords: ['MSFT', ],
+    keywords: ['scrapeulous.com', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // whether to start the browser in headless mode
-    headless: false,
+    headless: true,
     // path to output file, data will be stored in JSON
     output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
     // will speed up scraping a great deal
-    block_assets: true
+    block_assets: true,
+    // path to js module that extends functionality
+    // this module should export the functions:
+    // get_browser, handle_metadata, close_browser
+    // must be an absolute path to the module
+    //custom_func: resolve('examples/pluggable.js'),
+    custom_func: '',
 };

 se_scraper.scrape(config, (err, response) => {
diff --git a/src/modules/functions.js b/src/modules/functions.js
index 35ef8db..8270caa 100644
--- a/src/modules/functions.js
+++ b/src/modules/functions.js
@@ -4,6 +4,7 @@ module.exports = {
     sleep: sleep,
     random_sleep: random_sleep,
     set_input_value: set_input_value,
+
 };

 async function set_input_value(page, selector, value) {
diff --git a/src/modules/google.js b/src/modules/google.js
index 99b97a7..637b25d 100644
--- a/src/modules/google.js
+++ b/src/modules/google.js
@@ -1,13 +1,6 @@
 const cheerio = require('cheerio');
 const sfunctions = require('./functions.js');

-/*
-    Scrape for dateranges:
-
-    https://www.google.com/search?lr=&hl=en&tbs=cdr:1,cd_min:1/1/2007,cd_max:1/1/2009&q=%22video+game%22+%22Catan%22&oq=%22video+game%22+%22Catan%22
-
- */
-
 module.exports = {
     scrape_google_news_old_pup: scrape_google_news_old_pup,
     scrape_google_pup: scrape_google_pup,
@@ -19,13 +12,6 @@ module.exports = {
 const STANDARD_TIMEOUT = 8000;
 const SOLVE_CAPTCHA_TIME = 45000;

-const setTextInputValue = async (page, selector, value) => {
-    await page.waitFor(selector);
-    await page.evaluate((value, selector) => {
-        return document.querySelector(selector).value = value;
-    }, value, selector);
-};
-
 async function scrape_google_pup(page, event, context) {
     await page.goto('https://www.google.com/');

@@ -51,7 +37,7 @@ async function scrape_google_pup(page, event, context) {
         // await input.click({ clickCount: 3 });
         // await sfunctions.sleep(50);
         //await input.type(keyword);
-        await setTextInputValue(page, `input[name="q"]`, keyword);
+        await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
         await sfunctions.sleep(50);
         await input.focus();
         await page.keyboard.press("Enter");
@@ -130,7 +116,7 @@ async function scrape_google_pup_dr(page, event, context) {
         // await input.click({ clickCount: 3 });
         // await sfunctions.sleep(50);
         // await input.type(keyword);
-        await setTextInputValue(page, `input[name="q"]`, keyword);
+        await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
         await sfunctions.sleep(50);
         await input.focus();
@@ -252,7 +238,7 @@ async function scrape_google_news_old_pup(page, event, context) {
         // overwrites last text in input
         // await input.click({ clickCount: 3 });
         // await input.type(keyword);
-        await setTextInputValue(page, `input[name="q"]`, keyword);
+        await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
         await sfunctions.sleep(50);
         await input.focus();
         await page.keyboard.press("Enter");
@@ -367,7 +353,7 @@ async function scrape_google_image_pup(page, event, context) {
         // overwrites last text in input
         // await input.click({ clickCount: 3 });
         // await input.type(keyword);
-        await setTextInputValue(page, `input[name="q"]`, keyword);
+        await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
         await sfunctions.sleep(50);
         await input.focus();
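Note on the google.js change above: all four call sites now use the shared `set_input_value` helper exported from `src/modules/functions.js` instead of the module-local `setTextInputValue`. The helper's body is not shown in this diff (only its signature appears in the functions.js hunk), but presumably it mirrors the removed arrow function, along the lines of this sketch:

```js
// Sketch of set_input_value in src/modules/functions.js, inferred from the
// removed setTextInputValue helper in google.js — the body is an assumption.
async function set_input_value(page, selector, value) {
    // wait until the element exists, then set its value inside the page context
    await page.waitFor(selector);
    await page.evaluate((value, selector) => {
        return document.querySelector(selector).value = value;
    }, value, selector);
}
```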
diff --git a/src/node_scraper.js b/src/node_scraper.js
index d7b4b22..654ebf1 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -13,7 +13,6 @@ const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
 const tickersearch = require('./modules/ticker_search.js');

-
 function write_results(fname, data) {
     fs.writeFileSync(fname, data, (err) => {
         if (err) throw err;
@@ -21,14 +20,23 @@ function write_results(fname, data) {
     });
 }

-module.exports.handler = async function handler (event, context, callback) {
+module.exports.handler = async function handler (config, context, callback) {
+
+    let custom_func = null;
+    if (config.custom_func && fs.existsSync(config.custom_func)) {
+        try {
+            custom_func = require(config.custom_func);
+        } catch (exception) {
+            console.error('Could not load custom_func module: ' + exception);
+        }
+    }

     try {
         const startTime = Date.now();
-        event = parseEventData(event);
-        if (event.debug === true) {
-            console.log(event);
+        config = parseEventData(config);
+        if (config.debug === true) {
+            console.log(config);
         }

         const ADDITIONAL_CHROME_FLAGS = [
@@ -44,12 +52,12 @@ module.exports.handler = async function handler (event, context, callback) {

         let USER_AGENT = '';

-        if (event.random_user_agent) {
+        if (config.random_user_agent) {
             USER_AGENT = ua.random_user_agent();
         }

-        if (event.user_agent) {
-            USER_AGENT = event.user_agent;
+        if (config.user_agent) {
+            USER_AGENT = config.user_agent;
         }

         if (USER_AGENT) {
@@ -58,23 +66,29 @@ module.exports.handler = async function handler (event, context, callback) {
             )
         }

-        if (event.debug === true) {
-            console.log("Chrome Flags: ", ADDITIONAL_CHROME_FLAGS);
-        }
-
-        browser = await puppeteer.launch({
+        let launch_args = {
             args: ADDITIONAL_CHROME_FLAGS,
-            headless: event.headless !== false,
-        });
+            headless: config.headless !== false,
+        };

-        if (event.log_http_headers === true) {
+        if (config.debug === true) {
+            console.log("Chrome Args: ", launch_args);
+        }
+
+        if (custom_func) {
+            browser = await custom_func.get_browser(launch_args);
+        } else {
+            browser = await puppeteer.launch(launch_args);
+        }
+
+        if (config.log_http_headers === true) {
             headers = await meta.get_http_headers(browser);
             console.dir(headers);
         }

         const page = await browser.newPage();

-        if (event.block_assets === true) {
+        if (config.block_assets === true) {
             await page.setRequestInterception(true);

             page.on('request', (req) => {
@@ -104,45 +118,53 @@
             reuters: tickersearch.scrape_reuters_finance_pup,
             cnbc: tickersearch.scrape_cnbc_finance_pup,
             marketwatch: tickersearch.scrape_marketwatch_finance_pup,
-        }[event.search_engine](page, event, context);
+        }[config.search_engine](page, config, context);

         let metadata = {};

-        if (event.write_meta_data === true) {
+        if (config.write_meta_data === true) {
             metadata = await meta.get_metadata(browser);
         }

-        await browser.close();
+        if (custom_func) {
+            await custom_func.close_browser(browser);
+        } else {
+            await browser.close();
+        }

-        let num_keywords = event.keywords.length || 0;
+        let num_keywords = config.keywords.length || 0;
         let timeDelta = Date.now() - startTime;
         let ms_per_keyword = timeDelta/num_keywords;
-        console.log(`Scraper took ${timeDelta}ms to scrape ${num_keywords} keywords.`);
-        console.log(`On average ms/keyword: ${ms_per_keyword}ms/keyword`);

-        if (event.verbose === true) {
+        if (config.verbose === true) {
+            console.log(`Scraper took ${timeDelta}ms to scrape ${num_keywords} keywords.`);
+            console.log(`On average ms/keyword: ${ms_per_keyword}ms/keyword`);
             console.dir(results, {depth: null, colors: true});
         }

-        if (event.compress === true) {
+        if (config.compress === true) {
             results = JSON.stringify(results);
             // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
             results = zlib.deflateSync(results).toString('base64');
         }

-        if (event.write_meta_data === true) {
-            metadata.id = `${event.job_name} ${event.chunk_lines}`;
-            metadata.chunk_lines = event.chunk_lines;
+        if (config.write_meta_data === true) {
+            metadata.id = `${config.job_name} ${config.chunk_lines}`;
+            metadata.chunk_lines = config.chunk_lines;
             metadata.elapsed_time = timeDelta.toString();
             metadata.ms_per_keyword = ms_per_keyword.toString();

-            if (event.verbose === true) {
+            if (config.verbose === true) {
                 console.log(metadata);
             }
+
+            if (custom_func) {
+                await custom_func.handle_metadata(metadata);
+            }
         }

-        if (event.output_file) {
-            write_results(event.output_file, JSON.stringify(results));
+        if (config.output_file) {
+            write_results(config.output_file, JSON.stringify(results));
         }

         let response = {
@@ -161,7 +183,7 @@
     }
 };

-function parseEventData(event) {
+function parseEventData(config) {

     function _bool(e) {
         e = String(e);
@@ -172,54 +194,54 @@
         }
     }

-    if (event.debug) {
-        event.debug = _bool(event.debug);
+    if (config.debug) {
+        config.debug = _bool(config.debug);
     }

-    if (event.verbose) {
-        event.verbose = _bool(event.verbose);
+    if (config.verbose) {
+        config.verbose = _bool(config.verbose);
     }

-    if (event.upload_to_s3) {
-        event.upload_to_s3 = _bool(event.upload_to_s3);
+    if (config.upload_to_s3) {
+        config.upload_to_s3 = _bool(config.upload_to_s3);
     }

-    if (event.write_meta_data) {
-        event.write_meta_data = _bool(event.write_meta_data);
+    if (config.write_meta_data) {
+        config.write_meta_data = _bool(config.write_meta_data);
     }

-    if (event.log_http_headers) {
-        event.log_http_headers = _bool(event.log_http_headers);
+    if (config.log_http_headers) {
+        config.log_http_headers = _bool(config.log_http_headers);
     }

-    if (event.compress) {
-        event.compress = _bool(event.compress);
+    if (config.compress) {
+        config.compress = _bool(config.compress);
    }

-    if (event.is_local) {
-        event.is_local = _bool(event.is_local);
+    if (config.is_local) {
+        config.is_local = _bool(config.is_local);
     }

-    if (event.max_results) {
-        event.max_results = parseInt(event.max_results);
+    if (config.max_results) {
+        config.max_results = parseInt(config.max_results);
     }

-    if (event.set_manual_settings) {
-        event.set_manual_settings = _bool(event.set_manual_settings);
+    if (config.set_manual_settings) {
+        config.set_manual_settings = _bool(config.set_manual_settings);
     }

-    if (event.block_assets) {
-        event.block_assets = _bool(event.block_assets);
+    if (config.block_assets) {
+        config.block_assets = _bool(config.block_assets);
     }

-    if (event.sleep_range) {
+    if (config.sleep_range) {
         // parse an array
-        event.sleep_range = eval(event.sleep_range);
+        config.sleep_range = eval(config.sleep_range);

-        if (event.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
+        if (config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
             throw "sleep_range is not a valid array of two integers.";
         }
     }

-    return event;
+    return config;
 }
\ No newline at end of file
diff --git a/test/tests.js b/test/tests.js
index 2f82843..bfd55af 100644
--- a/test/tests.js
+++ b/test/tests.js
@@ -32,7 +32,6 @@ async function tests() {
         console.log(`Testing ${se}...`);
         event.search_engine = se;
         await handler.handler(event, undefined, test_case);
-        await sleep(3000);
     }
 }

@@ -88,26 +87,19 @@ function test_case(err, response) {
     if (err) {
         console.error(err);
     } else {
-        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
         assert.equal(response.statusCode, 200, 'status code must be 200');

-        results = response.results;
-
-        for (kw in results) {
+        for (let key in response.results) {
+            let kw = response.results[key];
             // at least 6 results
-            assert.isAtLeast(results[kw].results.length, 6, 'results must have at least 6 links');
+            assert.isAtLeast(kw.results.length, 6, 'results must have at least 6 links');
+            assert.equal(kw.no_results, false, 'no results should be false');
+            assert.typeOf(kw.num_results, 'string', 'num_results must be a string');
+            assert.isAtLeast(kw.num_results.length, 5, 'num_results should be a string of at least 5 chars');
+            assert.typeOf(Date.parse(kw.time), 'number', 'time should be a valid date');

-            assert.equal(results[kw].no_results, false, 'no results should be false');
-
-            assert.typeOf(results[kw].num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(results[kw].num_results.length, 5, 'num_results should be a string of at least 5 chars');
-
-            assert.typeOf(Date.parse(results[kw].time), 'number', 'time should be a valid date');
-
-
-            for (k = 0; k < results[kw].results.length; k++) {
-                res = results[kw].results[k];
+            for (let res of kw.results) {
                 assert.isOk(res.link, 'link must be ok');
                 assert.typeOf(res.link, 'string', 'link must be string');
                 assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
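Taken together, the changes above add a pluggable browser layer: when `config.custom_func` points to a module exporting `get_browser`, `handle_metadata` and `close_browser`, the scraper delegates browser startup, metadata handling, and shutdown to that module instead of launching puppeteer itself. A minimal sketch of driving the new hook end to end follows — the config values are illustrative placeholders; only the keys and the callback shape shown in this diff are assumed:

```js
// Minimal sketch: run se-scraper with the pluggable browser module
// introduced in this diff. Keyword and path values are placeholders.
const se_scraper = require('se-scraper');
const resolve = require('path').resolve;

let config = {
    search_engine: 'google',
    keywords: ['scrapeulous.com'],
    output_file: 'data.json',
    // must be an absolute path to a module exporting
    // get_browser, handle_metadata and close_browser
    custom_func: resolve('examples/pluggable.js'),
};

se_scraper.scrape(config, (err, response) => {
    if (err) {
        console.error(err);
    }
    console.dir(response, { depth: null, colors: true });
});
```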