diff --git a/data.json b/data.json index d96185f..cb7f202 100644 --- a/data.json +++ b/data.json @@ -1 +1 @@ -{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 14:51:54 GMT","num_results":"Ungefähr 169 Ergebnisse (0,23 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":7},{"link":"https://www.youtube.com/channel/UCJs1Xei5LRefg9GwFYdYhOw","title":"Scrapeulous Scrapeulous - YouTube","snippet":"How to use scrapeulous.com - Duration: 3 minutes, 42 seconds. 32 minutes ago; 4 views. Introduction for https://scrapeulous.com. Show more. This item has ...","visible_link":"https://www.youtube.com/.../UCJs1Xei5LRefg9GwFYdYhOw","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docs","snippet":"23.12.2018 - 1.1 Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open source tool in the future. Some people ...","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9}]}} \ No newline at end of file +{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 18:07:33 GMT","num_results":"Ungefähr 101 Ergebnisse","no_results":false,"effective_query":"","results":[]}} \ No newline at end of file diff --git a/examples/pluggable.js b/examples/pluggable.js index 90c634a..3fda333 100644 --- a/examples/pluggable.js +++ b/examples/pluggable.js @@ -1,39 +1,46 @@ -module.exports = { - get_browser: get_browser, - handle_metadata: handle_metadata, - close_browser: close_browser -}; +module.exports = class Pluggable { + constructor(options = {}) { + const { + chromeFlags = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-accelerated-2d-canvas', + '--disable-gpu', + '--window-size=1920x1080', + '--hide-scrollbars', + '--user-agent=Chrome', + ], + userAgent = 'Chrome', + headless = true, + } = options; -async function close_browser(browser) { - await browser.close(); -} + this.chromeFlags = chromeFlags; + this.userAgent = userAgent; + this.headless = headless; -async function handle_metadata() { - // silence -} + this.chromeFlags.push(this.userAgent); + } -async function get_browser(launch_args) { - const puppeteer = require('puppeteer'); + async close_browser() { + await this.browser.close(); + } - const ADDITIONAL_CHROME_FLAGS = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1920x1080', - '--hide-scrollbars', - '--user-agent=Chrome', - ]; + async handle_metadata(args) { + // silence + } - let custom_args = { - args: ADDITIONAL_CHROME_FLAGS, - headless: true, - }; + async start_browser(args={}) { + const puppeteer = require('puppeteer'); - browser = await puppeteer.launch(launch_args); + let launch_args = { + args: args.chromeFlags || this.chromeFlags, + headless: args.headless || this.headless, + }; - console.log('Loaded custom function get_browser()'); + this.browser = await puppeteer.launch(launch_args); + console.log('Loaded custom function get_browser()'); - return browser; -} \ No newline at end of file + return this.browser; + } +}; \ No newline at end of file diff --git a/run.js b/run.js index 0c71fc1..a2ca59e 100644 --- a/run.js +++ b/run.js @@ -35,7 +35,7 @@ let config = { // get_browser, handle_metadata, close_browser // must be an absolute path to the module //custom_func: resolve('examples/pluggable.js'), - custom_func: '', + custom_func: resolve('examples/pluggable.js'), }; se_scraper.scrape(config, (err, response) => { diff --git a/src/node_scraper.js b/src/node_scraper.js index 654ebf1..4584623 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -21,13 +21,13 @@ function write_results(fname, data) { } module.exports.handler = async function handler (config, context, callback) { - - custom_func = null; + pluggable = null; if (config.custom_func && fs.existsSync(config.custom_func)) { try { - custom_func = require(config.custom_func); + Pluggable = require(config.custom_func); + pluggable = new Pluggable(); } catch (exception) { - + console.error(exception); } } @@ -75,8 +75,9 @@ module.exports.handler = async function handler (config, context, callback) { console.log("Chrome Args: ", launch_args); } - if (custom_func) { - browser = await custom_func.get_browser(launch_args); + if (pluggable) { + launch_args.config = config; + browser = await pluggable.start_browser(launch_args); } else { browser = await puppeteer.launch(launch_args); } @@ -126,8 +127,8 @@ module.exports.handler = async function handler (config, context, callback) { metadata = await meta.get_metadata(browser); } - if (custom_func) { - await custom_func.close_browser(browser); + if (pluggable) { + await pluggable.close_browser(); } else { await browser.close(); } @@ -158,8 +159,8 @@ module.exports.handler = async function handler (config, context, callback) { console.log(metadata); } - if (custom_func) { - await custom_func.handle_metadata(metadata); + if (pluggable) { + await pluggable.handle_metadata({metadata: metadata, config: config}); } }