From 0f7e89c2723faa7ea06e3230809dbdcbc8fe927d Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Mon, 12 Aug 2019 17:16:37 +0200 Subject: [PATCH] added little bug in cleaning --- TODO.md | 5 ++++- package.json | 2 +- src/modules/se_scraper.js | 3 ++- src/node_scraper.js | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/TODO.md b/TODO.md index 1ae2cd3..5976f9c 100644 --- a/TODO.md +++ b/TODO.md @@ -68,7 +68,10 @@ ### 12.8.2019 - add static test case for bing [done] -- add options that minimize `html_output` flag: `clean_html_output` will remove all JS and CSS from the html +- add options that minimize `html_output` flag: + `clean_html_output` will remove all JS and CSS from the html + `clean_data_images` removes all data images from the html + [done] ### TODO: 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] diff --git a/package.json b/package.json index 2832622..cb17210 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.4.2", + "version": "1.4.4", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 7e55553..8c61a03 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -208,7 +208,8 @@ module.exports = class Scraper { await this.page.evaluate(() => { Array.prototype.slice.call(document.getElementsByTagName('img')).forEach( function(item) { - if (item.getAttribute('src').startsWith('data:')) { + let src = item.getAttribute('src'); + if (src && src.startsWith('data:')) { item.setAttribute('src', ''); } }); diff --git a/src/node_scraper.js b/src/node_scraper.js index 597518c..e9df41a 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -428,6 +428,7 @@ class ScrapeManager { log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`); if (this.config.compress) { + log(this.config, 1, 'Compressing results'); results = JSON.stringify(results); // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding results = zlib.deflateSync(results).toString('base64');