fixed little bug in cleaning

2025-08-09 05:54:38 +02:00 · 2019-08-12 17:16:37 +02:00
parent ca941cee45
commit 0f7e89c272
4 changed files with 8 additions and 3 deletions
--- a/TODO.md
+++ b/TODO.md
@ -68,7 +68,10 @@
 ### 12.8.2019

 - add static test case for bing [done]
- add options that minimize `html_output` flag: `clean_html_output` will remove all JS and CSS from the html
+- add options that minimize `html_output` flag: 
+    `clean_html_output` will remove all JS and CSS from the html 
+    `clean_data_images` removes all data images from the html
+    [done]

 ### TODO:
 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "se-scraper",
-  "version": "1.4.2",
+  "version": "1.4.4",
  "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
  "homepage": "https://scrapeulous.com/",
  "main": "index.js",
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@ -208,7 +208,8 @@ module.exports = class Scraper {
                            await this.page.evaluate(() => {
                                Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
                                  function(item) {
-                                    if (item.getAttribute('src').startsWith('data:')) {
+                                    let src = item.getAttribute('src');
+                                    if (src && src.startsWith('data:')) {
                                        item.setAttribute('src', '');
                                    }
                                });
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@ -428,6 +428,7 @@ class ScrapeManager {
        log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);

        if (this.config.compress) {
+            log(this.config, 1, 'Compressing results');
            results = JSON.stringify(results);
            // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
            results = zlib.deflateSync(results).toString('base64');