fixed a little bug in cleaning (skip `img` elements that have no `src` attribute)

This commit is contained in:
Nikolai Tschacher 2019-08-12 17:16:37 +02:00
parent ca941cee45
commit 0f7e89c272
4 changed files with 8 additions and 3 deletions

View File

@ -68,7 +68,10 @@
### 12.8.2019
- add static test case for bing [done]
- add options that minimize `html_output` flag: `clean_html_output` will remove all JS and CSS from the html
- add options that minimize the output produced by the `html_output` flag:
`clean_html_output` will remove all JS and CSS from the html
`clean_data_images` removes all data images from the html
[done]
### TODO:
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.4.2",
"version": "1.4.4",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

View File

@ -208,7 +208,8 @@ module.exports = class Scraper {
await this.page.evaluate(() => {
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
function(item) {
if (item.getAttribute('src').startsWith('data:')) {
let src = item.getAttribute('src');
if (src && src.startsWith('data:')) {
item.setAttribute('src', '');
}
});

View File

@ -428,6 +428,7 @@ class ScrapeManager {
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
if (this.config.compress) {
log(this.config, 1, 'Compressing results');
results = JSON.stringify(results);
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
results = zlib.deflateSync(results).toString('base64');