mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 23:23:07 +01:00
fixed a little bug in cleaning
This commit is contained in:
parent
ca941cee45
commit
0f7e89c272
5
TODO.md
5
TODO.md
@ -68,7 +68,10 @@
|
||||
### 12.8.2019
|
||||
|
||||
- add static test case for bing [done]
|
||||
- add options that minimize `html_output` flag: `clean_html_output` will remove all JS and CSS from the html
|
||||
- add options that minimize the output of the `html_output` flag:
|
||||
`clean_html_output` will remove all JS and CSS from the html
|
||||
`clean_data_images` removes all data images from the html
|
||||
[done]
|
||||
|
||||
### TODO:
|
||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.4.2",
|
||||
"version": "1.4.4",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -208,7 +208,8 @@ module.exports = class Scraper {
|
||||
await this.page.evaluate(() => {
|
||||
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
|
||||
function(item) {
|
||||
if (item.getAttribute('src').startsWith('data:')) {
|
||||
let src = item.getAttribute('src');
|
||||
if (src && src.startsWith('data:')) {
|
||||
item.setAttribute('src', '');
|
||||
}
|
||||
});
|
||||
|
@ -428,6 +428,7 @@ class ScrapeManager {
|
||||
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
|
||||
|
||||
if (this.config.compress) {
|
||||
log(this.config, 1, 'Compressing results');
|
||||
results = JSON.stringify(results);
|
||||
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
|
||||
results = zlib.deflateSync(results).toString('base64');
|
||||
|
Loading…
Reference in New Issue
Block a user