mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 17:47:49 +02:00
fixed a little bug in cleaning
This commit is contained in:
parent
ca941cee45
commit
0f7e89c272
5
TODO.md
5
TODO.md
@ -68,7 +68,10 @@
|
|||||||
### 12.8.2019
|
### 12.8.2019
|
||||||
|
|
||||||
- add static test case for bing [done]
|
- add static test case for bing [done]
|
||||||
- add options that minimize `html_output` flag: `clean_html_output` will remove all JS and CSS from the html
|
- add options that minimize the output of the `html_output` flag:
|
||||||
|
`clean_html_output` will remove all JS and CSS from the html
|
||||||
|
`clean_data_images` removes all data images from the html
|
||||||
|
[done]
|
||||||
|
|
||||||
### TODO:
|
### TODO:
|
||||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.4.2",
|
"version": "1.4.4",
|
||||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -208,7 +208,8 @@ module.exports = class Scraper {
|
|||||||
await this.page.evaluate(() => {
|
await this.page.evaluate(() => {
|
||||||
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
|
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
|
||||||
function(item) {
|
function(item) {
|
||||||
if (item.getAttribute('src').startsWith('data:')) {
|
let src = item.getAttribute('src');
|
||||||
|
if (src && src.startsWith('data:')) {
|
||||||
item.setAttribute('src', '');
|
item.setAttribute('src', '');
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -428,6 +428,7 @@ class ScrapeManager {
|
|||||||
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
|
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
|
||||||
|
|
||||||
if (this.config.compress) {
|
if (this.config.compress) {
|
||||||
|
log(this.config, 1, 'Compressing results');
|
||||||
results = JSON.stringify(results);
|
results = JSON.stringify(results);
|
||||||
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
|
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
|
||||||
results = zlib.deflateSync(results).toString('base64');
|
results = zlib.deflateSync(results).toString('base64');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user