Mirror of https://github.com/NikolaiT/se-scraper.git (synced 2025-06-30 14:20:00 +02:00)
feat: add keep_html_on_error option
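For orientation, a minimal usage sketch of the new option (hypothetical calling code; the ScrapeManager API and the html_on_error/lastUrl error fields come from the diff below):

const se_scraper = require('se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        keep_html_on_error: true,   // new option introduced by this commit
        throw_on_detection: true,
    });
    await scraper.start();
    try {
        await scraper.scrape({ search_engine: 'google', keywords: ['example'] });
    } catch (err) {
        // With keep_html_on_error enabled, the rethrown error carries debugging context:
        console.error('Scrape failed at', err.lastUrl);
        require('fs').writeFileSync('debug_error.html', err.html_on_error || '');
    } finally {
        await scraper.quit();
    }
})();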
@@ -272,6 +272,12 @@ module.exports = class Scraper {
                 await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
             }
 
+            if (this.config.keep_html_on_error){
+                const html_error = await this.page.content();
+                e.html_on_error = html_error;
+                e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
+            }
+
             this.metadata.scraping_detected = await this.detected();
 
             if (this.metadata.scraping_detected === true) {
@@ -139,8 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
             standard_timeout: 10000,
             solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
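Note the default of keep_html_on_error: false in the hunk above: capturing the page HTML on errors stays strictly opt-in, so existing configurations are unaffected.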
test/keep_html_on_error.js (new file, 108 lines)
@@ -0,0 +1,108 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('..');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here mount our fake engine in both http and https listen server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('keep_html_on_error', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test html_output option
+         */
+        it('html_output single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test error'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                keep_html_on_error: true,
+                logger: testLogger,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                standard_timeout: 500,
+            });
+            await scraper.start();
+            await assert.rejects(
+                async () => {
+                    await scraper.scrape(scrape_job);
+                },
+                (error) => {
+                    assert(error.html_on_error, 'Error is containing the html output');
+                    return /#fbar/.test(error.message);
+                }
+            )
+            await scraper.quit();
+
+        });
+
+    });
+
+});
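The new test drives a real scrape against a local fake search engine (routed through the MITM proxy above) and uses assert.rejects to check that the rejected error exposes html_on_error. Assuming mocha as the test runner (implied by the describe/it structure), it can presumably be run with something like "npx mocha test/keep_html_on_error.js", optionally with the DEBUG=se-scraper:test environment variable set to surface the debug logging.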
test/mocks/google/test error_page1.html (new file, 1 line)
@@ -0,0 +1 @@
+THIS IS A EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER