mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-03-13 12:38:16 +01:00
Merge 77b749e3fc
into 5a0eea201d
This commit is contained in:
commit
6742e37310
@ -13,13 +13,13 @@ class GoogleScraper extends Scraper {
|
||||
|
||||
const results = await this.page.evaluate(() => {
|
||||
|
||||
let _text = (el, s) => {
|
||||
let _text = (el, s, onlyFirstTextNode) => {
|
||||
let n = el.querySelector(s);
|
||||
|
||||
if (n) {
|
||||
return n.innerText;
|
||||
return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
|
||||
} else {
|
||||
return '';
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
@ -29,7 +29,7 @@ class GoogleScraper extends Scraper {
|
||||
if (n) {
|
||||
return n.getAttribute(attr);
|
||||
} else {
|
||||
return null;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
@ -111,14 +111,14 @@ class GoogleScraper extends Scraper {
|
||||
// parse right side product information
|
||||
results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');
|
||||
|
||||
let title_el = document.querySelector('#rhs .cu-container g-review-stars');
|
||||
let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
|
||||
if (title_el) {
|
||||
results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
|
||||
results.right_info.title = title_el.innerText;
|
||||
}
|
||||
|
||||
let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
|
||||
let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
|
||||
if (num_reviews_el) {
|
||||
results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
|
||||
results.right_info.num_reviews = num_reviews_el.innerText;
|
||||
}
|
||||
|
||||
results.right_info.vendors = [];
|
||||
@ -127,20 +127,16 @@ class GoogleScraper extends Scraper {
|
||||
document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
|
||||
results.right_info.vendors.push({
|
||||
price: _text(el, 'span:nth-of-type(1)'),
|
||||
merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
|
||||
merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
|
||||
merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
|
||||
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
|
||||
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore
|
||||
source_name: _text(el, 'span:nth-child(4) a'),
|
||||
source_link: _attr(el, 'span:nth-child(4) a', 'href'),
|
||||
info: _text(el, 'div span'),
|
||||
shipping: _text(el, 'span:last-child > span'),
|
||||
info: _text(el, '.SdBHnc.e2CF7c'),
|
||||
shipping: _text(el, '.JfwJme'),
|
||||
})
|
||||
});
|
||||
|
||||
if (!results.right_info.title) {
|
||||
results.right_info = {};
|
||||
}
|
||||
|
||||
let right_side_info_el = document.getElementById('rhs');
|
||||
|
||||
if (right_side_info_el) {
|
||||
@ -151,26 +147,19 @@ class GoogleScraper extends Scraper {
|
||||
}
|
||||
}
|
||||
|
||||
// parse top main column product information
|
||||
// #tvcap .pla-unit
|
||||
document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
|
||||
// Parse Google Shopping top or left
|
||||
document.querySelectorAll('.pla-unit').forEach((el) => {
|
||||
let top_product = {
|
||||
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
|
||||
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
|
||||
title: _text(el, '.pla-unit-title a:nth-child(2) span'),
|
||||
price: _text(el, '.pla-unit-title + div'),
|
||||
shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
|
||||
vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'),
|
||||
price: _text(el, '.pla-unit-title + div', true),
|
||||
originalPrice: _text(el, '.pla-unit-title + div > span'),
|
||||
shipping: _text(el, '.pla-extensions-container .cYBBsb'),
|
||||
vendor_link: _attr(el,'.pla-extensions-container a.FfKHB', 'href'),
|
||||
merchant_name: _text(el,'.LbUacb span:nth-child(1)'),
|
||||
};
|
||||
|
||||
let merchant_node = el.querySelector('.pla-unit-title');
|
||||
if (merchant_node) {
|
||||
let node = merchant_node.parentNode.querySelector('div > span');
|
||||
if (node) {
|
||||
top_product.merchant_name = node.innerText;
|
||||
}
|
||||
}
|
||||
|
||||
results.top_products.push(top_product);
|
||||
});
|
||||
|
||||
|
@ -31,8 +31,8 @@ module.exports = class Scraper {
|
||||
this.proxy = config.proxy;
|
||||
this.keywords = config.keywords;
|
||||
|
||||
this.STANDARD_TIMEOUT = 10000;
|
||||
this.SOLVE_CAPTCHA_TIME = 45000;
|
||||
this.STANDARD_TIMEOUT = config.standard_timeout;
|
||||
this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;
|
||||
|
||||
this.results = {};
|
||||
this.result_rank = 1;
|
||||
@ -272,6 +272,12 @@ module.exports = class Scraper {
|
||||
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
|
||||
}
|
||||
|
||||
if (this.config.keep_html_on_error){
|
||||
const html_error = await this.page.content();
|
||||
e.html_on_error = html_error;
|
||||
e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
|
||||
}
|
||||
|
||||
this.metadata.scraping_detected = await this.detected();
|
||||
|
||||
if (this.metadata.scraping_detected === true) {
|
||||
|
@ -139,6 +139,9 @@ class ScrapeManager {
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: null,
|
||||
throw_on_detection: false,
|
||||
keep_html_on_error: false,
|
||||
standard_timeout: 10000,
|
||||
solve_captcha_time: 45000,
|
||||
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
|
||||
proxies: null,
|
||||
// a file with one proxy per line. Example:
|
||||
|
108
test/keep_html_on_error.js
Normal file
108
test/keep_html_on_error.js
Normal file
@ -0,0 +1,108 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const se_scraper = require('..');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
// Fake search engine: an Express app that answers Google-style search
// requests with the HTML fixtures stored under test/mocks/google.
const fakeSearchEngine = express();

// GET /search?q=<keyword>&start=<offset>
// Serves the fixture `<keyword>_page<N>.html`, where N = start / 10 + 1
// (page 1 when `start` is missing or not a number).
fakeSearchEngine.get('/search', (req, res) => {
    const { query } = req;
    debug('q=%s', query.q);
    const pageOffset = (query.start / 10) || 0;
    const pageNumber = pageOffset + 1;
    const fixturePath = `mocks/google/${query.q}_page${pageNumber}.html`;
    res.sendFile(path.join(__dirname, fixturePath));
});

// Every other path is looked up in the fixture directory, with `.html`
// registered as a fallback extension.
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
|
||||
|
||||
describe('Config', function(){

    let httpServer, httpsServer, proxy;

    // Mount the fake engine on both an http and an https server, and start a
    // MITM proxy that redirects every proxied request to those local servers.
    before(async function(){
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            // Whatever host the browser asked for, answer from the local fake engine.
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    // Guarded: if before() failed part-way, some of these were never created,
    // and an unguarded .close() would add a TypeError on top of the real failure.
    after(function(){
        if (httpsServer) {
            httpsServer.close();
        }
        if (httpServer) {
            httpServer.close();
        }
        if (proxy) {
            proxy.close();
        }
    });

    describe('keep_html_on_error', function(){

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the keep_html_on_error option: when a scrape fails, the thrown
         * error must carry the page HTML in `error.html_on_error`.
         */
        it('html_output single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test error'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                keep_html_on_error: true,
                logger: testLogger,
                //clean_html_output: false,
                //clean_data_images: false,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
                // Short timeout; presumably lets the scrape of the deliberately
                // empty 'test error' fixture fail quickly.
                standard_timeout: 500,
            });
            await scraper.start();
            try {
                await assert.rejects(
                    async () => {
                        await scraper.scrape(scrape_job);
                    },
                    (error) => {
                        assert(error.html_on_error, 'Error is containing the html output');
                        // The failure is expected to mention the '#fbar' selector.
                        return /#fbar/.test(error.message);
                    }
                );
            } finally {
                // Always shut the browser down, even when the assertion above
                // fails, so a red test does not leak Chrome processes.
                await scraper.quit();
            }

        });

    });

});
|
209
test/mocks/google/shopping 2_page1.html
Normal file
209
test/mocks/google/shopping 2_page1.html
Normal file
File diff suppressed because one or more lines are too long
220
test/mocks/google/shopping right product review_page1.html
Normal file
220
test/mocks/google/shopping right product review_page1.html
Normal file
File diff suppressed because one or more lines are too long
213
test/mocks/google/shopping_page1.html
Normal file
213
test/mocks/google/shopping_page1.html
Normal file
File diff suppressed because one or more lines are too long
1
test/mocks/google/test error_page1.html
Normal file
1
test/mocks/google/test error_page1.html
Normal file
@ -0,0 +1 @@
|
||||
THIS IS A EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER
|
@ -120,4 +120,150 @@ describe('Module Google', function(){
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
it('extract google shopping on right', function () {
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['shopping'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 1,
|
||||
}
|
||||
});
|
||||
googleScraper.STANDARD_TIMEOUT = 500;
|
||||
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'One request should be done');
|
||||
assert.strictEqual(results['shopping']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
||||
assert.deepEqual(results['shopping']['1'].top_products, [
|
||||
{
|
||||
'link': 'https://www.laboutiqueofficielle.com/achat-baskets-basses/classic-series-baskets-317-blanc-144046.html?referer=gshopping&LGWCODE=3010559970809;160079;7403',
|
||||
'merchant_name': 'LaBoutiqueOffi...',
|
||||
'price': '39,99 €',
|
||||
'rank': 1,
|
||||
'title': 'Classic Series - Baskets 317 Blanc',
|
||||
'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAEGgJsZQ&sig=AOD64_1OEdvZgHU2YEMPI4JNdeTqLJTVjw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEFU&adurl=',
|
||||
'vendor_link': 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes',
|
||||
},
|
||||
{
|
||||
'link': 'https://www.chausport.com/p/lacoste-carnaby-evo-noire-enfant-173257.html',
|
||||
'merchant_name': 'Chausport',
|
||||
'price': '45,00 €',
|
||||
'rank': 2,
|
||||
'title': 'Tennis Lacoste Carnaby Evo Noire Enfant 28',
|
||||
'tracking_link': '/aclk?sa=L&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAFGgJsZQ&sig=AOD64_0lhZrLNYCENmxzquCMa5M4_D04ng&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEGA&adurl=',
|
||||
'vendor_link': 'http://www.choozen.fr/nf/gs-cheap%20lacoste%20shoes.htm?kpartnerid=96955353',
|
||||
},
|
||||
{
|
||||
'link': 'https://www.getthelabel.com/fr/p/lacoste-baskets-lerond-418/138256',
|
||||
'merchant_name': 'GetTheLabel.c...',
|
||||
'price': '44,99 €',
|
||||
'rank': 3,
|
||||
'title': 'Lacoste Baskets Lerond 418 Size 9 in Blanc pour Homme',
|
||||
'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAIGgJsZQ&sig=AOD64_13MoA9It0w-yp3GqriMf13OPLI8w&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEG0&adurl=',
|
||||
'vendor_link': 'https://highstreetone.com/?search=cheap%20lacoste%20shoes',
|
||||
},
|
||||
{
|
||||
'link': 'https://www.sarenza.com/lacoste-carnaby-evo-120-2-s834061-br918-t76-p0000227925#size=39-39',
|
||||
'merchant_name': 'Sarenza',
|
||||
'price': '45,50 €',
|
||||
'originalPrice': '65 €',
|
||||
'rank': 4,
|
||||
'title': 'Lacoste Carnaby Evo 120 2 Blanc - Baskets - Disponible en 39',
|
||||
'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABANGgJsZQ&sig=AOD64_1Q6WUe8YXjhb-y_k0rErD2WUsTqQ&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEHk&adurl=',
|
||||
'vendor_link': 'https://www.feed-price.com/search/cheap%20lacoste%20shoes',
|
||||
},
|
||||
{
|
||||
'link': 'https://www.spartoo.com/Lacoste-CARNABY-EVO-BL-1-x4736301.php?track_id=adwo_fgl&sx=B&utm_source=froogle&utm_medium=comparateurs&utm_content=4736301&utm_campaign=adwo_fgl&size_id=158&fcsize=1&sx=B',
|
||||
'merchant_name': 'Spartoo.com',
|
||||
'price': '58,00 €',
|
||||
'rank': 5,
|
||||
'title': 'Lacoste CARNABY EVO BL 1 Baskets basses enfant (garcons)',
|
||||
'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAMGgJsZQ&sig=AOD64_0NfyG0tH5Pc7kPfADKcQflx78H1g&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEIcB&adurl=',
|
||||
'vendor_link': 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes',
|
||||
},
|
||||
{
|
||||
'link': 'https://www.nike.com/fr/t/nikecourt-royale-shoe-KyTwJwgV/749747-111',
|
||||
'merchant_name': 'Nike Officiel',
|
||||
'price': '55,00 €',
|
||||
'rank': 6,
|
||||
'title': 'Chaussure Nike Court Royale pour Homme - Blanc',
|
||||
'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABASGgJsZQ&sig=AOD64_2KQENuVGnvXutmSUufDSa4FnTYsw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEJIB&adurl=',
|
||||
'vendor_link': 'https://www.pricesearcher.com/css/search/?p=1&q=cheap%20lacoste%20shoes&utm_source=google&utm_medium=css',
|
||||
}
|
||||
])
|
||||
});
|
||||
});
|
||||
|
||||
it('extract google shopping on top', function () {
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['shopping 2'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 1,
|
||||
}
|
||||
});
|
||||
googleScraper.STANDARD_TIMEOUT = 500;
|
||||
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'One request should be done');
|
||||
assert.strictEqual(results['shopping 2']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
||||
assert.deepEqual(results['shopping 2']['1'].top_products[2], {
|
||||
"link": "https://www.zalando.fr/lacoste-sideline-cub-chaussons-pour-bebe-whitegreen-la216f003-k11.html?size=17&allophones=0",
|
||||
"merchant_name": "Zalando.fr",
|
||||
"price": "31,95 €",
|
||||
"rank": 3,
|
||||
'shipping': 'Livraison gratuite',
|
||||
"title": "Lacoste Sideline CUB Cadeau de naissance white/green, gender.kids.unisex, Taille: 17, Blanc - Imitation cuir/textile",
|
||||
"tracking_link": "/aclk?sa=l&ai=DChcSEwjt7o3yj4nqAhVZhdUKHbshBNwYABASGgJ3cw&sig=AOD64_0usikwrH4jD5vqtbS7vVoCrNxMOg&ctype=5&q=&ved=2ahUKEwj0w4fyj4nqAhWZDGMBHY7HAzAQww96BAgOEFI&adurl=",
|
||||
"vendor_link": "https://fr.shoptail.eu/cheap%20lacoste%20shoes",
|
||||
})
|
||||
});
|
||||
});
|
||||
|
||||
it('shopping extract right one product', function () {
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['shopping right product review'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 1,
|
||||
}
|
||||
});
|
||||
googleScraper.STANDARD_TIMEOUT = 500;
|
||||
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'One request should be done');
|
||||
assert.strictEqual(results['shopping right product review']['1'].results.length, 9, 'Must have 9 organic results parsed on page 1');
|
||||
assert.deepEqual(results['shopping right product review']['1'].right_info, {
|
||||
title: 'Lacoste Lunettes',
|
||||
'num_reviews': '146 avis',
|
||||
'review': 'Note : 4,6 sur 5',
|
||||
'vendors': [
|
||||
{
|
||||
'info': '317 · 2807',
|
||||
'merchant_ad_link': 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABACGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_0Wfsw3t3eO_yEtq8lWRIjiF6EqZw&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BAgNEH0&adurl=',
|
||||
'merchant_name': 'Edel-Optics FR',
|
||||
'price': '102,75 €',
|
||||
'shipping': 'Livraison gratuite',
|
||||
'source_link': 'https://www.google.com/search?tbm=shop&q=lacoste%20317',
|
||||
'source_name': 'Par Google',
|
||||
},
|
||||
{
|
||||
'info': '317 · 2805',
|
||||
'merchant_ad_link': 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABADGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_2R4Idoiqc783K8OLyv9W9YQTJfog&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BQgNEIEB&adurl=',
|
||||
'merchant_name': 'EasyLunettes.fr',
|
||||
'price': '75,00 €',
|
||||
'shipping': 'Livraison gratuite',
|
||||
'source_link': 'https://producthero.com/?utm_source=google&utm_medium=css&q=lacoste%20317',
|
||||
'source_name': 'Par Producthero',
|
||||
}
|
||||
]
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
|
@ -21,7 +21,7 @@ fakeSearchEngine.set('trust proxy', 'loopback');
|
||||
fakeSearchEngine.get('/test-proxy', (req, res) => {
|
||||
debug('fake-search-engine req.hostname=%s', req.hostname);
|
||||
//debug('req to', req.socket.localAddress, req.socket.localPort);
|
||||
res.send(req.hostname);
|
||||
setTimeout(() => res.send(req.hostname), 100); // Delay the response: works around a race condition in the first test
|
||||
});
|
||||
|
||||
describe('Config', function(){
|
||||
|
122
test/scrape-manager.js
Normal file
122
test/scrape-manager.js
Normal file
@ -0,0 +1,122 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const se_scraper = require('../');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
// Fake search engine: an Express app that answers Google-style search
// requests with the HTML fixtures stored under test/mocks/google.
const fakeSearchEngine = express();

// GET /search?q=<keyword>&start=<offset>
// Serves the fixture `<keyword>_page<N>.html`, where N = start / 10 + 1
// (page 1 when `start` is missing or not a number).
fakeSearchEngine.get('/search', (req, res) => {
    const { query } = req;
    debug('q=%s', query.q);
    const pageOffset = (query.start / 10) || 0;
    const pageNumber = pageOffset + 1;
    const fixturePath = `mocks/google/${query.q}_page${pageNumber}.html`;
    res.sendFile(path.join(__dirname, fixturePath));
});

// Every other path is looked up in the fixture directory, with `.html`
// registered as a fallback extension.
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
|
||||
|
||||
describe('ScrapeManager', function(){

    let httpServer, httpsServer, proxy;

    // Mount the fake engine on both an http and an https server, and start a
    // MITM proxy that redirects every proxied request to those local servers.
    before(async function(){
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            // Whatever host the browser asked for, answer from the local fake engine.
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    // Guarded: if before() failed part-way, some of these were never created,
    // and an unguarded .close() would add a TypeError on top of the real failure.
    after(function(){
        if (httpsServer) {
            httpsServer.close();
        }
        if (httpServer) {
            httpServer.close();
        }
        if (proxy) {
            proxy.close();
        }
    });

    describe('.quit()', function(){

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test that quit() correctly closes every Chrome instance that was opened.
         * Nothing is asserted yet beyond "no exception" — see the TODO below.
         */
        it('Ensure all chrome are closed after .quit() has been called', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test keyword'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
            });
            await scraper.start();
            try {
                await scraper.scrape(scrape_job);
            } finally {
                // quit() must run even when scrape() throws; otherwise this
                // test itself leaves Chrome processes behind.
                await scraper.quit();
            }

            // TODO Check if all puppeteer chrome are stopped here
        });

        /**
         * Same check through the module-level scrape() helper, which is given
         * the manager configuration and the job in a single call.
         */
        it('Ensure all chrome are closed after .scrape() has been called on index module', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test keyword'],
            };

            await se_scraper.scrape({
                throw_on_detection: true,
                logger: testLogger,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
            }, scrape_job);

            // TODO Check if all puppeteer chrome are stopped here

        });

    });

});
|
Loading…
Reference in New Issue
Block a user