This commit is contained in:
Hugo 2020-06-17 17:30:55 +02:00 committed by GitHub
commit 6742e37310
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1051 additions and 34 deletions

View File

@ -13,13 +13,13 @@ class GoogleScraper extends Scraper {
const results = await this.page.evaluate(() => {
let _text = (el, s) => {
let _text = (el, s, onlyFirstTextNode) => {
let n = el.querySelector(s);
if (n) {
return n.innerText;
return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
} else {
return '';
return;
}
};
@ -29,7 +29,7 @@ class GoogleScraper extends Scraper {
if (n) {
return n.getAttribute(attr);
} else {
return null;
return;
}
};
@ -111,14 +111,14 @@ class GoogleScraper extends Scraper {
// parse right side product information
results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');
let title_el = document.querySelector('#rhs .cu-container g-review-stars');
let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
if (title_el) {
results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
results.right_info.title = title_el.innerText;
}
let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
if (num_reviews_el) {
results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
results.right_info.num_reviews = num_reviews_el.innerText;
}
results.right_info.vendors = [];
@ -127,20 +127,16 @@ class GoogleScraper extends Scraper {
document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
results.right_info.vendors.push({
price: _text(el, 'span:nth-of-type(1)'),
merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore
source_name: _text(el, 'span:nth-child(4) a'),
source_link: _attr(el, 'span:nth-child(4) a', 'href'),
info: _text(el, 'div span'),
shipping: _text(el, 'span:last-child > span'),
info: _text(el, '.SdBHnc.e2CF7c'),
shipping: _text(el, '.JfwJme'),
})
});
if (!results.right_info.title) {
results.right_info = {};
}
let right_side_info_el = document.getElementById('rhs');
if (right_side_info_el) {
@ -151,26 +147,19 @@ class GoogleScraper extends Scraper {
}
}
// parse top main column product information
// #tvcap .pla-unit
document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
// Parse Google Shopping top or left
document.querySelectorAll('.pla-unit').forEach((el) => {
let top_product = {
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
title: _text(el, '.pla-unit-title a:nth-child(2) span'),
price: _text(el, '.pla-unit-title + div'),
shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'),
price: _text(el, '.pla-unit-title + div', true),
originalPrice: _text(el, '.pla-unit-title + div > span'),
shipping: _text(el, '.pla-extensions-container .cYBBsb'),
vendor_link: _attr(el,'.pla-extensions-container a.FfKHB', 'href'),
merchant_name: _text(el,'.LbUacb span:nth-child(1)'),
};
let merchant_node = el.querySelector('.pla-unit-title');
if (merchant_node) {
let node = merchant_node.parentNode.querySelector('div > span');
if (node) {
top_product.merchant_name = node.innerText;
}
}
results.top_products.push(top_product);
});

View File

@ -31,8 +31,8 @@ module.exports = class Scraper {
this.proxy = config.proxy;
this.keywords = config.keywords;
this.STANDARD_TIMEOUT = 10000;
this.SOLVE_CAPTCHA_TIME = 45000;
this.STANDARD_TIMEOUT = config.standard_timeout;
this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;
this.results = {};
this.result_rank = 1;
@ -272,6 +272,12 @@ module.exports = class Scraper {
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}
if (this.config.keep_html_on_error){
const html_error = await this.page.content();
e.html_on_error = html_error;
e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
}
this.metadata.scraping_detected = await this.detected();
if (this.metadata.scraping_detected === true) {

View File

@ -139,6 +139,9 @@ class ScrapeManager {
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
keep_html_on_error: false,
standard_timeout: 10000,
solve_captcha_time: 45000,
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:

108
test/keep_html_on_error.js Normal file
View File

@ -0,0 +1,108 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
// Fake search engine used by this test file: it serves pre-recorded pages from test/mocks/google.
// The mock directory is resolved from __dirname once, so both the /search route and the static
// handler work regardless of the directory the test runner was started from.
const mocksDir = path.join(__dirname, 'mocks/google');
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    // `start` is divided by 10 to get a zero-based page index; a missing/invalid value means page 1.
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    // Mock files are named "<query>_page<N>.html".
    res.sendFile(path.join(mocksDir, `${req.query.q}_page${pageNumber}.html`));
});
fakeSearchEngine.use(express.static(mocksDir, {extensions: ['html']}));
describe('Config', function(){

    let httpServer, httpsServer, proxy;

    before(async function(){
        // Mount the fake engine on both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // Every request going through the proxy is redirected to the local fake engine,
        // whatever host was originally requested.
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('keep_html_on_error', function(){

        // Only errors are printed so the test output stays readable
        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test keep_html_on_error option: when scraping fails, the rejected error
         * must carry the HTML of the page in `error.html_on_error`.
         */
        it('html_output single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test error'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                keep_html_on_error: true,
                logger: testLogger,
                //clean_html_output: false,
                //clean_data_images: false,
                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
                // Short timeout; presumably so the expected failure is raised quickly
                standard_timeout: 500,
            });
            await scraper.start();

            try {
                await assert.rejects(
                    async () => {
                        await scraper.scrape(scrape_job);
                    },
                    (error) => {
                        assert(error.html_on_error, 'Error is containing the html output');
                        // The error is only accepted when its message mentions '#fbar'
                        return /#fbar/.test(error.message);
                    }
                );
            } finally {
                // Always shut the scraper down, even when the assertion above fails,
                // so a failing test does not leave a browser running.
                await scraper.quit();
            }
        });

    });

});

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
THIS IS A EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER

View File

@ -120,4 +120,150 @@ describe('Module Google', function(){
});
});
});
it('extract google shopping on right', async function () {
    // Keyword under test; the scraper results are indexed by this same string.
    const keyword = 'shopping';

    // Shopping products expected in `top_products` for the first result page.
    const expectedTopProducts = [
        {
            link: 'https://www.laboutiqueofficielle.com/achat-baskets-basses/classic-series-baskets-317-blanc-144046.html?referer=gshopping&LGWCODE=3010559970809;160079;7403',
            merchant_name: 'LaBoutiqueOffi...',
            price: '39,99 €',
            rank: 1,
            title: 'Classic Series - Baskets 317 Blanc',
            tracking_link: '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAEGgJsZQ&sig=AOD64_1OEdvZgHU2YEMPI4JNdeTqLJTVjw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEFU&adurl=',
            vendor_link: 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes',
        },
        {
            link: 'https://www.chausport.com/p/lacoste-carnaby-evo-noire-enfant-173257.html',
            merchant_name: 'Chausport',
            price: '45,00 €',
            rank: 2,
            title: 'Tennis Lacoste Carnaby Evo Noire Enfant 28',
            tracking_link: '/aclk?sa=L&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAFGgJsZQ&sig=AOD64_0lhZrLNYCENmxzquCMa5M4_D04ng&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEGA&adurl=',
            vendor_link: 'http://www.choozen.fr/nf/gs-cheap%20lacoste%20shoes.htm?kpartnerid=96955353',
        },
        {
            link: 'https://www.getthelabel.com/fr/p/lacoste-baskets-lerond-418/138256',
            merchant_name: 'GetTheLabel.c...',
            price: '44,99 €',
            rank: 3,
            title: 'Lacoste Baskets Lerond 418 Size 9 in Blanc pour Homme',
            tracking_link: '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAIGgJsZQ&sig=AOD64_13MoA9It0w-yp3GqriMf13OPLI8w&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEG0&adurl=',
            vendor_link: 'https://highstreetone.com/?search=cheap%20lacoste%20shoes',
        },
        {
            link: 'https://www.sarenza.com/lacoste-carnaby-evo-120-2-s834061-br918-t76-p0000227925#size=39-39',
            merchant_name: 'Sarenza',
            price: '45,50 €',
            originalPrice: '65 €',
            rank: 4,
            title: 'Lacoste Carnaby Evo 120 2 Blanc - Baskets - Disponible en 39',
            tracking_link: '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABANGgJsZQ&sig=AOD64_1Q6WUe8YXjhb-y_k0rErD2WUsTqQ&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEHk&adurl=',
            vendor_link: 'https://www.feed-price.com/search/cheap%20lacoste%20shoes',
        },
        {
            link: 'https://www.spartoo.com/Lacoste-CARNABY-EVO-BL-1-x4736301.php?track_id=adwo_fgl&sx=B&utm_source=froogle&utm_medium=comparateurs&utm_content=4736301&utm_campaign=adwo_fgl&size_id=158&fcsize=1&sx=B',
            merchant_name: 'Spartoo.com',
            price: '58,00 €',
            rank: 5,
            title: 'Lacoste CARNABY EVO BL 1 Baskets basses enfant (garcons)',
            tracking_link: '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAMGgJsZQ&sig=AOD64_0NfyG0tH5Pc7kPfADKcQflx78H1g&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEIcB&adurl=',
            vendor_link: 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes',
        },
        {
            link: 'https://www.nike.com/fr/t/nikecourt-royale-shoe-KyTwJwgV/749747-111',
            merchant_name: 'Nike Officiel',
            price: '55,00 €',
            rank: 6,
            title: 'Chaussure Nike Court Royale pour Homme - Blanc',
            tracking_link: '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABASGgJsZQ&sig=AOD64_2KQENuVGnvXutmSUufDSa4FnTYsw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEJIB&adurl=',
            vendor_link: 'https://www.pricesearcher.com/css/search/?p=1&q=cheap%20lacoste%20shoes&utm_source=google&utm_medium=css',
        },
    ];

    const googleScraper = new GoogleScraper({
        config: {
            search_engine_name: 'google',
            throw_on_detection: true,
            keywords: [keyword],
            logger: testLogger,
            scrape_from_file: '',
            num_pages: 1,
        }
    });
    googleScraper.STANDARD_TIMEOUT = 500;

    const {results, num_requests} = await googleScraper.run({page});
    const firstPage = results[keyword]['1'];

    assert.strictEqual(num_requests, 1, 'One request should be done');
    assert.strictEqual(firstPage.results.length, 10, 'Must have 10 organic results parsed on page 1');
    assert.deepEqual(firstPage.top_products, expectedTopProducts);
});
it('extract google shopping on top', async function () {
    // Keyword under test; the scraper results are indexed by this same string.
    const keyword = 'shopping 2';

    // Only the third shopping product (index 2) is compared in full.
    const expectedThirdProduct = {
        link: 'https://www.zalando.fr/lacoste-sideline-cub-chaussons-pour-bebe-whitegreen-la216f003-k11.html?size=17&allophones=0',
        merchant_name: 'Zalando.fr',
        price: '31,95 €',
        rank: 3,
        shipping: 'Livraison gratuite',
        title: 'Lacoste Sideline CUB Cadeau de naissance white/green, gender.kids.unisex, Taille: 17, Blanc - Imitation cuir/textile',
        tracking_link: '/aclk?sa=l&ai=DChcSEwjt7o3yj4nqAhVZhdUKHbshBNwYABASGgJ3cw&sig=AOD64_0usikwrH4jD5vqtbS7vVoCrNxMOg&ctype=5&q=&ved=2ahUKEwj0w4fyj4nqAhWZDGMBHY7HAzAQww96BAgOEFI&adurl=',
        vendor_link: 'https://fr.shoptail.eu/cheap%20lacoste%20shoes',
    };

    const googleScraper = new GoogleScraper({
        config: {
            search_engine_name: 'google',
            throw_on_detection: true,
            keywords: [keyword],
            logger: testLogger,
            scrape_from_file: '',
            num_pages: 1,
        }
    });
    googleScraper.STANDARD_TIMEOUT = 500;

    const {results, num_requests} = await googleScraper.run({page});
    const firstPage = results[keyword]['1'];

    assert.strictEqual(num_requests, 1, 'One request should be done');
    assert.strictEqual(firstPage.results.length, 10, 'Must have 10 organic results parsed on page 1');
    assert.deepEqual(firstPage.top_products[2], expectedThirdProduct);
});
it('shopping extract right one product', async function () {
    // Keyword under test; the scraper results are indexed by this same string.
    const keyword = 'shopping right product review';

    // Product card expected in `right_info`: title, review summary and the vendor offers.
    const expectedRightInfo = {
        title: 'Lacoste Lunettes',
        num_reviews: '146 avis',
        review: 'Note : 4,6 sur 5',
        vendors: [
            {
                info: '317 · 2807',
                merchant_ad_link: 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABACGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_0Wfsw3t3eO_yEtq8lWRIjiF6EqZw&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BAgNEH0&adurl=',
                merchant_name: 'Edel-Optics FR',
                price: '102,75 €',
                shipping: 'Livraison gratuite',
                source_link: 'https://www.google.com/search?tbm=shop&q=lacoste%20317',
                source_name: 'Par Google',
            },
            {
                info: '317 · 2805',
                merchant_ad_link: 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABADGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_2R4Idoiqc783K8OLyv9W9YQTJfog&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BQgNEIEB&adurl=',
                merchant_name: 'EasyLunettes.fr',
                price: '75,00 €',
                shipping: 'Livraison gratuite',
                source_link: 'https://producthero.com/?utm_source=google&utm_medium=css&q=lacoste%20317',
                source_name: 'Par Producthero',
            },
        ],
    };

    const googleScraper = new GoogleScraper({
        config: {
            search_engine_name: 'google',
            throw_on_detection: true,
            keywords: [keyword],
            logger: testLogger,
            scrape_from_file: '',
            num_pages: 1,
        }
    });
    googleScraper.STANDARD_TIMEOUT = 500;

    const {results, num_requests} = await googleScraper.run({page});
    const firstPage = results[keyword]['1'];

    assert.strictEqual(num_requests, 1, 'One request should be done');
    assert.strictEqual(firstPage.results.length, 9, 'Must have 9 organic results parsed on page 1');
    assert.deepEqual(firstPage.right_info, expectedRightInfo);
});
});

View File

@ -21,7 +21,7 @@ fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
debug('fake-search-engine req.hostname=%s', req.hostname);
//debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
setTimeout(() => res.send(req.hostname), 100); // Delay the response to avoid a race condition in the first test
});
describe('Config', function(){

122
test/scrape-manager.js Normal file
View File

@ -0,0 +1,122 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
// Fake search engine used by this test file: it serves pre-recorded pages from test/mocks/google.
// The mock directory is resolved from __dirname once, so both the /search route and the static
// handler work regardless of the directory the test runner was started from.
const mocksDir = path.join(__dirname, 'mocks/google');
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    // `start` is divided by 10 to get a zero-based page index; a missing/invalid value means page 1.
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    // Mock files are named "<query>_page<N>.html".
    res.sendFile(path.join(mocksDir, `${req.query.q}_page${pageNumber}.html`));
});
fakeSearchEngine.use(express.static(mocksDir, {extensions: ['html']}));
describe('ScrapeManager', function(){

    let httpServer, httpsServer, proxy;

    before(async function(){
        // Mount the fake engine on both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // Every request going through the proxy is redirected to the local fake engine,
        // whatever host was originally requested.
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('.quit()', function(){

        // Only errors are printed so the test output stays readable
        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test if quit correctly close all opened chrome
         */
        it('Ensure all chrome are closed after .quit() has been called', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test keyword'],
            };

            const scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
            });
            await scraper.start();

            try {
                await scraper.scrape(scrape_job);
            } finally {
                // Shut the scraper down even when scraping rejects, so a failing
                // test does not leave a browser running.
                await scraper.quit();
            }

            // TODO Check if all puppeteer chrome are stopped here
        });

        /**
         * Same scenario through the module-level scrape() helper, which is given the
         * configuration and the job in a single call (no explicit start()/quit() here).
         */
        it('Ensure all chrome are closed after .scrape() has been called on index module', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test keyword'],
            };

            await se_scraper.scrape({
                throw_on_detection: true,
                logger: testLogger,
                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
            }, scrape_job);

            // TODO Check if all puppeteer chrome are stopped here
        });

    });

});