added google shopping results

This commit is contained in:
Nikolai Tschacher 2019-07-11 16:42:01 +02:00
parent a413cb54ef
commit dab25f9068
15 changed files with 161 additions and 97 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 331 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

View File

@ -4,7 +4,7 @@ const se_scraper = require('./../src/node_scraper.js');
let browser_config = {
debug_level: 1,
test_evasion: false,
headless: true,
headless: false,
block_assets: false,
random_user_agent: false,
log_http_headers: false,
@ -12,8 +12,8 @@ const se_scraper = require('./../src/node_scraper.js');
};
let scrape_job = {
search_engine: 'bing',
keywords: ['auto verkaufen'],
search_engine: 'google_shopping',
keywords: ['wasserpistole'],
num_pages: 1,
};

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.3.14",
"version": "1.3.15",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

View File

@ -66,13 +66,7 @@ class AmazonScraper extends Scraper {
let effective_query = $('[data-component-type="s-result-info-bar"] span.a-text-bold').text() || '';
const cleaned = [];
for (var res of results) {
if (res.link && res.link.trim() && res.title && res.title.trim() && res.price && res.price.trim() && res.stars.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link', 'price', 'stars']);
return {
time: (new Date()).toUTCString(),

View File

@ -17,14 +17,7 @@ class BaiduScraper extends Scraper {
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['link']);
return {
time: (new Date()).toUTCString(),

View File

@ -38,14 +38,7 @@ class BingScraper extends Scraper {
let effective_query = $('#sp_requery a').first().text() || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),
@ -133,14 +126,7 @@ class BingNewsScraper extends Scraper {
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),

View File

@ -6,7 +6,7 @@ function log(config, loglevel, msg = null, cb = null) {
if (loglevel <= config.debug_level) {
if (msg) {
if (typeof msg == 'object') {
console.dir(msg, {depth: null, colors: true});
console.dir(msg, {depth: null, colors: false});
} else {
console.log('[i] ' + msg);
}

View File

@ -31,14 +31,7 @@ class DuckduckgoScraper extends Scraper {
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),

View File

@ -75,14 +75,7 @@ class GoogleScraper extends Scraper {
effective_query = $('#fprs a').text()
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),
@ -184,14 +177,7 @@ class GoogleNewsOldScraper extends Scraper {
effective_query = $('#fprs a').text()
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['link']);
return {
time: (new Date()).toUTCString(),
@ -274,15 +260,7 @@ class GoogleImageScraper extends Scraper {
effective_query = $('#fprs a').text();
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.link.trim().length > 10) {
res.link = res.link.trim();
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['link']);
return {
time: (new Date()).toUTCString(),
@ -371,14 +349,7 @@ class GoogleNewsScraper extends Scraper {
let effective_query = $('#fprsl').text() || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title',]);
return {
time: (new Date()).toUTCString(),
@ -608,6 +579,118 @@ class GoogleMapsScraper extends Scraper {
}
/*
    Scraper for Google Shopping result pages.

    Construction is inherited unchanged from Scraper: all constructor
    arguments are forwarded to the base class.
*/
class GoogleShoppingScraper extends Scraper {

    /*
        Extract the shopping results from the html of a result page.

        Returns an object with:
          time         - UTC timestamp string of when the page was parsed
          no_results   - whatever this.no_results() reports for the text of #main
          results      - list layout entries that passed
                         this.clean_results(results, ['title', 'link'])
          grid_results - grid layout entries exactly as extracted; they are
                         NOT passed through clean_results
    */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // results rendered in the list layout
        const results = [];
        $('.sh-dlr__list-result').each((i, link) => {
            const item = $(link);
            results.push({
                price: item.find('.sh-dlr__content div:nth-child(2) span > span').text(),
                link: item.find('.sh-dlr__thumbnail a').attr('href'),
                title: item.find('div > div > a[data-what="1"]').text(),
                info1: item.find('.sh-dlr__content div:nth-child(2)').text(),
                info2: item.find('.sh-dlr__content div:nth-child(3)').text(),
                info3: item.find('.sh-dlr__content div:nth-child(4)').text(),
            });
        });

        // results rendered in the grid layout
        const grid_results = [];
        $('.sh-pr__product-results-grid .sh-dgr__grid-result').each((i, link) => {
            const item = $(link);
            grid_results.push({
                price: item.find('.sh-dgr__content div:nth-child(2) span').text(),
                link: item.find('.sh-dgr__content a').attr('href'),
                title: item.find('.sh-dgr__content a').text(),
                info: item.find('.sh-dgr__content').text(),
            });
        });

        // 'Ergebnisse für', 'Showing results for'
        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for'],
            $('#main').text()
        );

        const cleaned = this.clean_results(results, ['title', 'link']);

        return {
            time: (new Date()).toUTCString(),
            no_results: no_results,
            results: cleaned,
            grid_results: grid_results,
        };
    }

    /*
        Open the Google Shopping start page and wait for the search input.

        When this.config.google_settings is set, its google_domain (if any)
        replaces www.google.com and every other key is appended to the url as
        `key=value&`. Values are appended verbatim (no url-encoding).

        Returns true when input[name="q"] showed up within
        this.STANDARD_TIMEOUT, false otherwise.
    */
    async load_start_page() {
        const settings = this.config.google_settings;
        let startUrl = 'https://www.google.com/shopping?';

        if (settings) {
            if (settings.google_domain) {
                startUrl = `https://www.${settings.google_domain}/shopping?`;
            }

            for (const [key, value] of Object.entries(settings)) {
                if (key !== 'google_domain') {
                    startUrl += `${key}=${value}&`;
                }
            }
        }

        log(this.config, 1, 'Using startUrl: ' + startUrl);

        this.last_response = await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            // the search box never appeared: report failure instead of throwing
            return false;
        }

        return true;
    }

    /*
        Put the keyword into the search box and submit it with Enter.
    */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /*
        Click the link to the next result page.

        Returns false when there is no #pnnext element on the page,
        true after the link was clicked.
    */
    async next_page() {
        // page.$() resolves immediately (to null when nothing matches) and
        // takes no options, so there is no timeout to configure here
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /*
        Wait until #fbar is present, giving up after this.STANDARD_TIMEOUT.
    */
    async wait_for_results() {
        await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
    }

    /*
        Returns true when the page source contains 'detected unusual traffic'
        or the page title contains '/sorry/'.
    */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
    }
}
function clean_image_url(url) {
// Example:
@ -632,7 +715,9 @@ function clean_google_url(url) {
}
}
module.exports = {
GoogleShoppingScraper: GoogleShoppingScraper,
GoogleNewsOldScraper: GoogleNewsOldScraper,
GoogleScraper: GoogleScraper,
GoogleImageScraper: GoogleImageScraper,

View File

@ -100,14 +100,7 @@ class WebcrawlerNewsScraper extends Scraper {
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),

View File

@ -170,9 +170,6 @@ module.exports = class Scraper {
num_keywords: this.num_keywords,
num_requests: this.num_requests,
keyword: keyword,
page: this.page,
config: this.config,
context: this.context,
});
}
@ -318,6 +315,28 @@ module.exports = class Scraper {
return false;
}
/*
Throw away all elements that do not have data in the
specified attributes. Most be of value string.
*/
clean_results(results, attributes) {
const cleaned = [];
for (var res of results) {
let goodboy = true;
for (var attr of attributes) {
if (!res[attr] || !res[attr].trim()) {
goodboy = false;
break;
}
}
if (goodboy) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return cleaned;
}
parse(html) {
}

View File

@ -41,6 +41,7 @@ function getScraper(search_engine, args) {
google_news: google.GoogleNewsScraper,
google_image: google.GoogleImageScraper,
google_maps: google.GoogleMapsScraper,
google_shopping: google.GoogleShoppingScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
amazon: amazon.AmazonScraper,
@ -74,7 +75,7 @@ class ScrapeManager {
this.config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false,
// whether to select manual settings in visible mode
@ -183,7 +184,10 @@ class ScrapeManager {
if (fs.existsSync(this.config.custom_func)) {
try {
const PluggableClass = require(this.config.custom_func);
this.pluggable = new PluggableClass({config: this.config});
this.pluggable = new PluggableClass({
config: this.config,
context: this.context
});
} catch (exception) {
console.error(exception);
return false;
@ -223,7 +227,7 @@ class ScrapeManager {
user_agent = this.config.user_agent;
}
if (this.config.random_user_agent === true) {
if (this.config.random_user_agent) {
user_agent = ua.random_user_agent(this.config);
}
@ -423,17 +427,14 @@ class ScrapeManager {
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
if (this.config.compress === true) {
if (this.config.compress) {
results = JSON.stringify(results);
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
results = zlib.deflateSync(results).toString('base64');
}
if (this.pluggable && this.pluggable.handle_results) {
await this.pluggable.handle_results({
config: this.config,
results: results,
});
await this.pluggable.handle_results(results);
}
if (this.config.chunk_lines) {
@ -450,7 +451,7 @@ class ScrapeManager {
log(this.config, 2, metadata);
if (this.pluggable && this.pluggable.handle_metadata) {
await this.pluggable.handle_metadata({metadata: metadata, config: this.config});
await this.pluggable.handle_metadata(metadata);
}
if (this.config.output_file) {