mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-22 07:33:07 +01:00
added google shopping results
This commit is contained in:
parent
a413cb54ef
commit
dab25f9068
Binary file not shown.
Before Width: | Height: | Size: 182 KiB |
Binary file not shown.
Before Width: | Height: | Size: 331 KiB |
Binary file not shown.
Before Width: | Height: | Size: 91 KiB |
BIN
debug_se_scraper_google_french press.png
Normal file
BIN
debug_se_scraper_google_french press.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 58 KiB |
@ -4,7 +4,7 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
let browser_config = {
|
||||
debug_level: 1,
|
||||
test_evasion: false,
|
||||
headless: true,
|
||||
headless: false,
|
||||
block_assets: false,
|
||||
random_user_agent: false,
|
||||
log_http_headers: false,
|
||||
@ -12,8 +12,8 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
};
|
||||
|
||||
let scrape_job = {
|
||||
search_engine: 'bing',
|
||||
keywords: ['auto verkaufen'],
|
||||
search_engine: 'google_shopping',
|
||||
keywords: ['wasserpistole'],
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.3.14",
|
||||
"version": "1.3.15",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -66,13 +66,7 @@ class AmazonScraper extends Scraper {
|
||||
|
||||
let effective_query = $('[data-component-type="s-result-info-bar"] span.a-text-bold').text() || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var res of results) {
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim() && res.price && res.price.trim() && res.stars.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link', 'price', 'stars']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
|
@ -17,14 +17,7 @@ class BaiduScraper extends Scraper {
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
|
@ -38,14 +38,7 @@ class BingScraper extends Scraper {
|
||||
|
||||
let effective_query = $('#sp_requery a').first().text() || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
@ -133,14 +126,7 @@ class BingNewsScraper extends Scraper {
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
|
@ -6,7 +6,7 @@ function log(config, loglevel, msg = null, cb = null) {
|
||||
if (loglevel <= config.debug_level) {
|
||||
if (msg) {
|
||||
if (typeof msg == 'object') {
|
||||
console.dir(msg, {depth: null, colors: true});
|
||||
console.dir(msg, {depth: null, colors: false});
|
||||
} else {
|
||||
console.log('[i] ' + msg);
|
||||
}
|
||||
|
@ -31,14 +31,7 @@ class DuckduckgoScraper extends Scraper {
|
||||
|
||||
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
|
@ -75,14 +75,7 @@ class GoogleScraper extends Scraper {
|
||||
effective_query = $('#fprs a').text()
|
||||
}
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
@ -184,14 +177,7 @@ class GoogleNewsOldScraper extends Scraper {
|
||||
effective_query = $('#fprs a').text()
|
||||
}
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
@ -274,15 +260,7 @@ class GoogleImageScraper extends Scraper {
|
||||
effective_query = $('#fprs a').text();
|
||||
}
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.link.trim().length > 10) {
|
||||
res.link = res.link.trim();
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
@ -371,14 +349,7 @@ class GoogleNewsScraper extends Scraper {
|
||||
|
||||
let effective_query = $('#fprsl').text() || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title',]);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
@ -608,6 +579,118 @@ class GoogleMapsScraper extends Scraper {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Scraper for Google Shopping result pages.
 *
 * Relies on the Scraper base class for `this.page`, `this.config`,
 * `this.STANDARD_TIMEOUT`, `no_results()`, `clean_results()`,
 * `set_input_value()` and `sleep()`.
 */
class GoogleShoppingScraper extends Scraper {

    /**
     * Extract shopping results from the HTML of a result page.
     *
     * Two layouts are collected: list results (`.sh-dlr__list-result`) and
     * grid results (`.sh-dgr__grid-result`). List results are passed through
     * `clean_results()` and must have a non-empty `title` and `link`; grid
     * results are returned exactly as scraped.
     *
     * @param {string} html - page source to parse
     * @returns {{time: string, no_results: *, results: Array, grid_results: Array}}
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const results = [];
        $('.sh-dlr__list-result').each((i, element) => {
            const item = $(element);
            results.push({
                price: item.find('.sh-dlr__content div:nth-child(2) span > span').text(),
                link: item.find('.sh-dlr__thumbnail a').attr('href'),
                title: item.find('div > div > a[data-what="1"]').text(),
                info1: item.find('.sh-dlr__content div:nth-child(2)').text(),
                info2: item.find('.sh-dlr__content div:nth-child(3)').text(),
                info3: item.find('.sh-dlr__content div:nth-child(4)').text(),
            });
        });

        const grid_results = [];
        $('.sh-pr__product-results-grid .sh-dgr__grid-result').each((i, element) => {
            const item = $(element);
            grid_results.push({
                price: item.find('.sh-dgr__content div:nth-child(2) span').text(),
                link: item.find('.sh-dgr__content a').attr('href'),
                title: item.find('.sh-dgr__content a').text(),
                info: item.find('.sh-dgr__content').text(),
            });
        });

        // Phrases noted by the original author but not used as markers here:
        // 'Ergebnisse für', 'Showing results for'
        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for'],
            $('#main').text()
        );

        const cleaned = this.clean_results(results, ['title', 'link']);

        return {
            time: (new Date()).toUTCString(),
            no_results: no_results,
            results: cleaned,
            grid_results: grid_results,
        };
    }

    /**
     * Navigate to the Google Shopping start page and wait for the search box.
     *
     * When `config.google_settings` is set, its `google_domain` (if any)
     * replaces the default host and every other key is appended to the URL
     * as a `key=value&` pair.
     *
     * @returns {Promise<boolean>} false when the search input did not appear
     *     within STANDARD_TIMEOUT, true otherwise
     */
    async load_start_page() {
        const settings = this.config.google_settings;
        let startUrl = 'https://www.google.com/shopping?';

        if (settings) {
            if (settings.google_domain) {
                startUrl = `https://www.${settings.google_domain}/shopping?`;
            }

            // NOTE(review): values are appended verbatim (not URL-encoded),
            // so callers must pass values that are already URL-safe.
            for (const [key, value] of Object.entries(settings)) {
                if (key !== 'google_domain') {
                    startUrl += `${key}=${value}&`;
                }
            }
        }

        log(this.config, 1, 'Using startUrl: ' + startUrl);

        this.last_response = await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            // The search box never showed up; report failure to the caller.
            return false;
        }

        return true;
    }

    /**
     * Type the keyword into the search box and submit it with Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click the "next page" link if one exists.
     *
     * @returns {Promise<boolean>} false when there is no `#pnnext` element
     */
    async next_page() {
        // page.$ takes only a selector and resolves immediately (null when
        // nothing matches), so no timeout option is passed.
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Wait until the `#fbar` element is present, which this scraper uses as
     * the signal that the result page has loaded.
     */
    async wait_for_results() {
        await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
    }

    /**
     * Check whether Google appears to have blocked the scraper.
     *
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic', or the page title or URL contains
     *     '/sorry/'
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        // '/sorry/' is a URL path segment, so the URL is checked in addition
        // to the title the original code inspected.
        const url = this.page.url();
        return html.includes('detected unusual traffic')
            || title.includes('/sorry/')
            || url.includes('/sorry/');
    }
}
|
||||
|
||||
|
||||
|
||||
function clean_image_url(url) {
|
||||
// Example:
|
||||
@ -632,7 +715,9 @@ function clean_google_url(url) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
module.exports = {
|
||||
GoogleShoppingScraper: GoogleShoppingScraper,
|
||||
GoogleNewsOldScraper: GoogleNewsOldScraper,
|
||||
GoogleScraper: GoogleScraper,
|
||||
GoogleImageScraper: GoogleImageScraper,
|
||||
|
@ -100,14 +100,7 @@ class WebcrawlerNewsScraper extends Scraper {
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
|
@ -170,9 +170,6 @@ module.exports = class Scraper {
|
||||
num_keywords: this.num_keywords,
|
||||
num_requests: this.num_requests,
|
||||
keyword: keyword,
|
||||
page: this.page,
|
||||
config: this.config,
|
||||
context: this.context,
|
||||
});
|
||||
}
|
||||
|
||||
@ -318,6 +315,28 @@ module.exports = class Scraper {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
Throw away all elements that do not have data in the
|
||||
specified attributes. Most be of value string.
|
||||
*/
|
||||
clean_results(results, attributes) {
|
||||
const cleaned = [];
|
||||
for (var res of results) {
|
||||
let goodboy = true;
|
||||
for (var attr of attributes) {
|
||||
if (!res[attr] || !res[attr].trim()) {
|
||||
goodboy = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (goodboy) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
parse(html) {
|
||||
|
||||
}
|
||||
|
@ -41,6 +41,7 @@ function getScraper(search_engine, args) {
|
||||
google_news: google.GoogleNewsScraper,
|
||||
google_image: google.GoogleImageScraper,
|
||||
google_maps: google.GoogleMapsScraper,
|
||||
google_shopping: google.GoogleShoppingScraper,
|
||||
bing: bing.BingScraper,
|
||||
bing_news: bing.BingNewsScraper,
|
||||
amazon: amazon.AmazonScraper,
|
||||
@ -74,7 +75,7 @@ class ScrapeManager {
|
||||
|
||||
this.config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
|
||||
// if random_user_agent is set to True, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
// whether to select manual settings in visible mode
|
||||
@ -183,7 +184,10 @@ class ScrapeManager {
|
||||
if (fs.existsSync(this.config.custom_func)) {
|
||||
try {
|
||||
const PluggableClass = require(this.config.custom_func);
|
||||
this.pluggable = new PluggableClass({config: this.config});
|
||||
this.pluggable = new PluggableClass({
|
||||
config: this.config,
|
||||
context: this.context
|
||||
});
|
||||
} catch (exception) {
|
||||
console.error(exception);
|
||||
return false;
|
||||
@ -223,7 +227,7 @@ class ScrapeManager {
|
||||
user_agent = this.config.user_agent;
|
||||
}
|
||||
|
||||
if (this.config.random_user_agent === true) {
|
||||
if (this.config.random_user_agent) {
|
||||
user_agent = ua.random_user_agent(this.config);
|
||||
}
|
||||
|
||||
@ -423,17 +427,14 @@ class ScrapeManager {
|
||||
log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||
log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`);
|
||||
|
||||
if (this.config.compress === true) {
|
||||
if (this.config.compress) {
|
||||
results = JSON.stringify(results);
|
||||
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
|
||||
results = zlib.deflateSync(results).toString('base64');
|
||||
}
|
||||
|
||||
if (this.pluggable && this.pluggable.handle_results) {
|
||||
await this.pluggable.handle_results({
|
||||
config: this.config,
|
||||
results: results,
|
||||
});
|
||||
await this.pluggable.handle_results(results);
|
||||
}
|
||||
|
||||
if (this.config.chunk_lines) {
|
||||
@ -450,7 +451,7 @@ class ScrapeManager {
|
||||
log(this.config, 2, metadata);
|
||||
|
||||
if (this.pluggable && this.pluggable.handle_metadata) {
|
||||
await this.pluggable.handle_metadata({metadata: metadata, config: this.config});
|
||||
await this.pluggable.handle_metadata(metadata);
|
||||
}
|
||||
|
||||
if (this.config.output_file) {
|
||||
|
Loading…
Reference in New Issue
Block a user