diff --git a/debug_se_scraper_google_buy used car.png b/debug_se_scraper_google_buy used car.png deleted file mode 100644 index 8b4024f..0000000 Binary files a/debug_se_scraper_google_buy used car.png and /dev/null differ diff --git a/debug_se_scraper_google_cloud service.png b/debug_se_scraper_google_cloud service.png deleted file mode 100644 index a42249f..0000000 Binary files a/debug_se_scraper_google_cloud service.png and /dev/null differ diff --git a/debug_se_scraper_google_fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk.png b/debug_se_scraper_google_fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk.png deleted file mode 100644 index cd1922d..0000000 Binary files a/debug_se_scraper_google_fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk.png and /dev/null differ diff --git a/debug_se_scraper_google_french press.png b/debug_se_scraper_google_french press.png new file mode 100644 index 0000000..1f7b370 Binary files /dev/null and b/debug_se_scraper_google_french press.png differ diff --git a/examples/quickstart.js b/examples/quickstart.js index 5d9d9ed..a6f4896 100644 --- a/examples/quickstart.js +++ b/examples/quickstart.js @@ -4,7 +4,7 @@ const se_scraper = require('./../src/node_scraper.js'); let browser_config = { debug_level: 1, test_evasion: false, - headless: true, + headless: false, block_assets: false, random_user_agent: false, log_http_headers: false, @@ -12,8 +12,8 @@ const se_scraper = require('./../src/node_scraper.js'); }; let scrape_job = { - search_engine: 'bing', - keywords: ['auto verkaufen'], + search_engine: 'google_shopping', + keywords: ['wasserpistole'], num_pages: 1, }; diff --git a/package.json b/package.json index 09682fe..566f738 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.3.14", + "version": "1.3.15", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/amazon.js b/src/modules/amazon.js index 312c3c5..c50a96d 100644 --- a/src/modules/amazon.js +++ b/src/modules/amazon.js @@ -66,13 +66,7 @@ class AmazonScraper extends Scraper { let effective_query = $('[data-component-type="s-result-info-bar"] span.a-text-bold').text() || ''; - const cleaned = []; - for (var res of results) { - if (res.link && res.link.trim() && res.title && res.title.trim() && res.price && res.price.trim() && res.stars.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link', 'price', 'stars']); return { time: (new Date()).toUTCString(), diff --git a/src/modules/baidu.js b/src/modules/baidu.js index 52a1dbc..725a774 100644 --- a/src/modules/baidu.js +++ b/src/modules/baidu.js @@ -17,14 +17,7 @@ class BaiduScraper extends Scraper { }) }); - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['link']); return { time: (new Date()).toUTCString(), diff --git a/src/modules/bing.js b/src/modules/bing.js index 5698cfa..c68d001 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -38,14 +38,7 @@ class BingScraper extends Scraper { let effective_query = $('#sp_requery a').first().text() || ''; - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), @@ -133,14 +126,7 @@ class BingNewsScraper extends Scraper { }) }); - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), diff --git a/src/modules/common.js b/src/modules/common.js index b259094..dbcfe85 100644 --- a/src/modules/common.js +++ b/src/modules/common.js @@ -6,7 +6,7 @@ function log(config, loglevel, msg = null, cb = null) { if (loglevel <= config.debug_level) { if (msg) { if (typeof msg == 'object') { - console.dir(msg, {depth: null, colors: true}); + console.dir(msg, {depth: null, colors: false}); } else { console.log('[i] ' + msg); } diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index 9f1e581..ae7696f 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -31,14 +31,7 @@ class DuckduckgoScraper extends Scraper { let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || ''; - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), diff --git a/src/modules/google.js b/src/modules/google.js index 722679e..c72870d 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -75,14 +75,7 @@ class GoogleScraper extends Scraper { effective_query = $('#fprs a').text() } - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), @@ -184,14 +177,7 @@ class GoogleNewsOldScraper extends Scraper { effective_query = $('#fprs a').text() } - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['link']); return { time: (new Date()).toUTCString(), @@ -274,15 +260,7 @@ class GoogleImageScraper extends Scraper { effective_query = $('#fprs a').text(); } - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.link.trim().length > 10) { - res.link = res.link.trim(); - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['link']); return { time: (new Date()).toUTCString(), @@ -371,14 +349,7 @@ class GoogleNewsScraper extends Scraper { let effective_query = $('#fprsl').text() || ''; - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title',]); return { time: (new Date()).toUTCString(), @@ -608,6 +579,118 @@ class GoogleMapsScraper extends Scraper { } +class GoogleShoppingScraper extends Scraper { + + constructor(...args) { + super(...args); + } + + parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); + + const results = []; + $('.sh-dlr__list-result').each((i, link) => { + results.push({ + price: $(link).find('.sh-dlr__content div:nth-child(2) span > span').text(), + link: $(link).find('.sh-dlr__thumbnail a').attr('href'), + title: $(link).find('div > div > a[data-what="1"]').text(), + info1: $(link).find('.sh-dlr__content div:nth-child(2)').text(), + info2: $(link).find('.sh-dlr__content div:nth-child(3)').text(), + info3: $(link).find('.sh-dlr__content div:nth-child(4)').text(), + }) + }); + + const grid_results = []; + + $('.sh-pr__product-results-grid .sh-dgr__grid-result').each((i, link) => { + grid_results.push({ + price: $(link).find('.sh-dgr__content div:nth-child(2) span').text(), + link: $(link).find('.sh-dgr__content a').attr('href'), + title: $(link).find('.sh-dgr__content a').text(), + info: $(link).find('.sh-dgr__content').text(), + }) + }); + + // 'Ergebnisse für', 'Showing results for' + let no_results = this.no_results( + ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für', + 'No results found for'], + $('#main').text() + ); + + const cleaned = this.clean_results(results, ['title', 'link']); + + return { + time: (new Date()).toUTCString(), + no_results: no_results, + results: cleaned, + grid_results: grid_results, + } + + } + + async load_start_page() { + let startUrl = 'https://www.google.com/shopping?'; + + if (this.config.google_settings) { + startUrl = `https://www.${this.config.google_settings.google_domain}/shopping?q=`; + if (this.config.google_settings.google_domain) { + startUrl = `https://www.${this.config.google_settings.google_domain}/shopping?`; + } else { + startUrl = `https://www.google.com/shopping?`; + } + + for (var key in this.config.google_settings) { + if (key !== 'google_domain') { + startUrl += `${key}=${this.config.google_settings[key]}&` + } + } + } + + log(this.config, 1, 'Using startUrl: ' + startUrl); + + this.last_response = await this.page.goto(startUrl); + + try { + await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); + } catch (e) { + return false; + } + + return true; + } + + async search_keyword(keyword) { + const input = await this.page.$('input[name="q"]'); + await this.set_input_value(`input[name="q"]`, keyword); + await this.sleep(50); + await input.focus(); + await this.page.keyboard.press("Enter"); + } + + async next_page() { + let next_page_link = await this.page.$('#pnnext', {timeout: 1000}); + if (!next_page_link) { + return false; + } + await next_page_link.click(); + + return true; + } + + async wait_for_results() { + await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT }); + } + + async detected() { + const title = await this.page.title(); + let html = await this.page.content(); + return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; + } +} + + function clean_image_url(url) { // Example: @@ -632,7 +715,9 @@ function clean_google_url(url) { } } + module.exports = { + GoogleShoppingScraper: GoogleShoppingScraper, GoogleNewsOldScraper: GoogleNewsOldScraper, GoogleScraper: GoogleScraper, GoogleImageScraper: GoogleImageScraper, diff --git a/src/modules/infospace.js b/src/modules/infospace.js index 93661b6..c10c10c 100644 --- a/src/modules/infospace.js +++ b/src/modules/infospace.js @@ -100,14 +100,7 @@ class WebcrawlerNewsScraper extends Scraper { }); }); - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 4559643..c282e02 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -170,9 +170,6 @@ module.exports = class Scraper { num_keywords: this.num_keywords, num_requests: this.num_requests, keyword: keyword, - page: this.page, - config: this.config, - context: this.context, }); } @@ -318,6 +315,28 @@ module.exports = class Scraper { return false; } + /* + Throw away all elements that do not have data in the + specified attributes. Most be of value string. + */ + clean_results(results, attributes) { + const cleaned = []; + for (var res of results) { + let goodboy = true; + for (var attr of attributes) { + if (!res[attr] || !res[attr].trim()) { + goodboy = false; + break; + } + } + if (goodboy) { + res.rank = this.result_rank++; + cleaned.push(res); + } + } + return cleaned; + } + parse(html) { } diff --git a/src/node_scraper.js b/src/node_scraper.js index 17b383f..96a177c 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -41,6 +41,7 @@ function getScraper(search_engine, args) { google_news: google.GoogleNewsScraper, google_image: google.GoogleImageScraper, google_maps: google.GoogleMapsScraper, + google_shopping: google.GoogleShoppingScraper, bing: bing.BingScraper, bing_news: bing.BingNewsScraper, amazon: amazon.AmazonScraper, @@ -74,7 +75,7 @@ class ScrapeManager { this.config = { // the user agent to scrape with - user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, // whether to select manual settings in visible mode @@ -183,7 +184,10 @@ class ScrapeManager { if (fs.existsSync(this.config.custom_func)) { try { const PluggableClass = require(this.config.custom_func); - this.pluggable = new PluggableClass({config: this.config}); + this.pluggable = new PluggableClass({ + config: this.config, + context: this.context + }); } catch (exception) { console.error(exception); return false; @@ -223,7 +227,7 @@ class ScrapeManager { user_agent = this.config.user_agent; } - if (this.config.random_user_agent === true) { + if (this.config.random_user_agent) { user_agent = ua.random_user_agent(this.config); } @@ -423,17 +427,14 @@ class ScrapeManager { log(this.config, 1, `Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); log(this.config, 1, `On average ms/request: ${ms_per_request}ms/request`); - if (this.config.compress === true) { + if (this.config.compress) { results = JSON.stringify(results); // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding results = zlib.deflateSync(results).toString('base64'); } if (this.pluggable && this.pluggable.handle_results) { - await this.pluggable.handle_results({ - config: this.config, - results: results, - }); + await this.pluggable.handle_results(results); } if (this.config.chunk_lines) { @@ -450,7 +451,7 @@ class ScrapeManager { log(this.config, 2, metadata); if (this.pluggable && this.pluggable.handle_metadata) { - await this.pluggable.handle_metadata({metadata: metadata, config: this.config}); + await this.pluggable.handle_metadata(metadata); } if (this.config.output_file) {