diff --git a/.gitignore b/.gitignore index a018916..0103c8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # ignore static tests -test/static_tests/ -test/static_tests/* +test/static_tests/html/ +test/static_tests/html/* .idea diff --git a/src/modules/google.js b/src/modules/google.js index da121cd..2f61b1c 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -1,4 +1,5 @@ 'use strict'; + const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); const common = require('./common.js'); @@ -10,161 +11,218 @@ class GoogleScraper extends Scraper { super(...args); } - parse(html) { - // load the page source into cheerio - const $ = cheerio.load(html); + async parse_async(html) { - const results = []; - $('#center_col .g').each((i, link) => { - let obj = { - link: $(link).find('.r a').attr('href'), - title: $(link).find('.r a h3').text(), - snippet: $(link).find('span.st').text(), - visible_link: $(link).find('.r cite').text(), - date: $(link).find('span.f').text() || '', + const results = await this.page.evaluate(() => { + + let _text = (el, s) => { + let n = el.querySelector(s); + + if (n) { + return n.innerText; + } else { + return ''; + } }; - if (obj.date) { - obj.date = obj.date.replace(' - ', ''); + let _attr = (el, s, attr) => { + let n = el.querySelector(s); + + if (n) { + return n.getAttribute(attr); + } else { + return null; + } + }; + + let results = { + num_results: '', + no_results: false, + effective_query: '', + right_info: {}, + results: [], + top_products: [], + right_products: [], + top_ads: [], + bottom_ads: [], + places: [], + }; + + let num_results_el = document.getElementById('resultStats'); + + if (num_results_el) { + results.num_results = num_results_el.innerText; } - results.push(obj); - }); + let organic_results = document.querySelectorAll('#center_col .g'); - // parse ads - let parseAds = (storage, selector) => { - $(selector).each((i, element) => { - let obj = { - visible_link: 
$(element).find('.ads-visurl cite').text(), - tracking_link: $(element).find('a:first-child').attr('href'), - link: $(element).find('a:nth-child(2)').attr('href'), - title: $(element).find('a h3').text(), - snippet: $(element).find('.ads-creative').text(), - links: [], + organic_results.forEach((el) => { + + let serp_obj = { + link: _attr(el, '.r a', 'href'), + title: _text(el, '.r a h3'), + snippet: _text(el, 'span.st'), + visible_link: _text(el, '.r cite'), + date: _text(el, 'span.f'), }; - $(element).find('ul li a').each((i, el) => { - obj.links.push({ - tracking_link: $(el).attr('data-arwt'), - link: $(el).attr('href'), - title: $(el).text(), - }) - }); - storage.push(obj); + + if (serp_obj.date) { + serp_obj.date = serp_obj.date.replace(' - ', ''); + } + + results.results.push(serp_obj); }); - }; - const top_ads = []; - const bottomads = []; + // check if no results + results.no_results = (results.results.length === 0); - parseAds(top_ads, '#tads .ads-ad'); - parseAds(bottomads, '#tadsb .ads-ad'); + let parseAds = (container, selector) => { + document.querySelectorAll(selector).forEach((el) => { + let ad_obj = { + visible_link: _text(el, '.ads-visurl cite'), + tracking_link: _attr(el, 'a:first-child', 'href'), + link: _attr(el, 'a:nth-child(2)', 'href'), + title: _text(el, 'a h3'), + snippet: _text(el, '.ads-creative'), + links: [], + }; + el.querySelectorAll('ul li a').forEach((node) => { + ad_obj.links.push({ + tracking_link: node.getAttribute('data-arwt'), + link: node.getAttribute('href'), + title: node.innerText, + }) + }); + container.push(ad_obj); + }); + }; - // parse google places - const places = []; - $('.rllt__link').each((i, element) => { - places.push({ - heading: $(element).find('[role="heading"] span').text(), - rating: $(element).find('.rllt__details div:first-child').text(), - contact: $(element).find('.rllt__details div:nth-child(2)').text(), - hours: $(element).find('.rllt__details div:nth-child(3)').text(), - }) + parseAds(results.top_ads, 
'#tads .ads-ad'); + parseAds(results.bottom_ads, '#tadsb .ads-ad'); + + // parse google places + document.querySelectorAll('.rllt__link').forEach((el) => { + results.places.push({ + heading: _text(el, '[role="heading"] span'), + rating: _text(el, '.rllt__details div:first-child'), + contact: _text(el, '.rllt__details div:nth-child(2)'), + hours: _text(el, '.rllt__details div:nth-child(3)'), + }) + }); + + // parse right side product information (review, title and num_reviews are stored directly on right_info) + results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label'); + + let title_el = document.querySelector('#rhs .cu-container g-review-stars'); + if (title_el) { + results.right_info.title = _text(title_el.parentNode, 'div:first-child'); + } + + let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars'); + if (num_reviews_el) { + results.right_info.num_reviews = _text(num_reviews_el.parentNode, 'div:nth-of-type(2)'); + } + + results.right_info.vendors = []; + results.right_info.info = _text(document, '#rhs_block > div > div > div > div:nth-child(5) > div > div'); + + document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => { + results.right_info.vendors.push({ + price: _text(el, 'span:nth-of-type(1)'), + merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'), + merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'), + merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), + source_name: _text(el, 'span:nth-child(4) a'), + source_link: _attr(el, 'span:nth-child(4) a', 'href'), + info: _text(el, 'div span'), + shipping: _text(el, 'span:last-child > span'), + }) + }); + // without a title there is no product panel on the right side: drop the partial right_info + if (!results.right_info.title) { + results.right_info = {}; + } + + let right_side_info_el = document.getElementById('rhs'); + + if (right_side_info_el) { + let right_side_info_text = right_side_info_el.innerText; + + if (right_side_info_text && 
right_side_info_text.length > 0) { + results.right_side_info_text = right_side_info_text; + } + } + + // parse top main column product information + // #tvcap .pla-unit + document.querySelectorAll('#tvcap .pla-unit').forEach((el) => { + let top_product = { + tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'), + link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'), + title: _text(el, '.pla-unit-title a:nth-child(2) span'), + price: _text(el, '.pla-unit-title + div'), + shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'), + vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'), + }; + + let merchant_node = el.querySelector('.pla-unit-title'); + if (merchant_node) { + let node = merchant_node.parentNode.querySelector('div > span'); + if (node) { + top_product.merchant_name = node.innerText; + } + } + + results.top_products.push(top_product); + }); + + // parse top right product information + // #rhs_block .pla-unit (vendor_link is the anchor's href, vendor_name its text) + document.querySelectorAll('#rhs_block .pla-unit').forEach((el) => { + let right_product = { + tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'), + link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'), + title: _text(el, '.pla-unit-title a:nth-child(2) span:first-child'), + price: _text(el,'.pla-unit-title + div'), + shipping: _text(el,'.pla-extensions-container > div'), + vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'), + vendor_name: _text(el,'.pla-extensions-container div > a > div'), + }; + + let merchant_node = el.querySelector('.pla-unit-title'); + if (merchant_node) { + let node = merchant_node.parentNode.querySelector('div > span:first-child'); + if (node) { + right_product.merchant_name = node.innerText; + } + } + + results.right_products.push(right_product); + }); + + let effective_query_el = document.getElementById('fprsl'); + + if (effective_query_el) { + results.effective_query = effective_query_el.innerText; + } + // fall back to the '#fprs a' link whenever '#fprsl' is missing or empty + if (!results.effective_query) { + let 
effective_query_el2 = document.querySelector('#fprs a'); + if (effective_query_el2) { + results.effective_query = effective_query_el2.innerText; + } + } + + return results; }); - // parse right side product information - var right_side_info = {}; - right_side_info.review = $('#rhs .cu-container g-review-stars span').attr('aria-label'); - right_side_info.title = $('#rhs .cu-container g-review-stars').parent().find('div:first-child').text(); - right_side_info.num_reviews = $('#rhs .cu-container g-review-stars').parent().find('div:nth-of-type(2)').text(); - right_side_info.vendors = []; - right_side_info.info = $('#rhs_block > div > div > div > div:nth-child(5) > div > div').text(); + // clean some results + results.top_products = this.clean_results(results.top_products, ['title', 'link']); + results.right_products = this.clean_results(results.right_products, ['title', 'link']); + results.results = this.clean_results(results.results, ['title', 'link' , 'snippet']); - $('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').each((i, element) => { - right_side_info.vendors.push({ - price: $(element).find('span:nth-of-type(1)').text(), - merchant_name: $(element).find('span:nth-child(3) a:nth-child(2)').text(), - merchant_ad_link: $(element).find('span:nth-child(3) a:first-child').attr('href'), - merchant_link: $(element).find('span:nth-child(3) a:nth-child(2)').attr('href'), - source_name: $(element).find('span:nth-child(4) a').text(), - source_link: $(element).find('span:nth-child(4) a').attr('href'), - info: $(element).find('div span').text(), - shipping: $(element).find('span:last-child > span').text(), - }) - }); - - if (!right_side_info.title) { - right_side_info = {}; - } - - let right_side_info_text = $('#rhs').text(); - - // parse top main column product information - // #tvcap .pla-unit - var top_products = []; - $('#tvcap .pla-unit').each((i, element) => { - top_products.push({ - tracking_link: 
$(element).find('.pla-unit-title a:first-child').attr('href'), - link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'), - title: $(element).find('.pla-unit-title a:nth-child(2) span').text(), - price: $(element).find('.pla-unit-title + div').text(), - merchant_name: $(element).find('.pla-unit-title').parent().find('div > span').text(), - shipping: $(element).find('.pla-extensions-container div:nth-of-type(1)').text(), - vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'), - }) - }); - - top_products = this.clean_results(top_products, ['title', 'link']); - - // parse top right product information - // #tvcap .pla-unit - var right_products = []; - $('#rhs_block .pla-unit').each((i, element) => { - right_products.push({ - tracking_link: $(element).find('.pla-unit-title a:first-child').attr('href'), - link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'), - title: $(element).find('.pla-unit-title a:nth-child(2) span:first-child').first().text(), - price: $(element).find('.pla-unit-title + div').text(), - merchant_name: $(element).find('.pla-unit-title').parent().find('div > span:first-child').text(), - shipping: $(element).find('.pla-extensions-container > div').text(), - vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'), - vendor_name: $(element).find('.pla-extensions-container div > a > div').text(), - }) - }); - - right_products = this.clean_results(right_products, ['title', 'link']); - - // 'Ergebnisse für', 'Showing results for' - let no_results = this.no_results( - ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für', - 'No results found for'], - $('#main').text() - ); - - let effective_query = $('#fprsl').text() || ''; - if (!effective_query) { - effective_query = $('#fprs a').text() - } - - const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']); - - let res_obj = { - time: (new Date()).toUTCString(), - 
num_results: $('#resultStats').text(), - no_results: no_results, - effective_query: effective_query, - right_info: right_side_info, - results: cleaned, - top_products: top_products, - right_products: right_products, - top_ads: top_ads, - bottom_ads: bottomads, - places: places, - }; - - if (right_side_info_text && right_side_info_text.length > 0) { - res_obj.right_side_info_text = right_side_info_text; - } - - return res_obj; + results.time = (new Date()).toUTCString(); + return results; } async load_start_page() { diff --git a/test/static_tests/README.md b/test/static_tests/README.md new file mode 100644 index 0000000..140c6d2 --- /dev/null +++ b/test/static_tests/README.md @@ -0,0 +1,15 @@ +## Test with static HTML + +Dynamic testing of se-scraper takes too much time. + +Save some HTML and initialize se-scraper by loading the search results from disk. + +### Disadvantage + +Static HTML gets outdated after some time. + +### Advantages + +1. Lets us test corner cases that are easily missed +2. Dynamic testing is not reliable, since search engines do not always return the same results for the same query; static HTML is reproducible +3. 
As said, much faster \ No newline at end of file diff --git a/test/static_tests/bing.js b/test/static_tests/bing.js new file mode 100644 index 0000000..bed1e16 --- /dev/null +++ b/test/static_tests/bing.js @@ -0,0 +1,168 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); + +async function bing_ads() { + let config = { + compress: false, + debug_level: 1, + headless: true, + }; + + let scrape_config = { + search_engine: 'bing', + keywords: ['kaffeemaschine kaufen'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + await scraper.start(); + + bing_search_with_ads( await scraper.scrape(scrape_config) ); + + scrape_config.keywords = ['best cloud services']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html'); + + bing_search_with_ads2( await scraper.scrape(scrape_config) ); + + scrape_config.keywords = ['car tires cheap']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html'); + + bing_search_with_ads3( await scraper.scrape(scrape_config) ); + + await scraper.quit(); +} + +// we test with a callback function to our handler +function bing_search_with_ads(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '1’100’000', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); + assert.isAtLeast(obj.ads.length, 
12, 'there are 12 ads'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function bing_search_with_ads2(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '44’300’000', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); + assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + +function bing_search_with_ads3(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '65.500.000 results', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the 
object'); + assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); + assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function confirm_results_ok(obj) { + + for (let res of obj.results) { + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + } + + for (let res of obj.ads) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link 
must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + } +} + +describe('Bing', function(){ + this.timeout(15000); + it('static bing searches with ads', bing_ads); +}); \ No newline at end of file diff --git a/test/static_tests/clean_html_test.js b/test/static_tests/clean_html_test.js new file mode 100644 index 0000000..6bbe4dc --- /dev/null +++ b/test/static_tests/clean_html_test.js @@ -0,0 +1,173 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); +const cheerio = require('cheerio'); + + +async function test_html_output() { + let config = { + debug_level: 1, + headless: true, + html_output: true, + // whether to strip JS and CSS from the html_output + // has only an effect if `html_output` is true + clean_html_output: true, + // remove all data images from the html + clean_data_images: true, + // test compression + compress: false, + }; + + let scrape_config = { + search_engine: 'bing', + keywords: ['kaffeemaschine kaufen'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + await scraper.start(); + + var response = await scraper.scrape(scrape_config); + + scrape_config.clean_html_output = false; + scrape_config.clean_data_images = false; + + var response_no_cleaned = await scraper.scrape(scrape_config); + + test(response, response_no_cleaned, 'bing'); + + scrape_config.search_engine = 'google'; + scrape_config.keywords = ['rückspiegel schwarz']; + scrape_config.scrape_from_file 
= 'file://' + path.join(__dirname, './html/google.html'); + scrape_config.clean_html_output = true; + scrape_config.clean_data_images = true; + + var responseGoogle = await scraper.scrape(scrape_config); + + scrape_config.clean_html_output = false; + scrape_config.clean_data_images = false; + + var response_no_cleanedGoogle = await scraper.scrape(scrape_config); + + test(responseGoogle, response_no_cleanedGoogle, 'google'); + + + scrape_config.keywords = ['cloud services']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html'); + scrape_config.clean_html_output = true; + scrape_config.clean_data_images = true; + + var responseGoogle = await scraper.scrape(scrape_config); + + scrape_config.clean_html_output = false; + scrape_config.clean_data_images = false; + + var response_no_cleanedGoogle = await scraper.scrape(scrape_config); + + test(responseGoogle, response_no_cleanedGoogle, 'google'); + + await scraper.quit(); +} + +function test(response, response_no_cleaned, se='google') { + for (let query in response.results) { + for (let page_number in response.results[query]) { + let obj = response.results[query][page_number]; + let obj_no_cleaned = response_no_cleaned.results[query][page_number]; + + console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length); + console.log('html length of cleaned SERP: ' + obj.html.length); + + assert.isOk(obj.html, 'Html must be ok!'); + assert.isAtLeast(obj.html.length, 100, 'html must be a length string'); + + assert.isOk(obj_no_cleaned.html, 'Html must be ok!'); + assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a length string'); + + assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller'); + + // test that we can parse the html of both the cleaned and no cleaned versions + // with cheerio and that serp results are roughly the same + + const cleaned$ = cheerio.load(obj.html); + const no_cleaned$ = 
cheerio.load(obj_no_cleaned.html); + + var resCleaned = parseResults(cleaned$, se); + var resNoCleaned = parseResults(no_cleaned$, se); + + assert.equal(resCleaned.length, resNoCleaned.length); + assert.equal(resCleaned.length, obj.results.length); + assert.equal(resNoCleaned.length, obj.results.length); + + // unset the rank + resCleaned = resCleaned.map((el) => el.rank = undefined); + resNoCleaned = resNoCleaned.map((el) => el.rank = undefined); + obj.results = obj.results.map((el) => el.rank = undefined); + + assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned'); + assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results'); + assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results'); + } + } +} + + +function parseResults(s$, se) { + + var results = []; + + if (se === 'google') { + s$('#center_col .g').each((i, link) => { + results.push({ + link: s$(link).find('.r a').attr('href'), + title: s$(link).find('.r a').text(), + snippet: s$(link).find('span.st').text(), + visible_link: s$(link).find('.r cite').text(), + date: s$(link).find('span.f').text() || '', + }) + }); + + } else if (se === 'bing') { + s$('#b_content #b_results .b_algo').each((i, link) => { + results.push({ + link: s$(link).find('h2 a').attr('href'), + title: s$(link).find('h2').text(), + snippet: s$(link).find('.b_caption p').text(), + visible_link: s$(link).find('cite').text(), + }) + }); + } else { + throw "no such search engine"; + } + + results = clean_results(results, ['title', 'link', 'snippet']); + return results; +} + +function clean_results(results, attributes) { + const cleaned = []; + var rank = 1; + for (var res of results) { + let goodboy = true; + for (var attr of attributes) { + if (!res[attr] || !res[attr].trim()) { + goodboy = false; + break; + } + } + if (goodboy) { + res.rank = rank++; + cleaned.push(res); + } + 
} + return cleaned; +} + +describe('html output', function(){ + this.timeout(15000); + it('static html output test', test_html_output); +}); \ No newline at end of file diff --git a/test/static_tests/compression.js b/test/static_tests/compression.js new file mode 100644 index 0000000..a41dba8 --- /dev/null +++ b/test/static_tests/compression.js @@ -0,0 +1,24 @@ +'use strict'; +const zlib = require('zlib'); +const fs = require('fs'); +const path = require('path'); + +var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html']; + +for (var file of files) { + var html = fs.readFileSync(path.resolve(__dirname, './html/' + file)); + + var compressed = zlib.gzipSync(html); + var deflated = zlib.deflateSync(html); + + var compressed_encoded = compressed.toString('base64'); + var deflated_encoded = deflated.toString('base64'); + + console.log(file) + console.log('Normal length: ' + html.length/1000); + console.log('GZIP Compressed length: ' + compressed.length/1000); + console.log('Deflate Compressed length: ' + deflated.length/1000); + console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length/1000); + console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length/1000); + console.log('------\n') +} diff --git a/test/static_tests/duckduckgo.js b/test/static_tests/duckduckgo.js new file mode 100644 index 0000000..f0f0834 --- /dev/null +++ b/test/static_tests/duckduckgo.js @@ -0,0 +1,99 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); + +async function duckduckgo() { + let config = { + compress: false, + debug_level: 1, + headless: true, + }; + + let scrape_config = { + search_engine: 'duckduckgo', + keywords: ['cloud service'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + 
await scraper.start(); + + duckduckgo_normal( await scraper.scrape(scrape_config) ); + + await scraper.quit(); +} + +function duckduckgo_normal(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); + assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects'); + + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + +function confirm_results_ok(obj) { + + for (let res of obj.results) { + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + } + + for (let res of obj.ads) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 
'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + } +} + +describe('Duckduckgo', function(){ + this.timeout(10000); + it('static duckduckgo sarch', duckduckgo); +}); \ No newline at end of file diff --git a/test/static_tests/google.js b/test/static_tests/google.js new file mode 100644 index 0000000..fd9e154 --- /dev/null +++ b/test/static_tests/google.js @@ -0,0 +1,410 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); + +async function normal_search_test() { + let config = { + compress: false, + debug_level: 1, + headless: true, + }; + + let scrape_config = { + search_engine: 'google', + keywords: ['rückspiegel schwarz'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + await scraper.start(); + + google_search_with_products( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html'); + scrape_config.keywords = ['autoreifen mercedes c-klasse']; + + google_search_with_products2( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html'); + 
scrape_config.keywords = ['kaffeemaschine kaufen']; + + google_places( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html'); + scrape_config.keywords = ['MODEL MARKET SW18 4ES']; + + right_side_info_text( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html'); + scrape_config.keywords = ['BRANDON MOTORS HP13 6NR']; + + right_side_info_text2( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html'); + scrape_config.keywords = ['car tires for sale']; + + google_places_and_ads( await scraper.scrape(scrape_config) ); + + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html'); + scrape_config.keywords = ['bmw felgen']; + + google_ads2( await scraper.scrape(scrape_config) ); + + await scraper.quit(); +} + +// we test with a callback function to our handler +function google_search_with_products(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '1’780’000', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects'); + assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads'); + assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads'); + assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products'); + assert.equal(obj.right_products.length, 0, 'there are 0 right products'); + + 
assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + + } + } +} + + +function google_search_with_products2(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects'); + assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads'); + assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads'); + assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products'); + assert.equal(obj.right_products.length, 4, 'there are 4 right products'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + + } + } +} + +function google_places(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = 
response.results[query][page_number]; + + assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 10, 'results must have at least 9 SERP objects'); + assert.equal(obj.top_ads.length, 0, 'there are no top ads'); + assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads'); + assert.equal(obj.top_products.length, 0, 'there are 0 top products'); + assert.equal(obj.right_products.length, 0, 'there are 0 right products'); + assert.equal(obj.places.length, 3, 'there are 3 places'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + +function right_side_info_text(response) { + assert.equal(response.metadata.num_requests, 1); + for (let query in response.results) { + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '6 Ergebnisse', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', + 'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object'); + + assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects'); + + assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data'); + assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data'); + + 
assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + +function right_side_info_text2(response) { + assert.equal(response.metadata.num_requests, 1); + for (let query in response.results) { + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '5 Ergebnisse', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', + 'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object'); + + assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects'); + assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data'); + assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + +function google_places_and_ads(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 
Sekunden)', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); + assert.equal(obj.top_ads.length, 0, 'there are no top ads'); + assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads'); + assert.isAtLeast(obj.top_products.length, 13, 'there are 13 top products'); + assert.equal(obj.right_products.length, 0, 'there are 0 right products'); + assert.equal(obj.places.length, 2, 'there are 2 places'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function google_ads2(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); + assert.equal(obj.top_ads.length, 3, 'there are no top ads'); + assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads'); + assert.isAtLeast(obj.top_products.length, 0, 'there must be 0 top products'); + assert.equal(obj.right_products.length, 9, 'there are 9 right products'); + 
assert.equal(obj.places.length, 0, 'there are 0 places'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function confirm_results_ok(obj) { + + for (let res of obj.results) { + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + } + + for (let res of obj.top_ads) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'visible_link must be ok'); + 
assert.typeOf(res.link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.typeOf(res.links, 'array', 'links must be array'); + } + + for (let res of obj.bottom_ads) { + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'visible_link must be ok'); + assert.typeOf(res.link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.typeOf(res.links, 'array', 'links must be array'); + } + + for (let res of obj.top_products) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'link must 
be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.price, 'price must be ok'); + assert.typeOf(res.price, 'string', 'price must be string'); + assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.vendor_link, 'vendor_link must be ok'); + assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string'); + assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars'); + } + + for (let res of obj.right_products) { + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.price, 'price must be ok'); + assert.typeOf(res.price, 'string', 'price must be string'); + assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.vendor_link, 'vendor_link must be ok'); + assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string'); + assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars'); + } + + for (let res of obj.places) { + assert.isOk(res.heading, 'heading must be ok'); + assert.typeOf(res.heading, 'string', 'heading must be string'); + assert.isAtLeast(res.heading.length, 5, 'heading must have at 
least 5 chars'); + + assert.isOk(res.rating, 'rating must be ok'); + assert.typeOf(res.rating, 'string', 'rating must be string'); + assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars'); + + assert.isOk(res.contact, 'contact must be ok'); + assert.typeOf(res.contact, 'string', 'contact must be string'); + assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars'); + + assert.typeOf(res.hours, 'string', 'hours must be string'); + if (res.hours) { + assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars'); + } + } +} + +describe('Google', function() { + this.timeout(25000); + it('static google searches with products,ads and places', normal_search_test); +}); \ No newline at end of file diff --git a/test/static_tests/second_google.js b/test/static_tests/second_google.js new file mode 100644 index 0000000..9fd95b8 --- /dev/null +++ b/test/static_tests/second_google.js @@ -0,0 +1,213 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); + +async function normal_search_test() { + let config = { + compress: false, + debug_level: 1, + headless: true, + }; + + let scrape_config = { + search_engine: 'google', + keywords: ['in.linkedin.com/in/altanai'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + await scraper.start(); + + google_test_title( await scraper.scrape(scrape_config) ); + + await scraper.quit(); +} + +// we test with a callback function to our handler +function google_test_title(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + 
assert.include(obj.num_results, '7.600', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects'); + assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads'); + assert.isAtLeast(obj.bottom_ads.length, 0, 'there are 0 bottom ads'); + assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products'); + assert.equal(obj.right_products.length, 0, 'there are 0 right products'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + + assert.equal( obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn' ); + assert.equal( obj.results[1].title, 'ALTANAI BISHT | LinkedIn' ); + assert.equal( obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn' ); + assert.equal( obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn' ); + assert.equal( obj.results[4].title, 'ALTANAI BISHT | LinkedIn' ); + assert.equal( obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn'); + + assert.equal (obj.results[0].date, '27.07.2016'); + assert.equal( obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. 
See the ...'); + + assert.equal (obj.results[2].date, '27.07.2016'); + } + } +} + +function confirm_results_ok(obj) { + + for (let res of obj.results) { + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + } + + for (let res of obj.top_ads) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'visible_link must be ok'); + assert.typeOf(res.link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + 
assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.typeOf(res.links, 'array', 'links must be array'); + } + + for (let res of obj.bottom_ads) { + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'visible_link must be ok'); + assert.typeOf(res.link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.typeOf(res.links, 'array', 'links must be array'); + } + + for (let res of obj.top_products) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.price, 'price must be ok'); + assert.typeOf(res.price, 'string', 'price must be string'); + assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars'); + + 
assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.vendor_link, 'vendor_link must be ok'); + assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string'); + assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars'); + } + + for (let res of obj.right_products) { + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.price, 'price must be ok'); + assert.typeOf(res.price, 'string', 'price must be string'); + assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.vendor_link, 'vendor_link must be ok'); + assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string'); + assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars'); + } + + for (let res of obj.places) { + assert.isOk(res.heading, 'heading must be ok'); + assert.typeOf(res.heading, 'string', 'heading must be string'); + assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars'); + + assert.isOk(res.rating, 'rating must be ok'); + assert.typeOf(res.rating, 'string', 'rating must be string'); + assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars'); + + assert.isOk(res.contact, 'contact must be ok'); + assert.typeOf(res.contact, 'string', 'contact must be string'); + 
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars'); + + assert.typeOf(res.hours, 'string', 'hours must be string'); + if (res.hours) { + assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars'); + } + } +} + +describe('Google2', function(){ + this.timeout(10000); + it('static google searches testing various details', normal_search_test); +}); \ No newline at end of file diff --git a/test/static_tests/yandex.js b/test/static_tests/yandex.js new file mode 100644 index 0000000..fce5816 --- /dev/null +++ b/test/static_tests/yandex.js @@ -0,0 +1,114 @@ +const se_scraper = require('./../../index.js'); +const chai = require('chai'); +chai.use(require('chai-string')); +const assert = chai.assert; +const path = require('path'); + +async function yandex_ads() { + let config = { + compress: false, + debug_level: 1, + headless: true, + }; + + let scrape_config = { + search_engine: 'yandex', + keywords: ['cloud service'], + num_pages: 1, + scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'), + }; + + var scraper = new se_scraper.ScrapeManager(config); + + await scraper.start(); + + yandex_search_with_ads( await scraper.scrape(scrape_config) ); + + scrape_config.keywords = ['car tires cheap']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html'); + + yandex_search_with_ads2( await scraper.scrape(scrape_config) ); + + await scraper.quit(); +} + +// we test with a callback function to our handler +function yandex_search_with_ads(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '2 million results', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys 
are in the object'); + assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects'); + + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function yandex_search_with_ads2(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '5 million results', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 11, 'results must have at least 12 SERP objects'); + + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + + +function confirm_results_ok(obj) { + + for (let res of obj.results) { + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 
'string', 'title must be string'); + assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + } +} + +describe('Yandex', function(){ + this.timeout(10000); + it('static yandex searches with ads', yandex_ads); +}); \ No newline at end of file diff --git a/test/test_google.js b/test/test_google.js index eec387d..c21ef4b 100644 --- a/test/test_google.js +++ b/test/test_google.js @@ -125,7 +125,7 @@ function test_case_no_results(response) { assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects'); assert.equal(obj.no_results, true, 'no results should be true'); - assert.isEmpty(obj.num_results, 'no results should be a empty string'); + assert.isEmpty(obj.num_results, 'num_results should be a empty string'); assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); } }