remove cheerio from parsing

This commit is contained in:
Nikolai Tschacher 2019-09-23 21:57:13 +02:00
parent 52a2ec7b33
commit 95a5ee56d8
11 changed files with 1418 additions and 144 deletions

4
.gitignore vendored
View File

@ -1,7 +1,7 @@
# ignore static tests
test/static_tests/
test/static_tests/*
test/static_tests/html/
test/static_tests/html/*
.idea

View File

@ -1,4 +1,5 @@
'use strict';
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const common = require('./common.js');
@ -10,161 +11,218 @@ class GoogleScraper extends Scraper {
super(...args);
}
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
async parse_async(html) {
const results = [];
$('#center_col .g').each((i, link) => {
let obj = {
link: $(link).find('.r a').attr('href'),
title: $(link).find('.r a h3').text(),
snippet: $(link).find('span.st').text(),
visible_link: $(link).find('.r cite').text(),
date: $(link).find('span.f').text() || '',
const results = await this.page.evaluate(() => {
// Returns the innerText of the first descendant of `el` that matches the
// selector `s`, or an empty string when nothing matches.
let _text = (el, s) => {
    const match = el.querySelector(s);
    return match ? match.innerText : '';
};
if (obj.date) {
obj.date = obj.date.replace(' - ', '');
// Returns attribute `attr` of the first descendant of `el` that matches the
// selector `s`, or null when nothing matches.
let _attr = (el, s, attr) => {
    const match = el.querySelector(s);
    return match ? match.getAttribute(attr) : null;
};
let results = {
num_results: '',
no_results: false,
effective_query: '',
right_info: {},
results: [],
top_products: [],
right_products: [],
top_ads: [],
bottom_ads: [],
places: [],
};
let num_results_el = document.getElementById('resultStats');
if (num_results_el) {
results.num_results = num_results_el.innerText;
}
results.push(obj);
});
let organic_results = document.querySelectorAll('#center_col .g');
// parse ads
let parseAds = (storage, selector) => {
$(selector).each((i, element) => {
let obj = {
visible_link: $(element).find('.ads-visurl cite').text(),
tracking_link: $(element).find('a:first-child').attr('href'),
link: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
links: [],
organic_results.forEach((el) => {
let serp_obj = {
link: _attr(el, '.r a', 'href'),
title: _text(el, '.r a h3'),
snippet: _text(el, 'span.st'),
visible_link: _text(el, '.r cite'),
date: _text(el, 'span.f'),
};
$(element).find('ul li a').each((i, el) => {
obj.links.push({
tracking_link: $(el).attr('data-arwt'),
link: $(el).attr('href'),
title: $(el).text(),
})
});
storage.push(obj);
if (serp_obj.date) {
serp_obj.date = serp_obj.date.replace(' - ', '');
}
results.results.push(serp_obj);
});
};
const top_ads = [];
const bottomads = [];
// check if no results
results.no_results = (results.results.length === 0);
parseAds(top_ads, '#tads .ads-ad');
parseAds(bottomads, '#tadsb .ads-ad');
// Collects every ad element matching `selector` in the document and appends
// one object per ad (including its sitelinks under `links`) to `container`.
let parseAds = (container, selector) => {
    for (const ad_el of document.querySelectorAll(selector)) {
        // sitelinks listed below the ad
        const sitelinks = Array.from(ad_el.querySelectorAll('ul li a'), (anchor) => ({
            tracking_link: anchor.getAttribute('data-arwt'),
            link: anchor.getAttribute('href'),
            title: anchor.innerText,
        }));
        container.push({
            visible_link: _text(ad_el, '.ads-visurl cite'),
            tracking_link: _attr(ad_el, 'a:first-child', 'href'),
            link: _attr(ad_el, 'a:nth-child(2)', 'href'),
            title: _text(ad_el, 'a h3'),
            snippet: _text(ad_el, '.ads-creative'),
            links: sitelinks,
        });
    }
};
// parse google places
const places = [];
$('.rllt__link').each((i, element) => {
places.push({
heading: $(element).find('[role="heading"] span').text(),
rating: $(element).find('.rllt__details div:first-child').text(),
contact: $(element).find('.rllt__details div:nth-child(2)').text(),
hours: $(element).find('.rllt__details div:nth-child(3)').text(),
})
parseAds(results.top_ads, '#tads .ads-ad');
parseAds(results.bottom_ads, '#tadsb .ads-ad');
// parse google places
document.querySelectorAll('.rllt__link').forEach((el) => {
results.places.push({
heading: _text(el, '[role="heading"] span'),
rating: _text(el, '.rllt__details div:first-child'),
contact: _text(el, '.rllt__details div:nth-child(2)'),
hours: _text(el, '.rllt__details div:nth-child(3)'),
})
});
// parse right side product information
results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');
// title and review count are looked up relative to the parent of <g-review-stars>
let review_stars_el = document.querySelector('#rhs .cu-container g-review-stars');
if (review_stars_el) {
    // Store the title on right_info itself: `review` is a string (or null), so a
    // property written onto it is lost (or throws), and the later
    // `!results.right_info.title` check would then always discard right_info.
    let title_node = review_stars_el.parentNode.querySelector('div:first-child');
    if (title_node) {
        results.right_info.title = title_node.innerText;
    }
    let num_reviews_node = review_stars_el.parentNode.querySelector('div:nth-of-type(2)');
    if (num_reviews_node) {
        results.right_info.num_reviews = num_reviews_node.innerText;
    }
}
results.right_info.vendors = [];
results.right_info.info = _text(document, '#rhs_block > div > div > div > div:nth-child(5) > div > div');
document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
results.right_info.vendors.push({
price: _text(el, 'span:nth-of-type(1)'),
merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
source_name: _text(el, 'span:nth-child(4) a'),
source_link: _attr(el, 'span:nth-child(4) a', 'href'),
info: _text(el, 'div span'),
shipping: _text(el, 'span:last-child > span'),
})
});
if (!results.right_info.title) {
results.right_info = {};
}
let right_side_info_el = document.getElementById('rhs');
if (right_side_info_el) {
let right_side_info_text = right_side_info_el.innerText;
if (right_side_info_text && right_side_info_text.length > 0) {
results.right_side_info_text = right_side_info_text;
}
}
// parse top main column product information
// #tvcap .pla-unit
document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
let top_product = {
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
title: _text(el, '.pla-unit-title a:nth-child(2) span'),
price: _text(el, '.pla-unit-title + div'),
shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'),
};
let merchant_node = el.querySelector('.pla-unit-title');
if (merchant_node) {
let node = merchant_node.parentNode.querySelector('div > span');
if (node) {
top_product.merchant_name = node.innerText;
}
}
results.top_products.push(top_product);
});
// parse top right product information
// #rhs_block .pla-unit
document.querySelectorAll('#rhs_block .pla-unit').forEach((el) => {
    let right_product = {
        tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
        link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
        title: _text(el, '.pla-unit-title a:nth-child(2) span:first-child'),
        price: _text(el,'.pla-unit-title + div'),
        shipping: _text(el,'.pla-extensions-container > div'),
        // the vendor link is the anchor's href, not its text content
        vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'),
        vendor_name: _text(el,'.pla-extensions-container div > a > div'),
    };
    // the merchant name is looked up relative to the parent of the title element
    let merchant_node = el.querySelector('.pla-unit-title');
    if (merchant_node) {
        let node = merchant_node.parentNode.querySelector('div > span:first-child');
        if (node) {
            right_product.merchant_name = node.innerText;
        }
    }
    results.right_products.push(right_product);
});
// The query the search engine actually ran: prefer #fprsl, and fall back to
// the first link inside #fprs when #fprsl is missing or has no text.
let effective_query_el = document.getElementById('fprsl');
if (effective_query_el) {
    results.effective_query = effective_query_el.innerText;
}
if (!results.effective_query) {
    let effective_query_el2 = document.querySelector('#fprs a');
    if (effective_query_el2) {
        results.effective_query = effective_query_el2.innerText;
    }
}
return results;
});
// parse right side product information
var right_side_info = {};
right_side_info.review = $('#rhs .cu-container g-review-stars span').attr('aria-label');
right_side_info.title = $('#rhs .cu-container g-review-stars').parent().find('div:first-child').text();
right_side_info.num_reviews = $('#rhs .cu-container g-review-stars').parent().find('div:nth-of-type(2)').text();
right_side_info.vendors = [];
right_side_info.info = $('#rhs_block > div > div > div > div:nth-child(5) > div > div').text();
// clean some results
results.top_products = this.clean_results(results.top_products, ['title', 'link']);
results.right_products = this.clean_results(results.right_products, ['title', 'link']);
results.results = this.clean_results(results.results, ['title', 'link' , 'snippet']);
$('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').each((i, element) => {
right_side_info.vendors.push({
price: $(element).find('span:nth-of-type(1)').text(),
merchant_name: $(element).find('span:nth-child(3) a:nth-child(2)').text(),
merchant_ad_link: $(element).find('span:nth-child(3) a:first-child').attr('href'),
merchant_link: $(element).find('span:nth-child(3) a:nth-child(2)').attr('href'),
source_name: $(element).find('span:nth-child(4) a').text(),
source_link: $(element).find('span:nth-child(4) a').attr('href'),
info: $(element).find('div span').text(),
shipping: $(element).find('span:last-child > span').text(),
})
});
if (!right_side_info.title) {
right_side_info = {};
}
let right_side_info_text = $('#rhs').text();
// parse top main column product information
// #tvcap .pla-unit
var top_products = [];
$('#tvcap .pla-unit').each((i, element) => {
top_products.push({
tracking_link: $(element).find('.pla-unit-title a:first-child').attr('href'),
link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'),
title: $(element).find('.pla-unit-title a:nth-child(2) span').text(),
price: $(element).find('.pla-unit-title + div').text(),
merchant_name: $(element).find('.pla-unit-title').parent().find('div > span').text(),
shipping: $(element).find('.pla-extensions-container div:nth-of-type(1)').text(),
vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'),
})
});
top_products = this.clean_results(top_products, ['title', 'link']);
// parse top right product information
// #tvcap .pla-unit
var right_products = [];
$('#rhs_block .pla-unit').each((i, element) => {
right_products.push({
tracking_link: $(element).find('.pla-unit-title a:first-child').attr('href'),
link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'),
title: $(element).find('.pla-unit-title a:nth-child(2) span:first-child').first().text(),
price: $(element).find('.pla-unit-title + div').text(),
merchant_name: $(element).find('.pla-unit-title').parent().find('div > span:first-child').text(),
shipping: $(element).find('.pla-extensions-container > div').text(),
vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'),
vendor_name: $(element).find('.pla-extensions-container div > a > div').text(),
})
});
right_products = this.clean_results(right_products, ['title', 'link']);
// 'Ergebnisse für', 'Showing results for'
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for'],
$('#main').text()
);
let effective_query = $('#fprsl').text() || '';
if (!effective_query) {
effective_query = $('#fprs a').text()
}
const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']);
let res_obj = {
time: (new Date()).toUTCString(),
num_results: $('#resultStats').text(),
no_results: no_results,
effective_query: effective_query,
right_info: right_side_info,
results: cleaned,
top_products: top_products,
right_products: right_products,
top_ads: top_ads,
bottom_ads: bottomads,
places: places,
};
if (right_side_info_text && right_side_info_text.length > 0) {
res_obj.right_side_info_text = right_side_info_text;
}
return res_obj;
results.time = (new Date()).toUTCString();
return results;
}
async load_start_page() {

View File

@ -0,0 +1,15 @@
## Test with static HTML
Dynamic testing of se-scraper takes too much time.
Save some html and initialize se-scraper by loading the search from disk.
### Disadvantage
static html gets outdated after some time
### Advantages
1. Lets us test corner cases that are easily missed
2. Static tests are deterministic: dynamic testing is not reliable, since search engines do not always return the same results for the same query
3. As said, much faster

168
test/static_tests/bing.js Normal file
View File

@ -0,0 +1,168 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
/**
 * Scrapes three saved Bing result pages (bing.html, bing2.html, bing3.html)
 * from disk and passes each response to its matching assertion helper.
 *
 * The scraper is always shut down, even when a scrape or an assertion throws.
 */
async function bing_ads() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };
    const scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        bing_search_with_ads( await scraper.scrape(scrape_config) );

        scrape_config.keywords = ['best cloud services'];
        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');
        bing_search_with_ads2( await scraper.scrape(scrape_config) );

        scrape_config.keywords = ['car tires cheap'];
        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');
        bing_search_with_ads3( await scraper.scrape(scrape_config) );
    } finally {
        // quit even on failure so a failing assertion does not leave the scraper running
        await scraper.quit();
    }
}
/**
 * Checks the scrape response for the first saved Bing page (bing.html):
 * one request, expected result count, required keys, and minimum numbers of
 * organic results and ads on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function bing_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '1100000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks the scrape response for the second saved Bing page (bing2.html):
 * one request, expected result count, required keys, and minimum numbers of
 * organic results and ads on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function bing_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '44300000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
            assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks the scrape response for the third saved Bing page (bing3.html):
 * one request, expected result count, required keys, and minimum numbers of
 * organic results and ads on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function bing_search_with_ads3(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '65.500.000 results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Asserts that every organic result and every ad in `obj` carries non-empty
 * string fields of a minimum length, and that each organic result has a
 * numeric rank.
 *
 * @param {object} obj - parsed page object with `results` and `ads` arrays
 */
function confirm_results_ok(obj) {
    // One field check: truthy, a string, and at least `min_length` characters.
    // The failure message names the field actually being checked.
    const assert_text_field = (item, field, min_length) => {
        assert.isOk(item[field], `${field} must be ok`);
        assert.typeOf(item[field], 'string', `${field} must be string`);
        assert.isAtLeast(item[field].length, min_length, `${field} must have at least ${min_length} chars`);
    };

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
        assert_text_field(res, 'link', 5);
        assert_text_field(res, 'visible_link', 5);
        assert_text_field(res, 'title', 8);
        assert_text_field(res, 'snippet', 10);
        assert.isNumber(res.rank, 'rank must be integer');
    }

    for (let res of obj.ads) {
        assert_text_field(res, 'tracking_link', 5);
        assert_text_field(res, 'visible_link', 5);
        assert_text_field(res, 'title', 8);
        assert_text_field(res, 'snippet', 10);
    }
}
// Test suite for the static Bing fixtures.
// A regular function (not an arrow function) is used so that `this` can be
// used to call timeout() below.
describe('Bing', function(){
// allow up to 15 seconds for the scrapes
this.timeout(15000);
it('static bing searches with ads', bing_ads);
});

View File

@ -0,0 +1,173 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
const cheerio = require('cheerio');
/**
 * Scrapes saved Bing and Google pages twice each, once with the html cleaning
 * flags set to true and once set to false, and hands each pair of responses
 * to `test()`.
 *
 * The scraper is always shut down, even when a scrape or an assertion throws.
 */
async function test_html_output() {
    const config = {
        debug_level: 1,
        headless: true,
        html_output: true,
        // whether to strip JS and CSS from the html_output
        // has only an effect if `html_output` is true
        clean_html_output: true,
        // remove all data images from the html
        clean_data_images: true,
        // test compression
        compress: false,
    };
    const scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        // NOTE(review): the cleaning flags are toggled on scrape_config, presumably
        // overriding the values from `config` per scrape - confirm in ScrapeManager
        const response = await scraper.scrape(scrape_config);
        scrape_config.clean_html_output = false;
        scrape_config.clean_data_images = false;
        const response_no_cleaned = await scraper.scrape(scrape_config);
        test(response, response_no_cleaned, 'bing');

        scrape_config.search_engine = 'google';
        scrape_config.keywords = ['rückspiegel schwarz'];
        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
        scrape_config.clean_html_output = true;
        scrape_config.clean_data_images = true;
        let responseGoogle = await scraper.scrape(scrape_config);
        scrape_config.clean_html_output = false;
        scrape_config.clean_data_images = false;
        let response_no_cleanedGoogle = await scraper.scrape(scrape_config);
        test(responseGoogle, response_no_cleanedGoogle, 'google');

        scrape_config.keywords = ['cloud services'];
        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
        scrape_config.clean_html_output = true;
        scrape_config.clean_data_images = true;
        responseGoogle = await scraper.scrape(scrape_config);
        scrape_config.clean_html_output = false;
        scrape_config.clean_data_images = false;
        response_no_cleanedGoogle = await scraper.scrape(scrape_config);
        test(responseGoogle, response_no_cleanedGoogle, 'google');
    } finally {
        // quit even on failure so a failing assertion does not leave the scraper running
        await scraper.quit();
    }
}
/**
 * Compares the html output of a cleaned and a non-cleaned scrape of the same
 * page: both must be non-trivial, the cleaned one must be smaller, and parsing
 * either with cheerio must give the same results as the scraper reported.
 *
 * @param {object} response - scrape response produced with html cleaning enabled
 * @param {object} response_no_cleaned - scrape response produced with html cleaning disabled
 * @param {string} [se='google'] - search engine name passed on to `parseResults()`
 */
function test(response, response_no_cleaned, se='google') {
    // Returns copies of the entries with `rank` unset. The callback must return
    // the entry: mapping to the value of an assignment would produce an array of
    // `undefined` and make every comparison below trivially true.
    const without_rank = (entries) => entries.map((entry) => Object.assign({}, entry, { rank: undefined }));

    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            let obj = response.results[query][page_number];
            let obj_no_cleaned = response_no_cleaned.results[query][page_number];
            console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length);
            console.log('html length of cleaned SERP: ' + obj.html.length);
            assert.isOk(obj.html, 'Html must be ok!');
            assert.isAtLeast(obj.html.length, 100, 'html must be a length string');
            assert.isOk(obj_no_cleaned.html, 'Html must be ok!');
            assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a length string');
            assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');

            // test that we can parse the html of both the cleaned and no cleaned versions
            // with cheerio and that serp results are roughly the same
            const cleaned$ = cheerio.load(obj.html);
            const no_cleaned$ = cheerio.load(obj_no_cleaned.html);
            const resCleaned = parseResults(cleaned$, se);
            const resNoCleaned = parseResults(no_cleaned$, se);
            assert.equal(resCleaned.length, resNoCleaned.length);
            assert.equal(resCleaned.length, obj.results.length);
            assert.equal(resNoCleaned.length, obj.results.length);

            // unset the rank on copies, leaving the response object itself untouched
            const cleaned_unranked = without_rank(resCleaned);
            const no_cleaned_unranked = without_rank(resNoCleaned);
            const scraper_unranked = without_rank(obj.results);
            assert.deepEqual(cleaned_unranked, no_cleaned_unranked, 'parsed results should be equal, even if html is cleaned');
            assert.deepEqual(cleaned_unranked, scraper_unranked, 'parsed results from cleaned html should be equal to se-scraper results');
            assert.deepEqual(no_cleaned_unranked, scraper_unranked, 'parsed results from non-cleaned html should be equal to se-scraper results');
        }
    }
}
/**
 * Extracts organic results from a cheerio-loaded page and filters them
 * through `clean_results()` (title, link and snippet are required).
 *
 * @param {Function} s$ - cheerio root returned by `cheerio.load(html)`
 * @param {string} se - 'google' or 'bing'
 * @returns {Array<object>} the parsed and ranked results
 * @throws {Error} when `se` names an unsupported search engine
 */
function parseResults(s$, se) {
    const results = [];
    if (se === 'google') {
        s$('#center_col .g').each((i, link) => {
            results.push({
                link: s$(link).find('.r a').attr('href'),
                title: s$(link).find('.r a').text(),
                snippet: s$(link).find('span.st').text(),
                visible_link: s$(link).find('.r cite').text(),
                date: s$(link).find('span.f').text() || '',
            })
        });
    } else if (se === 'bing') {
        s$('#b_content #b_results .b_algo').each((i, link) => {
            results.push({
                link: s$(link).find('h2 a').attr('href'),
                title: s$(link).find('h2').text(),
                snippet: s$(link).find('.b_caption p').text(),
                visible_link: s$(link).find('cite').text(),
            })
        });
    } else {
        // throw a real Error (not a bare string) so the failure carries a stack trace
        throw new Error("no such search engine");
    }
    return clean_results(results, ['title', 'link', 'snippet']);
}
/**
 * Keeps only the results in which every listed attribute is present and not
 * blank, and numbers the kept ones with a 1-based `rank` (set in place).
 *
 * @param {Array<object>} results - candidate result objects
 * @param {Array<string>} attributes - keys that must hold a non-blank string
 * @returns {Array<object>} the kept result objects, in their original order
 */
function clean_results(results, attributes) {
    const kept = [];
    for (const entry of results) {
        const is_complete = attributes.every((key) => entry[key] && entry[key].trim());
        if (!is_complete) {
            continue;
        }
        // ranks are consecutive, so the next rank is the number kept so far plus one
        entry.rank = kept.length + 1;
        kept.push(entry);
    }
    return kept;
}
// Test suite for the html output comparison.
// A regular function (not an arrow function) is used so that `this` can be
// used to call timeout() below.
describe('html output', function(){
// allow up to 15 seconds for the scrapes
this.timeout(15000);
it('static html output test', test_html_output);
});

View File

@ -0,0 +1,24 @@
'use strict';
const zlib = require('zlib');
const fs = require('fs');
const path = require('path');
// Prints, for each saved html file, its size before and after gzip/deflate
// compression and after base64-encoding the compressed data (all divided by 1000).
const files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];

for (const file of files) {
    // read without an encoding, so `html` is a Buffer and lengths are byte counts
    const html = fs.readFileSync(path.resolve(__dirname, './html/' + file));
    const gzipped = zlib.gzipSync(html);
    const deflated = zlib.deflateSync(html);

    const report = [
        ['Normal length: ', html.length],
        ['GZIP Compressed length: ', gzipped.length],
        ['Deflate Compressed length: ', deflated.length],
        ['Encoded GZIP Compressed length: ', gzipped.toString('base64').length],
        ['Encoded Deflate Compressed length: ', deflated.toString('base64').length],
    ];

    console.log(file)
    for (const [label, size] of report) {
        console.log(label + size/1000);
    }
    console.log('------\n')
}

View File

@ -0,0 +1,99 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
/**
 * Scrapes the saved DuckDuckGo result page (duckduckgo1.html) from disk and
 * passes the response to `duckduckgo_normal()`.
 *
 * The scraper is always shut down, even when the scrape or an assertion throws.
 */
async function duckduckgo() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };
    const scrape_config = {
        search_engine: 'duckduckgo',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        duckduckgo_normal( await scraper.scrape(scrape_config) );
    } finally {
        // quit even on failure so a failing assertion does not leave the scraper running
        await scraper.quit();
    }
}
/**
 * Checks the scrape response for the saved DuckDuckGo page: one request,
 * required keys, and minimum numbers of organic results and ads on every page
 * of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function duckduckgo_normal(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Asserts that every organic result and every ad in `obj` carries non-empty
 * string fields of a minimum length, and that each organic result has a
 * numeric rank.
 *
 * @param {object} obj - parsed page object with `results` and `ads` arrays
 */
function confirm_results_ok(obj) {
    // One field check: truthy, a string, and at least `min_length` characters.
    // The failure message names the field actually being checked.
    const assert_text_field = (item, field, min_length) => {
        assert.isOk(item[field], `${field} must be ok`);
        assert.typeOf(item[field], 'string', `${field} must be string`);
        assert.isAtLeast(item[field].length, min_length, `${field} must have at least ${min_length} chars`);
    };

    for (let res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
        assert_text_field(res, 'link', 5);
        assert_text_field(res, 'visible_link', 5);
        assert_text_field(res, 'title', 10);
        assert_text_field(res, 'snippet', 10);
        assert.isNumber(res.rank, 'rank must be integer');
    }

    for (let res of obj.ads) {
        assert_text_field(res, 'tracking_link', 5);
        assert_text_field(res, 'visible_link', 5);
        assert_text_field(res, 'title', 10);
        assert_text_field(res, 'snippet', 10);
    }
}
// Test suite for the static DuckDuckGo fixture.
// A regular function (not an arrow function) is used so that `this` can be
// used to call timeout() below.
describe('Duckduckgo', function(){
    // allow up to 10 seconds for the scrape
    this.timeout(10000);
    it('static duckduckgo search', duckduckgo);
});

410
test/static_tests/google.js Normal file
View File

@ -0,0 +1,410 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
/**
 * Scrapes a series of saved Google result pages from disk and passes each
 * response to the assertion helper written for that page.
 *
 * The scraper is always shut down, even when a scrape or an assertion throws.
 */
async function normal_search_test() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };
    const scrape_config = {
        search_engine: 'google',
        keywords: ['rückspiegel schwarz'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        google_search_with_products( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
        scrape_config.keywords = ['autoreifen mercedes c-klasse'];
        google_search_with_products2( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
        scrape_config.keywords = ['kaffeemaschine kaufen'];
        google_places( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
        scrape_config.keywords = ['MODEL MARKET SW18 4ES'];
        right_side_info_text( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
        scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];
        right_side_info_text2( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
        scrape_config.keywords = ['car tires for sale'];
        google_places_and_ads( await scraper.scrape(scrape_config) );

        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
        scrape_config.keywords = ['bmw felgen'];
        google_ads2( await scraper.scrape(scrape_config) );
    } finally {
        // quit even on failure so a failing assertion does not leave the scraper running
        await scraper.quit();
    }
}
/**
 * Checks the scrape response for the first saved Google page (google.html):
 * one request, expected result count, required keys, and the expected numbers
 * of organic results, ads and products on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function google_search_with_products(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '1780000', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            // NOTE(review): a length is always >= 0, so this check cannot fail; the
            // message suggests an equality check was intended - confirm against the fixture
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads');
            assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks the scrape response for the second saved Google page (google2.html):
 * one request, expected result count, required keys, and the expected numbers
 * of organic results, ads and products on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function google_search_with_products2(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '437000 Ergebnisse (0.41 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            // NOTE(review): a length is always >= 0, so the two `isAtLeast(..., 0, ...)`
            // checks below cannot fail; the messages suggest equality checks were
            // intended - confirm against the fixture
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 4, 'there are 4 right products');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks the scrape response for the third saved Google page (google3.html):
 * one request, expected result count, required keys, no ads or products, and
 * exactly three places on every page of every query.
 *
 * @param {object} response - value resolved by `scraper.scrape()`
 */
function google_places(response) {
    assert.equal(response.metadata.num_requests, 1);
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // parseInt() always yields a number (possibly NaN), so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');
            let obj = response.results[query][page_number];
            assert.include(obj.num_results, '6750000 Ergebnisse (0.52 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.equal(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.places.length, 3, 'there are 3 places');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a number) for unparsable input
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks a scrape response whose pages are expected to carry a
 * `right_side_info_text` string of at least 50 characters and at least
 * seven organic results.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function right_side_info_text(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks a scrape response whose pages are expected to carry a
 * `right_side_info_text` string of at least 50 characters and at least
 * five organic results.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function right_side_info_text2(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
                'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
            assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have at least 50 chars');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks a scrape response that is expected to contain two places, at least
 * thirteen top products, and no top/bottom ads or right-hand products.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function google_places_and_ads(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'top_products', 'right_products', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 0, 'there are no top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 13, 'there are at least 13 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.places.length, 2, 'there are 2 places');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks a scrape response that is expected to contain exactly three top
 * ads, nine right-hand products, and no bottom ads or places.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function google_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'top_products', 'right_products', 'places'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            assert.equal(obj.top_ads.length, 3, 'there are 3 top ads');
            assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            // NOTE(review): a length is never below 0, so this check cannot fail —
            // confirm the intended top-product count against the fixture.
            assert.isAtLeast(obj.top_products.length, 0, 'there must be 0 top products');
            assert.equal(obj.right_products.length, 9, 'there are 9 right products');
            assert.equal(obj.places.length, 0, 'there are 0 places');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Validates every entry of a parsed result page: organic results, top and
 * bottom ads, top and right-hand products, and places.
 *
 * Each text field must be a truthy string of a minimum length; failure
 * messages name the field that was actually checked.
 *
 * @param {object} obj - Page object with iterable `results`, `top_ads`,
 *     `bottom_ads`, `top_products`, `right_products` and `places`.
 */
function confirm_results_ok(obj) {
    // Asserts that item[key] is a truthy string with at least min_length characters.
    const assert_text = (item, key, min_length) => {
        assert.isOk(item[key], `${key} must be ok`);
        assert.typeOf(item[key], 'string', `${key} must be string`);
        assert.isAtLeast(item[key].length, min_length, `${key} must have at least ${min_length} chars`);
    };

    for (const res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
        assert_text(res, 'link', 5);
        assert_text(res, 'visible_link', 5);
        assert_text(res, 'title', 10);
        assert_text(res, 'snippet', 10);
        assert.isNumber(res.rank, 'rank must be integer');
    }

    // Top ads are checked before bottom ads; both share the same field rules.
    for (const ad of [...obj.top_ads, ...obj.bottom_ads]) {
        assert_text(ad, 'tracking_link', 5);
        assert_text(ad, 'visible_link', 5);
        assert_text(ad, 'link', 5);
        assert_text(ad, 'title', 10);
        assert_text(ad, 'snippet', 10);
        assert.typeOf(ad.links, 'array', 'links must be array');
    }

    // Top products are checked before right-hand products; both share the same field rules.
    for (const product of [...obj.top_products, ...obj.right_products]) {
        assert_text(product, 'tracking_link', 5);
        assert_text(product, 'link', 5);
        assert_text(product, 'price', 5);
        assert_text(product, 'title', 10);
        assert_text(product, 'vendor_link', 8);
    }

    for (const place of obj.places) {
        assert_text(place, 'heading', 5);
        assert_text(place, 'rating', 5);
        assert_text(place, 'contact', 5);
        // hours may be an empty string; its length is only checked when it is non-empty.
        assert.typeOf(place.hours, 'string', 'hours must be string');
        if (place.hours) {
            assert.isAtLeast(place.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}
// Registers the 'Google' suite with a single test case.
describe('Google', function () {
    // A regular function (not an arrow) is used because the timeout is set through `this`.
    const suite_timeout_ms = 25000;
    this.timeout(suite_timeout_ms);

    it('static google searches with products,ads and places', normal_search_test);
});

View File

@ -0,0 +1,213 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
/**
 * Scrapes the saved `google7.html` page through a ScrapeManager and validates
 * the parsed response with `google_test_title`.
 *
 * @returns {Promise<void>} Resolves once the scraper has been shut down;
 *     rejects if scraping or any assertion fails.
 */
async function normal_search_test() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };
    const scrape_config = {
        search_engine: 'google',
        keywords: ['in.linkedin.com/in/altanai'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        google_test_title(await scraper.scrape(scrape_config));
    } finally {
        // quit() must also run when the scrape or an assertion throws,
        // otherwise the started scraper is never shut down.
        await scraper.quit();
    }
}
/**
 * Checks a scrape response in detail: result counts plus the exact titles,
 * dates and first snippet of selected organic results.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function google_test_title(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '7.600', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'top_products', 'right_products', 'places'], 'not all keys are in the object');
            // results[9] is read below, so ten entries are required for a clean assertion failure.
            assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
            // NOTE(review): a length is never below 0, so the three ">= 0" checks cannot fail —
            // confirm the intended counts against the fixture.
            assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
            assert.isAtLeast(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
            assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
            assert.equal(obj.right_products.length, 0, 'there are 0 right products');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
            assert.equal(obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn');
            assert.equal(obj.results[1].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[2].title, 'ALTANAI BISHT SD2 at Voice Engineering Plivo | LinkedIn');
            assert.equal(obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn');
            assert.equal(obj.results[4].title, 'ALTANAI BISHT | LinkedIn');
            assert.equal(obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');
            assert.equal(obj.results[0].date, '27.07.2016');
            assert.equal(obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');
            assert.equal(obj.results[2].date, '27.07.2016');
        }
    }
}
/**
 * Validates every entry of a parsed result page: organic results, top and
 * bottom ads, top and right-hand products, and places.
 *
 * Each text field must be a truthy string of a minimum length; failure
 * messages name the field that was actually checked.
 *
 * @param {object} obj - Page object with iterable `results`, `top_ads`,
 *     `bottom_ads`, `top_products`, `right_products` and `places`.
 */
function confirm_results_ok(obj) {
    // Asserts that item[key] is a truthy string with at least min_length characters.
    const assert_text = (item, key, min_length) => {
        assert.isOk(item[key], `${key} must be ok`);
        assert.typeOf(item[key], 'string', `${key} must be string`);
        assert.isAtLeast(item[key].length, min_length, `${key} must have at least ${min_length} chars`);
    };

    for (const res of obj.results) {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
        assert_text(res, 'link', 5);
        assert_text(res, 'visible_link', 5);
        assert_text(res, 'title', 10);
        assert_text(res, 'snippet', 10);
        assert.isNumber(res.rank, 'rank must be integer');
    }

    // Top ads are checked before bottom ads; both share the same field rules.
    for (const ad of [...obj.top_ads, ...obj.bottom_ads]) {
        assert_text(ad, 'tracking_link', 5);
        assert_text(ad, 'visible_link', 5);
        assert_text(ad, 'link', 5);
        assert_text(ad, 'title', 10);
        assert_text(ad, 'snippet', 10);
        assert.typeOf(ad.links, 'array', 'links must be array');
    }

    // Top products are checked before right-hand products; both share the same field rules.
    for (const product of [...obj.top_products, ...obj.right_products]) {
        assert_text(product, 'tracking_link', 5);
        assert_text(product, 'link', 5);
        assert_text(product, 'price', 5);
        assert_text(product, 'title', 10);
        assert_text(product, 'vendor_link', 10);
    }

    for (const place of obj.places) {
        assert_text(place, 'heading', 5);
        assert_text(place, 'rating', 5);
        assert_text(place, 'contact', 5);
        // hours may be an empty string; its length is only checked when it is non-empty.
        assert.typeOf(place.hours, 'string', 'hours must be string');
        if (place.hours) {
            assert.isAtLeast(place.hours.length, 10, 'hours must have at least 10 chars');
        }
    }
}
// Registers the 'Google2' suite with a single test case.
describe('Google2', function () {
    // A regular function (not an arrow) is used because the timeout is set through `this`.
    const suite_timeout_ms = 10000;
    this.timeout(suite_timeout_ms);

    it('static google searches testing various details', normal_search_test);
});

114
test/static_tests/yandex.js Normal file
View File

@ -0,0 +1,114 @@
const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
/**
 * Scrapes the saved `yandex1.html` and `yandex2.html` pages through one
 * ScrapeManager and validates each parsed response.
 *
 * @returns {Promise<void>} Resolves once the scraper has been shut down;
 *     rejects if scraping or any assertion fails.
 */
async function yandex_ads() {
    const config = {
        compress: false,
        debug_level: 1,
        headless: true,
    };
    const scrape_config = {
        search_engine: 'yandex',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
    };
    const scraper = new se_scraper.ScrapeManager(config);
    await scraper.start();
    try {
        yandex_search_with_ads(await scraper.scrape(scrape_config));

        // The same config object is reused for the second saved page.
        scrape_config.keywords = ['car tires cheap'];
        scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');
        yandex_search_with_ads2(await scraper.scrape(scrape_config));
    } finally {
        // quit() must also run when a scrape or an assertion throws,
        // otherwise the started scraper is never shut down.
        await scraper.quit();
    }
}
/**
 * Checks a scrape response that is expected to report '2 million results'
 * and at least twelve organic results per page.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function yandex_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '2 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Checks a scrape response that is expected to report '5 million results'
 * and at least eleven organic results per page.
 *
 * @param {object} response - Scraper output exposing `metadata.num_requests`
 *     and page objects under `results[query][page_number]`.
 */
function yandex_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);
    const pages_by_query = Object.values(response.results);
    // Without this guard an empty response would skip every assertion below and pass.
    assert.isAbove(pages_by_query.length, 0, 'response must contain at least one query');
    for (const pages of pages_by_query) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() maps junk to NaN, which is still typeof 'number'; match digits instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.include(obj.num_results, '5 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() always returns a number (NaN when unparsable), so reject NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            confirm_results_ok(obj);
        }
    }
}
/**
 * Validates each organic result of a parsed page: the required keys must be
 * present, the text fields must be truthy strings of a minimum length, and
 * `rank` must be a number.
 *
 * @param {object} obj - Page object with an iterable `results` collection.
 */
function confirm_results_ok(obj) {
    const required_keys = ['link', 'title', 'rank', 'visible_link', 'snippet'];
    // [field, minimum length] pairs, verified in this order for every result.
    const text_rules = [
        ['link', 5],
        ['visible_link', 5],
        ['title', 10],
        ['snippet', 10],
    ];

    for (const entry of obj.results) {
        assert.containsAllKeys(entry, required_keys, 'not all keys are in the SERP object');

        for (const [field, min_length] of text_rules) {
            const value = entry[field];
            assert.isOk(value, `${field} must be ok`);
            assert.typeOf(value, 'string', `${field} must be string`);
            assert.isAtLeast(value.length, min_length, `${field} must have at least ${min_length} chars`);
        }

        assert.isNumber(entry.rank, 'rank must be integer');
    }
}
// Registers the 'Yandex' suite with a single test case.
describe('Yandex', function () {
    // A regular function (not an arrow) is used because the timeout is set through `this`.
    const suite_timeout_ms = 10000;
    this.timeout(suite_timeout_ms);

    it('static yandex searches with ads', yandex_ads);
});

View File

@ -125,7 +125,7 @@ function test_case_no_results(response) {
assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'no results should be a empty string');
assert.isEmpty(obj.num_results, 'num_results should be a empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}