forked from extern/se-scraper
remove cheerio from parsing
This commit is contained in:
parent
52a2ec7b33
commit
95a5ee56d8
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,7 +1,7 @@
|
||||
# ignore static tests
|
||||
|
||||
test/static_tests/
|
||||
test/static_tests/*
|
||||
test/static_tests/html/
|
||||
test/static_tests/html/*
|
||||
|
||||
.idea
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
'use strict';
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
const common = require('./common.js');
|
||||
@ -10,161 +11,218 @@ class GoogleScraper extends Scraper {
|
||||
super(...args);
|
||||
}
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
async parse_async(html) {
|
||||
|
||||
const results = [];
|
||||
$('#center_col .g').each((i, link) => {
|
||||
let obj = {
|
||||
link: $(link).find('.r a').attr('href'),
|
||||
title: $(link).find('.r a h3').text(),
|
||||
snippet: $(link).find('span.st').text(),
|
||||
visible_link: $(link).find('.r cite').text(),
|
||||
date: $(link).find('span.f').text() || '',
|
||||
const results = await this.page.evaluate(() => {
|
||||
|
||||
let _text = (el, s) => {
|
||||
let n = el.querySelector(s);
|
||||
|
||||
if (n) {
|
||||
return n.innerText;
|
||||
} else {
|
||||
return '';
|
||||
}
|
||||
};
|
||||
|
||||
if (obj.date) {
|
||||
obj.date = obj.date.replace(' - ', '');
|
||||
let _attr = (el, s, attr) => {
|
||||
let n = el.querySelector(s);
|
||||
|
||||
if (n) {
|
||||
return n.getAttribute(attr);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
let results = {
|
||||
num_results: '',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
right_info: {},
|
||||
results: [],
|
||||
top_products: [],
|
||||
right_products: [],
|
||||
top_ads: [],
|
||||
bottom_ads: [],
|
||||
places: [],
|
||||
};
|
||||
|
||||
let num_results_el = document.getElementById('resultStats');
|
||||
|
||||
if (num_results_el) {
|
||||
results.num_results = num_results_el.innerText;
|
||||
}
|
||||
|
||||
results.push(obj);
|
||||
});
|
||||
let organic_results = document.querySelectorAll('#center_col .g');
|
||||
|
||||
// parse ads
|
||||
let parseAds = (storage, selector) => {
|
||||
$(selector).each((i, element) => {
|
||||
let obj = {
|
||||
visible_link: $(element).find('.ads-visurl cite').text(),
|
||||
tracking_link: $(element).find('a:first-child').attr('href'),
|
||||
link: $(element).find('a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('a h3').text(),
|
||||
snippet: $(element).find('.ads-creative').text(),
|
||||
links: [],
|
||||
organic_results.forEach((el) => {
|
||||
|
||||
let serp_obj = {
|
||||
link: _attr(el, '.r a', 'href'),
|
||||
title: _text(el, '.r a h3'),
|
||||
snippet: _text(el, 'span.st'),
|
||||
visible_link: _text(el, '.r cite'),
|
||||
date: _text(el, 'span.f'),
|
||||
};
|
||||
$(element).find('ul li a').each((i, el) => {
|
||||
obj.links.push({
|
||||
tracking_link: $(el).attr('data-arwt'),
|
||||
link: $(el).attr('href'),
|
||||
title: $(el).text(),
|
||||
})
|
||||
});
|
||||
storage.push(obj);
|
||||
|
||||
if (serp_obj.date) {
|
||||
serp_obj.date = serp_obj.date.replace(' - ', '');
|
||||
}
|
||||
|
||||
results.results.push(serp_obj);
|
||||
});
|
||||
};
|
||||
|
||||
const top_ads = [];
|
||||
const bottomads = [];
|
||||
// check if no results
|
||||
results.no_results = (results.results.length === 0);
|
||||
|
||||
parseAds(top_ads, '#tads .ads-ad');
|
||||
parseAds(bottomads, '#tadsb .ads-ad');
|
||||
let parseAds = (container, selector) => {
|
||||
document.querySelectorAll(selector).forEach((el) => {
|
||||
let ad_obj = {
|
||||
visible_link: _text(el, '.ads-visurl cite'),
|
||||
tracking_link: _attr(el, 'a:first-child', 'href'),
|
||||
link: _attr(el, 'a:nth-child(2)', 'href'),
|
||||
title: _text(el, 'a h3'),
|
||||
snippet: _text(el, '.ads-creative'),
|
||||
links: [],
|
||||
};
|
||||
el.querySelectorAll('ul li a').forEach((node) => {
|
||||
ad_obj.links.push({
|
||||
tracking_link: node.getAttribute('data-arwt'),
|
||||
link: node.getAttribute('href'),
|
||||
title: node.innerText,
|
||||
})
|
||||
});
|
||||
container.push(ad_obj);
|
||||
});
|
||||
};
|
||||
|
||||
// parse google places
|
||||
const places = [];
|
||||
$('.rllt__link').each((i, element) => {
|
||||
places.push({
|
||||
heading: $(element).find('[role="heading"] span').text(),
|
||||
rating: $(element).find('.rllt__details div:first-child').text(),
|
||||
contact: $(element).find('.rllt__details div:nth-child(2)').text(),
|
||||
hours: $(element).find('.rllt__details div:nth-child(3)').text(),
|
||||
})
|
||||
parseAds(results.top_ads, '#tads .ads-ad');
|
||||
parseAds(results.bottom_ads, '#tadsb .ads-ad');
|
||||
|
||||
// parse google places
|
||||
document.querySelectorAll('.rllt__link').forEach((el) => {
|
||||
results.places.push({
|
||||
heading: _text(el, '[role="heading"] span'),
|
||||
rating: _text(el, '.rllt__details div:first-child'),
|
||||
contact: _text(el, '.rllt__details div:nth-child(2)'),
|
||||
hours: _text(el, '.rllt__details div:nth-child(3)'),
|
||||
})
|
||||
});
|
||||
|
||||
// parse right side product information
|
||||
results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');
|
||||
|
||||
let title_el = document.querySelector('#rhs .cu-container g-review-stars');
|
||||
if (title_el) {
|
||||
results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
|
||||
}
|
||||
|
||||
let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
|
||||
if (num_reviews_el) {
|
||||
results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
|
||||
}
|
||||
|
||||
results.right_info.vendors = [];
|
||||
results.right_info.info = _text(document, '#rhs_block > div > div > div > div:nth-child(5) > div > div');
|
||||
|
||||
document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
|
||||
results.right_info.vendors.push({
|
||||
price: _text(el, 'span:nth-of-type(1)'),
|
||||
merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
|
||||
merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
|
||||
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
|
||||
source_name: _text(el, 'span:nth-child(4) a'),
|
||||
source_link: _attr(el, 'span:nth-child(4) a', 'href'),
|
||||
info: _text(el, 'div span'),
|
||||
shipping: _text(el, 'span:last-child > span'),
|
||||
})
|
||||
});
|
||||
|
||||
if (!results.right_info.title) {
|
||||
results.right_info = {};
|
||||
}
|
||||
|
||||
let right_side_info_el = document.getElementById('rhs');
|
||||
|
||||
if (right_side_info_el) {
|
||||
let right_side_info_text = right_side_info_el.innerText;
|
||||
|
||||
if (right_side_info_text && right_side_info_text.length > 0) {
|
||||
results.right_side_info_text = right_side_info_text;
|
||||
}
|
||||
}
|
||||
|
||||
// parse top main column product information
|
||||
// #tvcap .pla-unit
|
||||
document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
|
||||
let top_product = {
|
||||
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
|
||||
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
|
||||
title: _text(el, '.pla-unit-title a:nth-child(2) span'),
|
||||
price: _text(el, '.pla-unit-title + div'),
|
||||
shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
|
||||
vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'),
|
||||
};
|
||||
|
||||
let merchant_node = el.querySelector('.pla-unit-title');
|
||||
if (merchant_node) {
|
||||
let node = merchant_node.parentNode.querySelector('div > span');
|
||||
if (node) {
|
||||
top_product.merchant_name = node.innerText;
|
||||
}
|
||||
}
|
||||
|
||||
results.top_products.push(top_product);
|
||||
});
|
||||
|
||||
// parse top right product information
|
||||
// #tvcap .pla-unit
|
||||
document.querySelectorAll('#rhs_block .pla-unit').forEach((el) => {
|
||||
let right_product = {
|
||||
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
|
||||
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
|
||||
title: _text(el, '.pla-unit-title a:nth-child(2) span:first-child'),
|
||||
price: _text(el,'.pla-unit-title + div'),
|
||||
shipping: _text(el,'.pla-extensions-container > div'),
|
||||
vendor_link: _text(el,'.pla-extensions-container div > a'),
|
||||
vendor_name: _text(el,'.pla-extensions-container div > a > div'),
|
||||
};
|
||||
|
||||
let merchant_node = el.querySelector('.pla-unit-title');
|
||||
if (merchant_node) {
|
||||
let node = merchant_node.parentNode.querySelector('div > span:first-child');
|
||||
if (node) {
|
||||
right_product.merchant_name = node.innerText;
|
||||
}
|
||||
}
|
||||
|
||||
results.right_products.push(right_product);
|
||||
});
|
||||
|
||||
let effective_query_el = document.getElementById('fprsl');
|
||||
|
||||
if (effective_query_el) {
|
||||
|
||||
results.effective_query = effective_query_el.innerText;
|
||||
if (!results.effective_query) {
|
||||
let effective_query_el2 = document.querySelector('#fprs a');
|
||||
if (effective_query_el2) {
|
||||
results.effective_query = document.querySelector('#fprs a').innerText;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
// parse right side product information
|
||||
var right_side_info = {};
|
||||
right_side_info.review = $('#rhs .cu-container g-review-stars span').attr('aria-label');
|
||||
right_side_info.title = $('#rhs .cu-container g-review-stars').parent().find('div:first-child').text();
|
||||
right_side_info.num_reviews = $('#rhs .cu-container g-review-stars').parent().find('div:nth-of-type(2)').text();
|
||||
right_side_info.vendors = [];
|
||||
right_side_info.info = $('#rhs_block > div > div > div > div:nth-child(5) > div > div').text();
|
||||
// clean some results
|
||||
results.top_products = this.clean_results(results.top_products, ['title', 'link']);
|
||||
results.right_products = this.clean_results(results.right_products, ['title', 'link']);
|
||||
results.results = this.clean_results(results.results, ['title', 'link' , 'snippet']);
|
||||
|
||||
$('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').each((i, element) => {
|
||||
right_side_info.vendors.push({
|
||||
price: $(element).find('span:nth-of-type(1)').text(),
|
||||
merchant_name: $(element).find('span:nth-child(3) a:nth-child(2)').text(),
|
||||
merchant_ad_link: $(element).find('span:nth-child(3) a:first-child').attr('href'),
|
||||
merchant_link: $(element).find('span:nth-child(3) a:nth-child(2)').attr('href'),
|
||||
source_name: $(element).find('span:nth-child(4) a').text(),
|
||||
source_link: $(element).find('span:nth-child(4) a').attr('href'),
|
||||
info: $(element).find('div span').text(),
|
||||
shipping: $(element).find('span:last-child > span').text(),
|
||||
})
|
||||
});
|
||||
|
||||
if (!right_side_info.title) {
|
||||
right_side_info = {};
|
||||
}
|
||||
|
||||
let right_side_info_text = $('#rhs').text();
|
||||
|
||||
// parse top main column product information
|
||||
// #tvcap .pla-unit
|
||||
var top_products = [];
|
||||
$('#tvcap .pla-unit').each((i, element) => {
|
||||
top_products.push({
|
||||
tracking_link: $(element).find('.pla-unit-title a:first-child').attr('href'),
|
||||
link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('.pla-unit-title a:nth-child(2) span').text(),
|
||||
price: $(element).find('.pla-unit-title + div').text(),
|
||||
merchant_name: $(element).find('.pla-unit-title').parent().find('div > span').text(),
|
||||
shipping: $(element).find('.pla-extensions-container div:nth-of-type(1)').text(),
|
||||
vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'),
|
||||
})
|
||||
});
|
||||
|
||||
top_products = this.clean_results(top_products, ['title', 'link']);
|
||||
|
||||
// parse top right product information
|
||||
// #tvcap .pla-unit
|
||||
var right_products = [];
|
||||
$('#rhs_block .pla-unit').each((i, element) => {
|
||||
right_products.push({
|
||||
tracking_link: $(element).find('.pla-unit-title a:first-child').attr('href'),
|
||||
link: $(element).find('.pla-unit-title a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('.pla-unit-title a:nth-child(2) span:first-child').first().text(),
|
||||
price: $(element).find('.pla-unit-title + div').text(),
|
||||
merchant_name: $(element).find('.pla-unit-title').parent().find('div > span:first-child').text(),
|
||||
shipping: $(element).find('.pla-extensions-container > div').text(),
|
||||
vendor_link: $(element).find('.pla-extensions-container div > a').attr('href'),
|
||||
vendor_name: $(element).find('.pla-extensions-container div > a > div').text(),
|
||||
})
|
||||
});
|
||||
|
||||
right_products = this.clean_results(right_products, ['title', 'link']);
|
||||
|
||||
// 'Ergebnisse für', 'Showing results for'
|
||||
let no_results = this.no_results(
|
||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||
'No results found for'],
|
||||
$('#main').text()
|
||||
);
|
||||
|
||||
let effective_query = $('#fprsl').text() || '';
|
||||
if (!effective_query) {
|
||||
effective_query = $('#fprs a').text()
|
||||
}
|
||||
|
||||
const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']);
|
||||
|
||||
let res_obj = {
|
||||
time: (new Date()).toUTCString(),
|
||||
num_results: $('#resultStats').text(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
right_info: right_side_info,
|
||||
results: cleaned,
|
||||
top_products: top_products,
|
||||
right_products: right_products,
|
||||
top_ads: top_ads,
|
||||
bottom_ads: bottomads,
|
||||
places: places,
|
||||
};
|
||||
|
||||
if (right_side_info_text && right_side_info_text.length > 0) {
|
||||
res_obj.right_side_info_text = right_side_info_text;
|
||||
}
|
||||
|
||||
return res_obj;
|
||||
results.time = (new Date()).toUTCString();
|
||||
return results;
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
|
15
test/static_tests/README.md
Normal file
15
test/static_tests/README.md
Normal file
@ -0,0 +1,15 @@
|
||||
## Test with static HTML
|
||||
|
||||
Dynamic testing of se-scraper takes too much time.
|
||||
|
||||
Save some html and initialize se-scraper by loading the search from disk.
|
||||
|
||||
### Disadvantage
|
||||
|
||||
static html gets outdated after some time
|
||||
|
||||
### Advantages
|
||||
|
||||
1. Let's us test corner cases that are missed easily
|
||||
2. Testing is not reliable, since search engines do not always return the same results for the same query
|
||||
3. As said, much faster
|
168
test/static_tests/bing.js
Normal file
168
test/static_tests/bing.js
Normal file
@ -0,0 +1,168 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function bing_ads() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: ['kaffeemaschine kaufen'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
bing_search_with_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['best cloud services'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');
|
||||
|
||||
bing_search_with_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['car tires cheap'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');
|
||||
|
||||
bing_search_with_ads3( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function bing_search_with_ads(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '1’100’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function bing_search_with_ads2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '44’300’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function bing_search_with_ads3(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '65.500.000 results', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
|
||||
describe('Bing', function(){
|
||||
this.timeout(15000);
|
||||
it('static bing searches with ads', bing_ads);
|
||||
});
|
173
test/static_tests/clean_html_test.js
Normal file
173
test/static_tests/clean_html_test.js
Normal file
@ -0,0 +1,173 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
|
||||
async function test_html_output() {
|
||||
let config = {
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
html_output: true,
|
||||
// whether to strip JS and CSS from the html_output
|
||||
// has only an effect if `html_output` is true
|
||||
clean_html_output: true,
|
||||
// remove all data images from the html
|
||||
clean_data_images: true,
|
||||
// test compression
|
||||
compress: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: ['kaffeemaschine kaufen'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
var response = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleaned = await scraper.scrape(scrape_config);
|
||||
|
||||
test(response, response_no_cleaned, 'bing');
|
||||
|
||||
scrape_config.search_engine = 'google';
|
||||
scrape_config.keywords = ['rückspiegel schwarz'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
|
||||
scrape_config.clean_html_output = true;
|
||||
scrape_config.clean_data_images = true;
|
||||
|
||||
var responseGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
test(responseGoogle, response_no_cleanedGoogle, 'google');
|
||||
|
||||
|
||||
scrape_config.keywords = ['cloud services'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
|
||||
scrape_config.clean_html_output = true;
|
||||
scrape_config.clean_data_images = true;
|
||||
|
||||
var responseGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
test(responseGoogle, response_no_cleanedGoogle, 'google');
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
function test(response, response_no_cleaned, se='google') {
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
let obj = response.results[query][page_number];
|
||||
let obj_no_cleaned = response_no_cleaned.results[query][page_number];
|
||||
|
||||
console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length);
|
||||
console.log('html length of cleaned SERP: ' + obj.html.length);
|
||||
|
||||
assert.isOk(obj.html, 'Html must be ok!');
|
||||
assert.isAtLeast(obj.html.length, 100, 'html must be a length string');
|
||||
|
||||
assert.isOk(obj_no_cleaned.html, 'Html must be ok!');
|
||||
assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a length string');
|
||||
|
||||
assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');
|
||||
|
||||
// test that we can parse the html of both the cleaned and no cleaned versions
|
||||
// with cheerio and that serp results are roughly the same
|
||||
|
||||
const cleaned$ = cheerio.load(obj.html);
|
||||
const no_cleaned$ = cheerio.load(obj_no_cleaned.html);
|
||||
|
||||
var resCleaned = parseResults(cleaned$, se);
|
||||
var resNoCleaned = parseResults(no_cleaned$, se);
|
||||
|
||||
assert.equal(resCleaned.length, resNoCleaned.length);
|
||||
assert.equal(resCleaned.length, obj.results.length);
|
||||
assert.equal(resNoCleaned.length, obj.results.length);
|
||||
|
||||
// unset the rank
|
||||
resCleaned = resCleaned.map((el) => el.rank = undefined);
|
||||
resNoCleaned = resNoCleaned.map((el) => el.rank = undefined);
|
||||
obj.results = obj.results.map((el) => el.rank = undefined);
|
||||
|
||||
assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
|
||||
assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results');
|
||||
assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function parseResults(s$, se) {
|
||||
|
||||
var results = [];
|
||||
|
||||
if (se === 'google') {
|
||||
s$('#center_col .g').each((i, link) => {
|
||||
results.push({
|
||||
link: s$(link).find('.r a').attr('href'),
|
||||
title: s$(link).find('.r a').text(),
|
||||
snippet: s$(link).find('span.st').text(),
|
||||
visible_link: s$(link).find('.r cite').text(),
|
||||
date: s$(link).find('span.f').text() || '',
|
||||
})
|
||||
});
|
||||
|
||||
} else if (se === 'bing') {
|
||||
s$('#b_content #b_results .b_algo').each((i, link) => {
|
||||
results.push({
|
||||
link: s$(link).find('h2 a').attr('href'),
|
||||
title: s$(link).find('h2').text(),
|
||||
snippet: s$(link).find('.b_caption p').text(),
|
||||
visible_link: s$(link).find('cite').text(),
|
||||
})
|
||||
});
|
||||
} else {
|
||||
throw "no such search engine";
|
||||
}
|
||||
|
||||
results = clean_results(results, ['title', 'link', 'snippet']);
|
||||
return results;
|
||||
}
|
||||
|
||||
function clean_results(results, attributes) {
|
||||
const cleaned = [];
|
||||
var rank = 1;
|
||||
for (var res of results) {
|
||||
let goodboy = true;
|
||||
for (var attr of attributes) {
|
||||
if (!res[attr] || !res[attr].trim()) {
|
||||
goodboy = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (goodboy) {
|
||||
res.rank = rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
describe('html output', function(){
|
||||
this.timeout(15000);
|
||||
it('static html output test', test_html_output);
|
||||
});
|
24
test/static_tests/compression.js
Normal file
24
test/static_tests/compression.js
Normal file
@ -0,0 +1,24 @@
|
||||
'use strict';
|
||||
const zlib = require('zlib');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];
|
||||
|
||||
for (var file of files) {
|
||||
var html = fs.readFileSync(path.resolve(__dirname, './html/' + file));
|
||||
|
||||
var compressed = zlib.gzipSync(html);
|
||||
var deflated = zlib.deflateSync(html);
|
||||
|
||||
var compressed_encoded = compressed.toString('base64');
|
||||
var deflated_encoded = deflated.toString('base64');
|
||||
|
||||
console.log(file)
|
||||
console.log('Normal length: ' + html.length/1000);
|
||||
console.log('GZIP Compressed length: ' + compressed.length/1000);
|
||||
console.log('Deflate Compressed length: ' + deflated.length/1000);
|
||||
console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length/1000);
|
||||
console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length/1000);
|
||||
console.log('------\n')
|
||||
}
|
99
test/static_tests/duckduckgo.js
Normal file
99
test/static_tests/duckduckgo.js
Normal file
@ -0,0 +1,99 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function duckduckgo() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: ['cloud service'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
duckduckgo_normal( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
function duckduckgo_normal(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
|
||||
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
|
||||
describe('Duckduckgo', function(){
|
||||
this.timeout(10000);
|
||||
it('static duckduckgo sarch', duckduckgo);
|
||||
});
|
410
test/static_tests/google.js
Normal file
410
test/static_tests/google.js
Normal file
@ -0,0 +1,410 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ['rückspiegel schwarz'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
google_search_with_products( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
|
||||
scrape_config.keywords = ['autoreifen mercedes c-klasse'];
|
||||
|
||||
google_search_with_products2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
|
||||
scrape_config.keywords = ['kaffeemaschine kaufen'];
|
||||
|
||||
google_places( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
|
||||
scrape_config.keywords = ['MODEL MARKET SW18 4ES'];
|
||||
|
||||
right_side_info_text( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
|
||||
scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];
|
||||
|
||||
right_side_info_text2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
|
||||
scrape_config.keywords = ['car tires for sale'];
|
||||
|
||||
google_places_and_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
|
||||
scrape_config.keywords = ['bmw felgen'];
|
||||
|
||||
google_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function google_search_with_products(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '1’780’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function google_search_with_products2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 4, 'there are 4 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function google_places(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 9 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.equal(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
assert.equal(obj.places.length, 3, 'there are 3 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function right_side_info_text(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
|
||||
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
|
||||
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
|
||||
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function right_side_info_text2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
|
||||
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
|
||||
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
|
||||
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function google_places_and_ads(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 13, 'there are 13 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
assert.equal(obj.places.length, 2, 'there are 2 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function google_ads2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 3, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there must be 0 top products');
|
||||
assert.equal(obj.right_products.length, 9, 'there are 9 right products');
|
||||
assert.equal(obj.places.length, 0, 'there are 0 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.top_products) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_products) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.places) {
|
||||
assert.isOk(res.heading, 'heading must be ok');
|
||||
assert.typeOf(res.heading, 'string', 'heading must be string');
|
||||
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.rating, 'rating must be ok');
|
||||
assert.typeOf(res.rating, 'string', 'rating must be string');
|
||||
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.contact, 'contact must be ok');
|
||||
assert.typeOf(res.contact, 'string', 'contact must be string');
|
||||
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
|
||||
|
||||
assert.typeOf(res.hours, 'string', 'hours must be string');
|
||||
if (res.hours) {
|
||||
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google', function() {
|
||||
this.timeout(25000);
|
||||
it('static google searches with products,ads and places', normal_search_test);
|
||||
});
|
213
test/static_tests/second_google.js
Normal file
213
test/static_tests/second_google.js
Normal file
@ -0,0 +1,213 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ['in.linkedin.com/in/altanai'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
google_test_title( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function google_test_title(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '7.600', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
assert.equal( obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn' );
|
||||
assert.equal( obj.results[1].title, 'ALTANAI BISHT | LinkedIn' );
|
||||
assert.equal( obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn' );
|
||||
assert.equal( obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn' );
|
||||
assert.equal( obj.results[4].title, 'ALTANAI BISHT | LinkedIn' );
|
||||
assert.equal( obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');
|
||||
|
||||
assert.equal (obj.results[0].date, '27.07.2016');
|
||||
assert.equal( obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');
|
||||
|
||||
assert.equal (obj.results[2].date, '27.07.2016');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.top_products) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_products) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.places) {
|
||||
assert.isOk(res.heading, 'heading must be ok');
|
||||
assert.typeOf(res.heading, 'string', 'heading must be string');
|
||||
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.rating, 'rating must be ok');
|
||||
assert.typeOf(res.rating, 'string', 'rating must be string');
|
||||
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.contact, 'contact must be ok');
|
||||
assert.typeOf(res.contact, 'string', 'contact must be string');
|
||||
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
|
||||
|
||||
assert.typeOf(res.hours, 'string', 'hours must be string');
|
||||
if (res.hours) {
|
||||
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google2', function(){
|
||||
this.timeout(10000);
|
||||
it('static google searches testing various details', normal_search_test);
|
||||
});
|
114
test/static_tests/yandex.js
Normal file
114
test/static_tests/yandex.js
Normal file
@ -0,0 +1,114 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Drives the static yandex ad-parsing tests.
 *
 * Starts one ScrapeManager, feeds it two saved SERP fixture files
 * (yandex1.html / yandex2.html) via `scrape_from_file`, and hands each
 * scrape response to its dedicated verification callback before
 * shutting the manager down.
 */
async function yandex_ads() {
    const browserConfig = {
        compress: false,
        debug_level: 1,
        headless: true,
    };

    const jobConfig = {
        search_engine: 'yandex',
        keywords: ['cloud service'],
        num_pages: 1,
        scrape_from_file: `file://${path.join(__dirname, './html/yandex1.html')}`,
    };

    const manager = new se_scraper.ScrapeManager(browserConfig);
    await manager.start();

    // First fixture: 'cloud service' query, verified by the first callback.
    yandex_search_with_ads(await manager.scrape(jobConfig));

    // Second fixture reuses the same job config with a new keyword and file.
    jobConfig.keywords = ['car tires cheap'];
    jobConfig.scrape_from_file = `file://${path.join(__dirname, './html/yandex2.html')}`;

    yandex_search_with_ads2(await manager.scrape(jobConfig));

    await manager.quit();
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
/**
 * Verification callback for the first yandex fixture ('cloud service').
 *
 * Checks that exactly one request was made, then walks every
 * query -> page_number -> result-page object in the response and
 * asserts the expected result count, key set and metadata, delegating
 * per-entry field checks to confirm_results_ok().
 */
function yandex_search_with_ads(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (const query of Object.keys(response.results)) {
        const pages = response.results[query];

        for (const page_number of Object.keys(pages)) {
            // Page keys arrive as strings; they must parse to numbers.
            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            const obj = pages[page_number];

            assert.include(obj.num_results, '2 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}
|
||||
|
||||
|
||||
/**
 * Verification callback for the second yandex fixture ('car tires cheap').
 *
 * Same structural checks as yandex_search_with_ads, but with the counts
 * expected for this fixture (at least 11 results, '5 million results').
 *
 * Fix: the result-count assertion message claimed "at least 12 SERP
 * objects" while the threshold actually checked is 11; the message now
 * matches the check so failures report the real expectation.
 */
function yandex_search_with_ads2(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            // Page keys arrive as strings; they must parse to numbers.
            assert.isNumber(parseInt(page_number), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '5 million results', 'num results not included');
            assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 11, 'results must have at least 11 SERP objects');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}
|
||||
|
||||
|
||||
/**
 * Asserts the common shape of every organic SERP entry in a parsed page:
 * the full key set, a string of a minimum plausible length for each text
 * field, and a numeric rank.
 */
function confirm_results_ok(obj) {
    // Minimum plausible length for each required string field.
    const minLengths = {
        link: 5,
        visible_link: 5,
        title: 10,
        snippet: 10,
    };

    obj.results.forEach((res) => {
        assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        for (const [field, minLen] of Object.entries(minLengths)) {
            assert.isOk(res[field], `${field} must be ok`);
            assert.typeOf(res[field], 'string', `${field} must be string`);
            assert.isAtLeast(res[field].length, minLen, `${field} must have at least ${minLen} chars`);
        }

        assert.isNumber(res.rank, 'rank must be integer');
    });
}
|
||||
|
||||
// Mocha suite: runs the static (fixture-file based) Yandex ad-parsing tests.
// Uses a classic function expression (not an arrow) so `this.timeout` binds
// to the mocha suite context.
describe('Yandex', function(){
    // Parsing local static HTML should be quick; abort a hung run after 10s.
    this.timeout(10000);
    it('static yandex searches with ads', yandex_ads);
});
|
@ -125,7 +125,7 @@ function test_case_no_results(response) {
|
||||
|
||||
assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
|
||||
assert.equal(obj.no_results, true, 'no results should be true');
|
||||
assert.isEmpty(obj.num_results, 'no results should be a empty string');
|
||||
assert.isEmpty(obj.num_results, 'num_results should be a empty string');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user