mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 23:23:07 +01:00
test: remove legacy tests
This commit is contained in:
parent
3ab8e46126
commit
f192e4ebb4
@ -5,7 +5,7 @@
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "mocha test/static_tests/"
|
||||
"test": "mocha test test/modules"
|
||||
},
|
||||
"keywords": [
|
||||
"scraping",
|
||||
|
@ -1,66 +0,0 @@
|
||||
/**
|
||||
|
||||
Test server with:
|
||||
|
||||
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"browser_config": {
|
||||
"random_user_agent": true
|
||||
},
|
||||
"scrape_config": {
|
||||
"search_engine": "google",
|
||||
"keywords": ["test"],
|
||||
"num_pages": 1
|
||||
}
|
||||
}'
|
||||
|
||||
*/
|
||||
|
||||
const se_scraper = require('../index.js');
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
|
||||
// Constants
|
||||
const PORT = process.env.PORT || 3000;
|
||||
const HOST = process.env.HOST || '0.0.0.0';
|
||||
|
||||
// App
|
||||
const app = express();
|
||||
app.use(express.json());
|
||||
|
||||
let browser_config = {
|
||||
random_user_agent: true,
|
||||
headless : true,
|
||||
debug_level: 1,
|
||||
sleep_range: '',
|
||||
puppeteer_cluster_config: {
|
||||
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||
monitor: false,
|
||||
concurrency: 1, // one scraper per tab
|
||||
maxConcurrency: 1, // scrape with 5 tabs
|
||||
}
|
||||
};
|
||||
|
||||
app.post('/', async (req, res) => {
|
||||
if (!req.body.browser_config || !req.body.scrape_config) {
|
||||
res.json({
|
||||
'status': 400,
|
||||
'msg': 'please specify browser_config and scrape_config'
|
||||
});
|
||||
} else {
|
||||
// overwrite standard browser config
|
||||
Object.assign(browser_config, req.body.browser_config);
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
await scraper.start();
|
||||
var results = await scraper.scrape(req.body.scrape_config);
|
||||
// console.dir(results, {depth: null, colors: true});
|
||||
await scraper.quit();
|
||||
|
||||
res.send(results);
|
||||
}
|
||||
});
|
||||
|
||||
app.listen(PORT, HOST);
|
||||
|
||||
console.log(`Running on http://${HOST}:${PORT}`);
|
@ -1,15 +0,0 @@
|
||||
## Test with static HTML
|
||||
|
||||
Dynamic testing of se-scraper takes too much time.
|
||||
|
||||
Save some html and initialize se-scraper by loading the search from disk.
|
||||
|
||||
### Disadvantage
|
||||
|
||||
static html gets outdated after some time
|
||||
|
||||
### Advantages
|
||||
|
||||
1. Let's us test corner cases that are missed easily
|
||||
2. Testing is not reliable, since search engines do not always return the same results for the same query
|
||||
3. As said, much faster
|
@ -1,222 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function bing_ads() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: ['kaffeemaschine kaufen'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
bing_search_with_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['best cloud services'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing2.html');
|
||||
|
||||
bing_search_with_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['car tires cheap'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing3.html');
|
||||
|
||||
bing_search_with_ads3( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['service auto garage'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');
|
||||
|
||||
bing_search_with_ads4( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function bing_search_with_ads(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '1’100’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
|
||||
|
||||
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function bing_search_with_ads2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '44’300’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
|
||||
|
||||
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function bing_search_with_ads3(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function bing_search_with_ads4(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_side_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
|
||||
describe('Bing', function(){
|
||||
this.timeout(15000);
|
||||
it('static bing searches with ads', bing_ads);
|
||||
});
|
@ -1,173 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
|
||||
async function test_html_output() {
|
||||
let config = {
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
html_output: true,
|
||||
// whether to strip JS and CSS from the html_output
|
||||
// has only an effect if `html_output` is true
|
||||
clean_html_output: true,
|
||||
// remove all data images from the html
|
||||
clean_data_images: true,
|
||||
// test compression
|
||||
compress: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: ['kaffeemaschine kaufen'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
var response = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleaned = await scraper.scrape(scrape_config);
|
||||
|
||||
test(response, response_no_cleaned, 'bing');
|
||||
|
||||
scrape_config.search_engine = 'google';
|
||||
scrape_config.keywords = ['rückspiegel schwarz'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
|
||||
scrape_config.clean_html_output = true;
|
||||
scrape_config.clean_data_images = true;
|
||||
|
||||
var responseGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
test(responseGoogle, response_no_cleanedGoogle, 'google');
|
||||
|
||||
|
||||
scrape_config.keywords = ['cloud services'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
|
||||
scrape_config.clean_html_output = true;
|
||||
scrape_config.clean_data_images = true;
|
||||
|
||||
var responseGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
scrape_config.clean_html_output = false;
|
||||
scrape_config.clean_data_images = false;
|
||||
|
||||
var response_no_cleanedGoogle = await scraper.scrape(scrape_config);
|
||||
|
||||
test(responseGoogle, response_no_cleanedGoogle, 'google');
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
function test(response, response_no_cleaned, se='google') {
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
let obj = response.results[query][page_number];
|
||||
let obj_no_cleaned = response_no_cleaned.results[query][page_number];
|
||||
|
||||
console.log('html length of no cleaned SERP: ' + obj_no_cleaned.html.length);
|
||||
console.log('html length of cleaned SERP: ' + obj.html.length);
|
||||
|
||||
assert.isOk(obj.html, 'Html must be ok!');
|
||||
assert.isAtLeast(obj.html.length, 100, 'html must be a length string');
|
||||
|
||||
assert.isOk(obj_no_cleaned.html, 'Html must be ok!');
|
||||
assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a length string');
|
||||
|
||||
assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');
|
||||
|
||||
// test that we can parse the html of both the cleaned and no cleaned versions
|
||||
// with cheerio and that serp results are roughly the same
|
||||
|
||||
const cleaned$ = cheerio.load(obj.html);
|
||||
const no_cleaned$ = cheerio.load(obj_no_cleaned.html);
|
||||
|
||||
var resCleaned = parseResults(cleaned$, se);
|
||||
var resNoCleaned = parseResults(no_cleaned$, se);
|
||||
|
||||
assert.equal(resCleaned.length, resNoCleaned.length);
|
||||
assert.equal(resCleaned.length, obj.results.length);
|
||||
assert.equal(resNoCleaned.length, obj.results.length);
|
||||
|
||||
// unset the rank
|
||||
resCleaned = resCleaned.map((el) => el.rank = undefined);
|
||||
resNoCleaned = resNoCleaned.map((el) => el.rank = undefined);
|
||||
obj.results = obj.results.map((el) => el.rank = undefined);
|
||||
|
||||
assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
|
||||
assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should be equal to se-scraper results');
|
||||
assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should be equal to se-scraper results');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function parseResults(s$, se) {
|
||||
|
||||
var results = [];
|
||||
|
||||
if (se === 'google') {
|
||||
s$('#center_col .g').each((i, link) => {
|
||||
results.push({
|
||||
link: s$(link).find('.r a').attr('href'),
|
||||
title: s$(link).find('.r a').text(),
|
||||
snippet: s$(link).find('span.st').text(),
|
||||
visible_link: s$(link).find('.r cite').text(),
|
||||
date: s$(link).find('span.f').text() || '',
|
||||
})
|
||||
});
|
||||
|
||||
} else if (se === 'bing') {
|
||||
s$('#b_content #b_results .b_algo').each((i, link) => {
|
||||
results.push({
|
||||
link: s$(link).find('h2 a').attr('href'),
|
||||
title: s$(link).find('h2').text(),
|
||||
snippet: s$(link).find('.b_caption p').text(),
|
||||
visible_link: s$(link).find('cite').text(),
|
||||
})
|
||||
});
|
||||
} else {
|
||||
throw "no such search engine";
|
||||
}
|
||||
|
||||
results = clean_results(results, ['title', 'link', 'snippet']);
|
||||
return results;
|
||||
}
|
||||
|
||||
function clean_results(results, attributes) {
|
||||
const cleaned = [];
|
||||
var rank = 1;
|
||||
for (var res of results) {
|
||||
let goodboy = true;
|
||||
for (var attr of attributes) {
|
||||
if (!res[attr] || !res[attr].trim()) {
|
||||
goodboy = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (goodboy) {
|
||||
res.rank = rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
describe('html output', function(){
|
||||
this.timeout(15000);
|
||||
it('static html output test', test_html_output);
|
||||
});
|
@ -1,24 +0,0 @@
|
||||
'use strict';
|
||||
const zlib = require('zlib');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
var files = ['google.html', 'google2.html', 'google3.html', 'bing.html', 'bing2.html'];
|
||||
|
||||
for (var file of files) {
|
||||
var html = fs.readFileSync(path.resolve(__dirname, './html/' + file));
|
||||
|
||||
var compressed = zlib.gzipSync(html);
|
||||
var deflated = zlib.deflateSync(html);
|
||||
|
||||
var compressed_encoded = compressed.toString('base64');
|
||||
var deflated_encoded = deflated.toString('base64');
|
||||
|
||||
console.log(file)
|
||||
console.log('Normal length: ' + html.length/1000);
|
||||
console.log('GZIP Compressed length: ' + compressed.length/1000);
|
||||
console.log('Deflate Compressed length: ' + deflated.length/1000);
|
||||
console.log('Encoded GZIP Compressed length: ' + compressed_encoded.length/1000);
|
||||
console.log('Encoded Deflate Compressed length: ' + deflated_encoded.length/1000);
|
||||
console.log('------\n')
|
||||
}
|
@ -1,99 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function duckduckgo() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: ['cloud service'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/duckduckgo1.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
duckduckgo_normal( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
function duckduckgo_normal(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'ads',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
|
||||
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
|
||||
describe('Duckduckgo', function(){
|
||||
this.timeout(10000);
|
||||
it('static duckduckgo sarch', duckduckgo);
|
||||
});
|
@ -1,410 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ['rückspiegel schwarz'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/google.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
google_search_with_products( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google2.html');
|
||||
scrape_config.keywords = ['autoreifen mercedes c-klasse'];
|
||||
|
||||
google_search_with_products2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google3.html');
|
||||
scrape_config.keywords = ['kaffeemaschine kaufen'];
|
||||
|
||||
google_places( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google4.html');
|
||||
scrape_config.keywords = ['MODEL MARKET SW18 4ES'];
|
||||
|
||||
right_side_info_text( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google5.html');
|
||||
scrape_config.keywords = ['BRANDON MOTORS HP13 6NR'];
|
||||
|
||||
right_side_info_text2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google6.html');
|
||||
scrape_config.keywords = ['car tires for sale'];
|
||||
|
||||
google_places_and_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google_bmw_felgen.html');
|
||||
scrape_config.keywords = ['bmw felgen'];
|
||||
|
||||
google_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function google_search_with_products(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '1’780’000', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 3, 'there are 3 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 15, 'there are 15 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function google_search_with_products2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '437’000 Ergebnisse (0.41 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 8 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 1, 'there are 1 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 4, 'there are 4 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function google_places(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '6’750’000 Ergebnisse (0.52 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 9 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.equal(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
assert.equal(obj.places.length, 3, 'there are 3 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function right_side_info_text(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '6 Ergebnisse', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
|
||||
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
|
||||
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
|
||||
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function right_side_info_text2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
for (let query in response.results) {
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '5 Ergebnisse', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results',
|
||||
'effective_query', 'top_ads', 'bottom_ads', 'right_side_info_text'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
|
||||
assert.isOk(obj.right_side_info_text.length, 'right_side_info_text should have some data');
|
||||
assert.isAtLeast(obj.right_side_info_text.length, 50, 'right_side_info_text should have some data');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function google_places_and_ads(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '439.000.000 Ergebnisse (0,64 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 13, 'there are 13 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
assert.equal(obj.places.length, 2, 'there are 2 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function google_ads2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, 'Ungefähr 23.200.000 Ergebnisse (0,29 Sekunden)', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
|
||||
assert.equal(obj.top_ads.length, 3, 'there are no top ads');
|
||||
assert.equal(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there must be 0 top products');
|
||||
assert.equal(obj.right_products.length, 9, 'there are 9 right products');
|
||||
assert.equal(obj.places.length, 0, 'there are 0 places');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.top_products) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_products) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 8, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.places) {
|
||||
assert.isOk(res.heading, 'heading must be ok');
|
||||
assert.typeOf(res.heading, 'string', 'heading must be string');
|
||||
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.rating, 'rating must be ok');
|
||||
assert.typeOf(res.rating, 'string', 'rating must be string');
|
||||
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.contact, 'contact must be ok');
|
||||
assert.typeOf(res.contact, 'string', 'contact must be string');
|
||||
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
|
||||
|
||||
assert.typeOf(res.hours, 'string', 'hours must be string');
|
||||
if (res.hours) {
|
||||
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google', function() {
|
||||
this.timeout(25000);
|
||||
it('static google searches with products,ads and places', normal_search_test);
|
||||
});
|
@ -1,213 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ['in.linkedin.com/in/altanai'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/google7.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
google_test_title( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function google_test_title(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '7.600', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
|
||||
assert.isAtLeast(obj.top_ads.length, 0, 'there are no top ads');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 0, 'there are 0 bottom ads');
|
||||
assert.isAtLeast(obj.top_products.length, 0, 'there are 0 top products');
|
||||
assert.equal(obj.right_products.length, 0, 'there are 0 right products');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
|
||||
assert.equal( obj.results[0].title, 'ALTANAI BISHT - SD2 at Voice Engineering - Plivo | LinkedIn' );
|
||||
assert.equal( obj.results[1].title, 'ALTANAI BISHT | LinkedIn' );
|
||||
assert.equal( obj.results[2].title, 'ALTANAI BISHT – SD2 at Voice Engineering – Plivo | LinkedIn' );
|
||||
assert.equal( obj.results[3].title, 'AI AT - South Delhi, Delhi, India | Professional Profile | LinkedIn' );
|
||||
assert.equal( obj.results[4].title, 'ALTANAI BISHT | LinkedIn' );
|
||||
assert.equal( obj.results[9].title, 'Phani Kumar Parasaram - VOIP Expert - Infinite ... - LinkedIn');
|
||||
|
||||
assert.equal (obj.results[0].date, '27.07.2016');
|
||||
assert.equal( obj.results[0].snippet, '27.07.2016 - View ALTANAI BISHT\'S profile on LinkedIn, the world\'s largest professional community. ALTANAI has 6 jobs listed on their profile. See the ...');
|
||||
|
||||
assert.equal (obj.results[2].date, '27.07.2016');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.top_products) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_products) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.places) {
|
||||
assert.isOk(res.heading, 'heading must be ok');
|
||||
assert.typeOf(res.heading, 'string', 'heading must be string');
|
||||
assert.isAtLeast(res.heading.length, 5, 'heading must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.rating, 'rating must be ok');
|
||||
assert.typeOf(res.rating, 'string', 'rating must be string');
|
||||
assert.isAtLeast(res.rating.length, 5, 'rating must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.contact, 'contact must be ok');
|
||||
assert.typeOf(res.contact, 'string', 'contact must be string');
|
||||
assert.isAtLeast(res.contact.length, 5, 'contact must have at least 5 chars');
|
||||
|
||||
assert.typeOf(res.hours, 'string', 'hours must be string');
|
||||
if (res.hours) {
|
||||
assert.isAtLeast(res.hours.length, 10, 'hours must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google2', function(){
|
||||
this.timeout(10000);
|
||||
it('static google searches testing various details', normal_search_test);
|
||||
});
|
@ -1,152 +0,0 @@
|
||||
const se_scraper = require('./../../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
const path = require('path');
|
||||
|
||||
async function yandex_ads() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'yandex',
|
||||
keywords: ['cloud service'],
|
||||
num_pages: 1,
|
||||
scrape_from_file: 'file://' + path.join(__dirname, './html/yandex1.html'),
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
yandex_search_with_ads( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['car tires cheap'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex2.html');
|
||||
|
||||
yandex_search_with_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['купить деревянные окна'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');
|
||||
|
||||
yandex_search_with_ads3( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function yandex_search_with_ads(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '2 million results', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 12, 'results must have at least 12 SERP objects');
|
||||
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function yandex_search_with_ads2(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.include(obj.num_results, '5 million results', 'num results not included');
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 11, 'results must have at least 12 SERP objects');
|
||||
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function yandex_search_with_ads3(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
// console.dir(obj.results, {depth: null, colors: true});
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
// at least 4 ads
|
||||
let cnt = 0;
|
||||
obj.results.forEach((res) => {
|
||||
if (res.is_ad) {
|
||||
cnt++;
|
||||
}
|
||||
});
|
||||
|
||||
assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
}
|
||||
}
|
||||
|
||||
describe('Yandex', function(){
|
||||
this.timeout(10000);
|
||||
it('static yandex searches with ads', yandex_ads);
|
||||
});
|
@ -1,141 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const normal_search_keywords = ['iphone', 'clock'];
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'amazon',
|
||||
num_pages: 1,
|
||||
keywords: normal_search_keywords,
|
||||
};
|
||||
|
||||
console.log('normal_search_test()');
|
||||
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.seller, 'seller must be ok');
|
||||
assert.typeOf(res.seller, 'string', 'seller must be string');
|
||||
assert.isAtLeast(res.seller.length, 5, 'seller must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.stars, 'stars be ok');
|
||||
assert.typeOf(res.stars, 'string', 'stars must be string');
|
||||
assert.isAtLeast(res.stars.length, 5, 'stars must have at least 6 chars');
|
||||
assert.include(res.stars, ' out of ', 'stars must include " out of "');
|
||||
|
||||
assert.isOk(res.num_reviews, 'num_reviews be ok');
|
||||
assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
|
||||
assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 chars');
|
||||
|
||||
assert.isOk(res.price, 'price be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];
|
||||
|
||||
async function no_results_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'amazon',
|
||||
num_pages: 1,
|
||||
keywords: keywords_no_results,
|
||||
};
|
||||
|
||||
console.log('no_results_test()');
|
||||
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_no_results(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert(obj.results.length === 0, 'results must have 0 SERP objects');
|
||||
assert.equal(obj.no_results, true, 'no results should be true');
|
||||
assert.isEmpty(obj.num_results, 'no results should be a empty string');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Amazon', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search test', normal_search_test);
|
||||
it('no results test', no_results_test);
|
||||
});
|
@ -1,87 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const normal_search_keywords = ['mouse', 'cat'];
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'baidu',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 2,
|
||||
};
|
||||
|
||||
console.log('normal_search_test()');
|
||||
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 4);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Baidu', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search test', normal_search_test);
|
||||
});
|
@ -1,271 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
search_engine: 'bing',
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keywords: normal_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 3,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 3,
|
||||
};
|
||||
|
||||
console.log('normal_search_test()');
|
||||
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 6);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'rank'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
if (res.snippet) {
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];
|
||||
|
||||
async function no_results_test() {
|
||||
let config = {
|
||||
search_engine: 'bing',
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keywords: keywords_no_results,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: keywords_no_results,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('no_results_test()');
|
||||
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_no_results(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert(obj.results.length === 0, 'results must have 0 SERP objects');
|
||||
assert.equal(obj.no_results, true, 'no results should be true');
|
||||
assert.isEmpty(obj.num_results, 'no results should be a empty string');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const effective_query_keywords = ['mount everrest'];
|
||||
|
||||
async function effective_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: effective_query_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('effective_query_test()');
|
||||
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_effective_query(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
// effective query must be different to the original keyword
|
||||
assert.isOk(obj.effective_query, 'effective query must be ok');
|
||||
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
|
||||
assert(obj.effective_query !== query, 'effective query must be different from keyword');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
|
||||
async function ads_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'bing',
|
||||
keywords: ads_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('ads_test()');
|
||||
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
function test_case_ads_test(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
// assert.isOk(res.link, 'link must be ok');
|
||||
// assert.typeOf(res.link, 'string', 'link must be string');
|
||||
// assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Bing', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search', normal_search_test);
|
||||
it('no results', no_results_test);
|
||||
it('effective query', effective_query_test);
|
||||
it('finds ads', ads_test);
|
||||
});
|
@ -1,192 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
|
||||
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 2,
|
||||
};
|
||||
|
||||
console.log('normal_search_test()');
|
||||
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const effective_query_keywords = ['mount everrest'];
|
||||
|
||||
async function effective_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: true,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: effective_query_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('test_case_effective_query()');
|
||||
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_effective_query(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
results = response.results;
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
// effective query must be different to the original keyword
|
||||
assert.isOk(obj.effective_query, 'effective query must be ok');
|
||||
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
|
||||
assert(obj.effective_query !== query, 'effective query must be different from keyword');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
|
||||
async function ads_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: ads_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('ads_test()');
|
||||
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
function test_case_ads_test(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
describe('Duckduckgo', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search', normal_search_test);
|
||||
it('effective query', effective_query_test);
|
||||
it('finds ads', ads_test);
|
||||
});
|
@ -1,424 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
|
||||
|
||||
async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 3,
|
||||
};
|
||||
|
||||
console.log('normal_search_test()');
|
||||
normal_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 6);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 8 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
|
||||
|
||||
async function no_results_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: keywords_no_results,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('no_results_test()');
|
||||
test_case_no_results( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_no_results(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.strictEqual(obj.results.length, 0, 'results must have 0 SERP objects');
|
||||
assert.equal(obj.no_results, true, 'no results should be true');
|
||||
assert.isEmpty(obj.num_results, 'num_results should be a empty string');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const effective_query_keywords = ['mount evverrest'];
|
||||
|
||||
async function effective_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: effective_query_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('effective_query_test()');
|
||||
test_case_effective_query( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function test_case_effective_query(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
// effective query must be different to the original keyword
|
||||
assert.isOk(obj.effective_query, 'effective query must be ok');
|
||||
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
|
||||
assert(obj.effective_query !== query, 'effective query must be different from keyword');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 8 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function html_output_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 3,
|
||||
html_output: true,
|
||||
};
|
||||
|
||||
let output = await se_scraper.scrape(config, scrape_config);
|
||||
normal_search_test_case( output );
|
||||
check_html_output_test_case( output );
|
||||
}
|
||||
|
||||
function check_html_output_test_case( response ) {
|
||||
for (let query in response.html_output) {
|
||||
|
||||
assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.html_output[query]) {
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const ads_keywords = ['cloud services', 'auto kaufen'];
|
||||
|
||||
async function ads_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: false, // dont try to trick google with ads
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ads_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('ads_test()');
|
||||
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
function test_case_ads_test(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'visible_link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'links must be array');
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
const product_keywords = ['autoreifen bmw'];
|
||||
|
||||
async function products_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: false, // dont try to trick google with ads
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google',
|
||||
keywords: ads_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('products_test()');
|
||||
test_case_products_test( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
function test_case_products_test(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');
|
||||
|
||||
for (let res of obj.top_products) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
for (let res of obj.right_products) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.price, 'price must be ok');
|
||||
assert.typeOf(res.price, 'string', 'price must be string');
|
||||
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search', normal_search_test);
|
||||
it('no results', no_results_test);
|
||||
it('effective query', effective_query_test);
|
||||
it('html output query', html_output_query_test);
|
||||
it('ads', ads_test);
|
||||
it('products test', products_test);
|
||||
});
|
@ -1,80 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const normal_search_keywords = ['apple', 'rain'];
|
||||
|
||||
async function normal_image_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 0,
|
||||
headless: true,
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'google_image',
|
||||
keywords: normal_search_keywords,
|
||||
num_pages: 2,
|
||||
};
|
||||
|
||||
console.log('normal_image_search_test()');
|
||||
normal_image_search_test_case( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function normal_image_search_test_case(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
let total_rank = 1;
|
||||
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.clean_link, 'clean_link must be ok');
|
||||
assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
|
||||
assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google Image', function(){
|
||||
this.timeout(30000);
|
||||
it('normal image search test', normal_image_search_test);
|
||||
});
|
@ -1,91 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
const normal_search_keywords = ['apple juice'];
|
||||
|
||||
async function queryargs_search_test() {
|
||||
let config = {
|
||||
search_engine: 'google',
|
||||
compress: false,
|
||||
debug: true,
|
||||
verbose: true,
|
||||
keywords: normal_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 2,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
// use specific search engine parameters for various search engines
|
||||
google_settings: {
|
||||
google_domain: 'google.com',
|
||||
gl: 'fr', // The gl parameter determines the Google country to use for the query.
|
||||
hl: 'fr', // The hl parameter determines the Google UI language to return results.
|
||||
start: 30, // Determines the results offset to use, defaults to 0.
|
||||
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
},
|
||||
};
|
||||
|
||||
console.log('queryargs_search_test()');
|
||||
await se_scraper.scrape(config, queryargs_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function queryargs_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
|
||||
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 90, 'results must have at least 80 SERP objects');
|
||||
assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
describe('Google with query arguments', function(){
|
||||
this.timeout(30000);
|
||||
it('query args search test', queryargs_search_test);
|
||||
});
|
@ -1,217 +0,0 @@
|
||||
'use strict';
|
||||
const se_scraper = require('./../index.js');
|
||||
const assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
const quote_search_keywords = ['MSFT', 'AAPL'];
|
||||
|
||||
async function reuters_search_test() {
|
||||
let config = {
|
||||
search_engine: 'reuters',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: quote_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('reuters_search_test()');
|
||||
await se_scraper.scrape(config, reuters_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function reuters_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function cnbc_search_test() {
|
||||
let config = {
|
||||
search_engine: 'cnbc',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: quote_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('cnbc_search_test()');
|
||||
await se_scraper.scrape(config, cnbc_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function cnbc_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const marketwatch_search_keywords = ['MSFT'];
|
||||
|
||||
async function marketwatch_search_test() {
|
||||
let config = {
|
||||
search_engine: 'marketwatch',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: marketwatch_search_keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
console.log('marketwatch_search_test()');
|
||||
await se_scraper.scrape(config, marketwatch_search_test_case);
|
||||
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
function marketwatch_search_test_case(err, response) {
|
||||
|
||||
if (err) {
|
||||
console.error(err);
|
||||
} else {
|
||||
|
||||
for (let query in response.results) {
|
||||
let total_rank = 1;
|
||||
assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.author, 'author must be ok');
|
||||
assert.typeOf(res.author, 'string', 'author must be string');
|
||||
assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.date, 'date must be ok');
|
||||
assert.typeOf(res.date, 'string', 'date must be string');
|
||||
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
describe('Ticker', function(){
|
||||
this.timeout(30000);
|
||||
it('Reuters search test', reuters_search_test);
|
||||
it('CNBC search test', cnbc_search_test);
|
||||
it('Marketwatch search test', marketwatch_search_test);
|
||||
});
|
Loading…
Reference in New Issue
Block a user