const se_scraper = require('./../../index.js');
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
const path = require('path');
const cheerio = require('cheerio');

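// Scrapes locally stored SERP fixtures twice, once with html cleaning
// enabled and once without, and checks that cleaning shrinks the html
// without changing the parsed results.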
async function test_html_output() {
    let config = {
        debug_level: 1,
        headless: true,
        html_output: true,
        // whether to strip JS and CSS from the html_output
        // has an effect only if `html_output` is true
        clean_html_output: true,
        // remove all data images from the html
        clean_data_images: true,
        // test compression
        compress: false,
    };
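
    // scrape_from_file points the scraper at a local html fixture instead of
    // the live search engine, which keeps the test deterministic and offline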
    let scrape_config = {
        search_engine: 'bing',
        keywords: ['kaffeemaschine kaufen'],
        num_pages: 1,
        scrape_from_file: 'file://' + path.join(__dirname, './html/bing.html'),
    };

    var scraper = new se_scraper.ScrapeManager(config);

    await scraper.start();

    var response = await scraper.scrape(scrape_config);
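
    // scrape the same fixture again with cleaning disabled so the html
    // sizes and parsed results can be compared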
    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleaned = await scraper.scrape(scrape_config);

    test(response, response_no_cleaned, 'bing');
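
    // repeat the comparison for a google SERP fixture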
    scrape_config.search_engine = 'google';
    scrape_config.keywords = ['rückspiegel schwarz'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/google.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    var responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    var response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');
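
    // and once more with a larger google SERP fixture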
    scrape_config.keywords = ['cloud services'];
    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/googleLarge.html');
    scrape_config.clean_html_output = true;
    scrape_config.clean_data_images = true;

    responseGoogle = await scraper.scrape(scrape_config);

    scrape_config.clean_html_output = false;
    scrape_config.clean_data_images = false;

    response_no_cleanedGoogle = await scraper.scrape(scrape_config);

    test(responseGoogle, response_no_cleanedGoogle, 'google');

    await scraper.quit();
}
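
// Compares a cleaned and a non-cleaned scrape of the same SERP: the cleaned
// html must be non-empty and smaller, and both versions must parse to the
// same results.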
function test(response, response_no_cleaned, se = 'google') {
    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            let obj = response.results[query][page_number];
            let obj_no_cleaned = response_no_cleaned.results[query][page_number];

            console.log('html length of non-cleaned SERP: ' + obj_no_cleaned.html.length);
            console.log('html length of cleaned SERP: ' + obj.html.length);

            assert.isOk(obj.html, 'html must be ok');
            assert.isAtLeast(obj.html.length, 100, 'html must be a string of reasonable length');

            assert.isOk(obj_no_cleaned.html, 'html must be ok');
            assert.isAtLeast(obj_no_cleaned.html.length, 100, 'html must be a string of reasonable length');

            assert.isBelow(obj.html.length, obj_no_cleaned.html.length, 'cleaned html must be smaller');

            // test that we can parse the html of both the cleaned and
            // non-cleaned versions with cheerio and that the serp results
            // are the same
            const cleaned$ = cheerio.load(obj.html);
            const no_cleaned$ = cheerio.load(obj_no_cleaned.html);

            var resCleaned = parseResults(cleaned$, se);
            var resNoCleaned = parseResults(no_cleaned$, se);

            assert.equal(resCleaned.length, resNoCleaned.length);
            assert.equal(resCleaned.length, obj.results.length);
            assert.equal(resNoCleaned.length, obj.results.length);

            // unset the rank before comparing; note the map callback must
            // return the element, otherwise these become arrays of undefined
            // and the deepEqual assertions below would pass vacuously
            resCleaned = resCleaned.map((el) => { el.rank = undefined; return el; });
            resNoCleaned = resNoCleaned.map((el) => { el.rank = undefined; return el; });
            obj.results = obj.results.map((el) => { el.rank = undefined; return el; });

            assert.deepEqual(resCleaned, resNoCleaned, 'parsed results should be equal, even if html is cleaned');
            assert.deepEqual(resCleaned, obj.results, 'parsed results from cleaned html should equal the se-scraper results');
            assert.deepEqual(resNoCleaned, obj.results, 'parsed results from non-cleaned html should equal the se-scraper results');
        }
    }
}
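
// Minimal cheerio-based SERP parser used to cross-check the results that
// se-scraper extracted itself. The selectors match the stored html fixtures,
// not necessarily the current live SERP markup.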
function parseResults(s$, se) {
    var results = [];

    if (se === 'google') {
        s$('#center_col .g').each((i, link) => {
            results.push({
                link: s$(link).find('.r a').attr('href'),
                title: s$(link).find('.r a').text(),
                snippet: s$(link).find('span.st').text(),
                visible_link: s$(link).find('.r cite').text(),
                date: s$(link).find('span.f').text() || '',
            });
        });
    } else if (se === 'bing') {
        s$('#b_content #b_results .b_algo').each((i, link) => {
            results.push({
                link: s$(link).find('h2 a').attr('href'),
                title: s$(link).find('h2').text(),
                snippet: s$(link).find('.b_caption p').text(),
                visible_link: s$(link).find('cite').text(),
            });
        });
    } else {
        throw new Error('no such search engine: ' + se);
    }

    results = clean_results(results, ['title', 'link', 'snippet']);
    return results;
}
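
// Drops results that are missing any of the given attributes and assigns a
// 1-based rank to the remaining ones, so they can be compared against the
// ranked results returned by se-scraper.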
function clean_results(results, attributes) {
    const cleaned = [];
    var rank = 1;
    for (var res of results) {
        let goodboy = true;
        for (var attr of attributes) {
            if (!res[attr] || !res[attr].trim()) {
                goodboy = false;
                break;
            }
        }
        if (goodboy) {
            res.rank = rank++;
            cleaned.push(res);
        }
    }
    return cleaned;
}
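
// Register the test with mocha; the generous timeout leaves room for the
// headless browser to start up.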
describe('html output', function() {
    this.timeout(15000);
    it('static html output test', test_html_output);
});