Add test for html_output, refactor the results return

This commit is contained in:
HugoPoi 2019-06-26 12:03:42 +02:00
parent a0e63aa4b0
commit d9ac9f4162
5 changed files with 930 additions and 35 deletions

874
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -21,12 +21,16 @@
}, },
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",
"debug": "^4.1.1", "debug": "^4.1.1",
"got": "^9.6.0", "got": "^9.6.0",
"proxy-chain": "^0.2.7", "proxy-chain": "^0.2.7",
"puppeteer": "^1.17.0", "puppeteer": "^1.17.0",
"puppeteer-cluster": "^0.13.0" "puppeteer-cluster": "^0.13.0"
},
"devDependencies": {
"chai": "^4.2.0",
"chai-string": "^1.5.0",
"mocha": "^6.1.4"
} }
} }

View File

@ -70,15 +70,10 @@ module.exports = class Scraper {
await this.scraping_loop(); await this.scraping_loop();
let response = this.results; return {
if (this.config.html_output) { 'results': this.results,
response = { 'html_output': this.html_output,
'results': this.results, };
'html_output': this.html_output
};
}
return response;
} }
/** /**
@ -199,7 +194,11 @@ module.exports = class Scraper {
} }
let html = await this.page.content(); let html = await this.page.content();
this.html_output[keyword][page_num] = html;
if (this.config.html_output) {
this.html_output[keyword][page_num] = html;
}
let parsed = this.parse(html); let parsed = this.parse(html);
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html); this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);

View File

@ -325,6 +325,7 @@ class ScrapeManager {
Object.assign(this.config, scrape_config); Object.assign(this.config, scrape_config);
var results = {}; var results = {};
var html_output = {};
var num_requests = 0; var num_requests = 0;
var metadata = {}; var metadata = {};
@ -389,11 +390,13 @@ class ScrapeManager {
scraperInstances.push(obj); scraperInstances.push(obj);
} }
let resolved = await Promise.all(execPromises); let promiseReturns = await Promise.all(execPromises);
for (var group of resolved) { // Merge results per keyword
for (var key in group) { for (let promiseReturn of promiseReturns) {
results[key] = group[key]; for (let keyword of this.config.keywords) {
results[keyword] = promiseReturn.results[keyword];
html_output[keyword] = promiseReturn.html_output[keyword];
} }
} }
@ -446,6 +449,7 @@ class ScrapeManager {
return { return {
results: results, results: results,
html_output: (this.config.html_output) ? html_output : undefined,
metadata: metadata || {}, metadata: metadata || {},
}; };
} }
@ -496,4 +500,4 @@ function parseEventData(config) {
module.exports = { module.exports = {
ScrapeManager: ScrapeManager, ScrapeManager: ScrapeManager,
}; };

View File

@ -1,5 +1,7 @@
const se_scraper = require('./../index.js'); const se_scraper = require('./../index.js');
var assert = require('chai').assert; const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/* /*
* Use chai and mocha for tests. * Use chai and mocha for tests.
@ -184,8 +186,46 @@ function test_case_effective_query(response) {
} }
} }
/*
 * html_output_query_test: runs a Google scrape over normal_search_keywords
 * with html_output enabled and num_pages set to 3, then validates the result
 * with both the regular search checks and the html_output checks.
 */
(async () => { async function html_output_query_test() {
await normal_search_test(); let config = {
await no_results_test(); compress: false,
await effective_query_test(); debug_level: 1,
})(); keyword_file: '',
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
// Per-scrape settings; html_output: true is the option under test here.
let scrape_config = {
search_engine: 'google',
keywords: normal_search_keywords,
num_pages: 3,
html_output: true,
};
// The same response object is checked twice: once for the parsed results,
// once for the html_output it carries.
let output = await se_scraper.scrape(config, scrape_config);
normal_search_test_case( output );
check_html_output_test_case( output );
}
/**
 * Assert that a scrape response carries raw HTML for every keyword and page.
 *
 * @param {Object} response - value returned by se_scraper.scrape(); its
 *     html_output property is read as {keyword: {page_number: html string}}.
 */
function check_html_output_test_case( response ) {
    // Checked once, before iterating: placed inside the loop these assertions
    // never run when html_output is missing or empty, so the test would pass
    // without verifying anything.
    assert.isObject(response.html_output, 'html_output is missing from the response.');
    assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');

    for (let query in response.html_output) {
        for (let page_number in response.html_output[query]) {
            // Object keys are strings. parseInt() of a non-numeric key yields NaN,
            // whose type is still 'number', so isNumber(parseInt(...)) can never
            // fail; validate the key text itself instead.
            assert.match(page_number, /^\d+$/, 'page_number must be numeric');
            assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
        }
    }
}
// Mocha suite for the Google scraper; every table entry becomes one test case,
// registered in the order listed.
describe('Google', function(){
    // 30000 ms timeout set on the suite context.
    this.timeout(30000);

    const test_cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['html output query', html_output_query_test],
    ];

    for (const [title, run_test] of test_cases) {
        it(title, run_test);
    }
});