forked from extern/se-scraper
Add test for html_output, refactor the results return
This commit is contained in:
parent
a0e63aa4b0
commit
d9ac9f4162
874
package-lock.json
generated
874
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -21,12 +21,16 @@
|
|||||||
},
|
},
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"chai": "^4.2.0",
|
|
||||||
"cheerio": "^1.0.0-rc.2",
|
"cheerio": "^1.0.0-rc.2",
|
||||||
"debug": "^4.1.1",
|
"debug": "^4.1.1",
|
||||||
"got": "^9.6.0",
|
"got": "^9.6.0",
|
||||||
"proxy-chain": "^0.2.7",
|
"proxy-chain": "^0.2.7",
|
||||||
"puppeteer": "^1.17.0",
|
"puppeteer": "^1.17.0",
|
||||||
"puppeteer-cluster": "^0.13.0"
|
"puppeteer-cluster": "^0.13.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"chai": "^4.2.0",
|
||||||
|
"chai-string": "^1.5.0",
|
||||||
|
"mocha": "^6.1.4"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -70,15 +70,10 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
await this.scraping_loop();
|
await this.scraping_loop();
|
||||||
|
|
||||||
let response = this.results;
|
return {
|
||||||
if (this.config.html_output) {
|
'results': this.results,
|
||||||
response = {
|
'html_output': this.html_output,
|
||||||
'results': this.results,
|
};
|
||||||
'html_output': this.html_output
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return response;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -199,7 +194,11 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let html = await this.page.content();
|
let html = await this.page.content();
|
||||||
this.html_output[keyword][page_num] = html;
|
|
||||||
|
if (this.config.html_output) {
|
||||||
|
this.html_output[keyword][page_num] = html;
|
||||||
|
}
|
||||||
|
|
||||||
let parsed = this.parse(html);
|
let parsed = this.parse(html);
|
||||||
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
|
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
|
||||||
|
|
||||||
|
@ -325,6 +325,7 @@ class ScrapeManager {
|
|||||||
Object.assign(this.config, scrape_config);
|
Object.assign(this.config, scrape_config);
|
||||||
|
|
||||||
var results = {};
|
var results = {};
|
||||||
|
var html_output = {};
|
||||||
var num_requests = 0;
|
var num_requests = 0;
|
||||||
var metadata = {};
|
var metadata = {};
|
||||||
|
|
||||||
@ -389,11 +390,13 @@ class ScrapeManager {
|
|||||||
scraperInstances.push(obj);
|
scraperInstances.push(obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
let resolved = await Promise.all(execPromises);
|
let promiseReturns = await Promise.all(execPromises);
|
||||||
|
|
||||||
for (var group of resolved) {
|
// Merge results per keyword
|
||||||
for (var key in group) {
|
for (let promiseReturn of promiseReturns) {
|
||||||
results[key] = group[key];
|
for (let keyword of this.config.keywords) {
|
||||||
|
results[keyword] = promiseReturn.results[keyword];
|
||||||
|
html_output[keyword] = promiseReturn.html_output[keyword];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -446,6 +449,7 @@ class ScrapeManager {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
results: results,
|
results: results,
|
||||||
|
html_output: (this.config.html_output) ? html_output : undefined,
|
||||||
metadata: metadata || {},
|
metadata: metadata || {},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -496,4 +500,4 @@ function parseEventData(config) {
|
|||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
ScrapeManager: ScrapeManager,
|
ScrapeManager: ScrapeManager,
|
||||||
};
|
};
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
const se_scraper = require('./../index.js');
|
const se_scraper = require('./../index.js');
|
||||||
var assert = require('chai').assert;
|
const chai = require('chai');
|
||||||
|
chai.use(require('chai-string'));
|
||||||
|
const assert = chai.assert;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Use chai and mocha for tests.
|
* Use chai and mocha for tests.
|
||||||
@ -184,8 +186,46 @@ function test_case_effective_query(response) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
(async () => {
|
async function html_output_query_test() {
|
||||||
await normal_search_test();
|
let config = {
|
||||||
await no_results_test();
|
compress: false,
|
||||||
await effective_query_test();
|
debug_level: 1,
|
||||||
})();
|
keyword_file: '',
|
||||||
|
headless: true,
|
||||||
|
output_file: '',
|
||||||
|
block_assets: true,
|
||||||
|
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||||
|
random_user_agent: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let scrape_config = {
|
||||||
|
search_engine: 'google',
|
||||||
|
keywords: normal_search_keywords,
|
||||||
|
num_pages: 3,
|
||||||
|
html_output: true,
|
||||||
|
};
|
||||||
|
|
||||||
|
let output = await se_scraper.scrape(config, scrape_config);
|
||||||
|
normal_search_test_case( output );
|
||||||
|
check_html_output_test_case( output );
|
||||||
|
}
|
||||||
|
|
||||||
|
function check_html_output_test_case( response ) {
|
||||||
|
for (let query in response.html_output) {
|
||||||
|
|
||||||
|
assert.containsAllKeys(response.html_output, normal_search_keywords, 'not all keywords were scraped.');
|
||||||
|
|
||||||
|
for (let page_number in response.html_output[query]) {
|
||||||
|
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||||
|
assert.startsWith(response.html_output[query][page_number], '<!DOCTYPE html><html');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
describe('Google', function(){
|
||||||
|
this.timeout(30000);
|
||||||
|
it('normal search', normal_search_test);
|
||||||
|
it('no results', no_results_test);
|
||||||
|
it('effective query', effective_query_test);
|
||||||
|
it('html output query', html_output_query_test);
|
||||||
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user