mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 17:47:49 +02:00
clean test case for google is passing
This commit is contained in:
parent
987e3d7342
commit
c60d0f3528
21
TODO.txt
21
TODO.txt
@ -44,24 +44,7 @@ TODO:
|
|||||||
|
|
||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
okay its fucking time to make a generic scraping class like in GoogleScraper
|
okay its fucking time to make a generic scraping class like in GoogleScraper [done]
|
||||||
i feel like history repeats
|
i feel like history repeats
|
||||||
|
|
||||||
class Scraper
|
write good test case for google
|
||||||
|
|
||||||
constructor(options = {}) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
async load_search_engine() {}
|
|
||||||
|
|
||||||
async search_keyword() {}
|
|
||||||
|
|
||||||
async new_page() {}
|
|
||||||
|
|
||||||
async detected() {}
|
|
||||||
|
|
||||||
|
|
||||||
then each search engine derives from this generic class
|
|
||||||
|
|
||||||
some search engines do not need such an abstract class, because they are too complex
|
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.1.8",
|
"version": "1.1.9",
|
||||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
6
run.js
6
run.js
@ -20,13 +20,13 @@ let config = {
|
|||||||
// this output is informational
|
// this output is informational
|
||||||
verbose: true,
|
verbose: true,
|
||||||
// an array of keywords to scrape
|
// an array of keywords to scrape
|
||||||
keywords: ['news'],
|
keywords: ['apple tree'],
|
||||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||||
keyword_file: '',
|
keyword_file: '',
|
||||||
// the number of pages to scrape for each keyword
|
// the number of pages to scrape for each keyword
|
||||||
num_pages: 2,
|
num_pages: 1,
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: true,
|
headless: false,
|
||||||
// path to output file, data will be stored in JSON
|
// path to output file, data will be stored in JSON
|
||||||
output_file: '',
|
output_file: '',
|
||||||
// whether to prevent images, css, fonts from being loaded
|
// whether to prevent images, css, fonts from being loaded
|
||||||
|
@ -19,9 +19,10 @@ class GoogleScraper extends Scraper {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// 'Ergebnisse für', 'Showing results for'
|
||||||
let no_results = this.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for'],
|
'No results found for'],
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -109,7 +110,7 @@ class GoogleNewsOldScraper extends Scraper {
|
|||||||
|
|
||||||
let no_results = this.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
'No results found for', 'did not match any news results'],
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -190,8 +191,7 @@ class GoogleImageScraper extends Scraper {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let no_results = this.no_results(
|
let no_results = this.no_results(
|
||||||
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
|
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',],
|
||||||
'Showing results for', 'Ergebnisse für'],
|
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -297,7 +297,7 @@ class GoogleNewsScraper extends Scraper {
|
|||||||
|
|
||||||
let no_results = this.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
'No results found for', 'did not match any news results'],
|
||||||
$('body').text()
|
$('body').text()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -176,8 +176,15 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
no_results(needles, html) {
|
no_results(needles, html) {
|
||||||
return !needles.map((needle) => { return html.indexOf(needle)})
|
for (let needle of needles) {
|
||||||
.every((res) => { return res == -1});
|
if (html.includes(needle)) {
|
||||||
|
if (this.config.debug) {
|
||||||
|
console.log(`HTML contains needle ${needle}. no_results=true`);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
parse(html) {
|
parse(html) {
|
||||||
|
205
test/test_google.js
Normal file
205
test/test_google.js
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
const se_scraper = require('./../index.js');
|
||||||
|
var assert = require('chai').assert;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use chai and mocha for tests.
|
||||||
|
* https://mochajs.org/#installation
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Wait for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves (without a value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        // Statements are terminated explicitly instead of relying on ASI.
        setTimeout(resolve, ms);
    });
}
|
||||||
|
|
||||||
|
// Keywords scraped by normal_search_test(); normal_search_test_case() requires
// every one of them to appear as a key of the response's results object.
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
|
||||||
|
|
||||||
|
/**
 * Scrape Google for every entry of normal_search_keywords (three pages per
 * keyword, headless browser) and pass the outcome to normal_search_test_case.
 *
 * @returns {Promise<undefined>} Settles once the awaited se_scraper.scrape() call settles.
 */
async function normal_search_test() {
    const chrome_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36';

    // Property order is kept exactly as before.
    const scrape_options = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: chrome_user_agent,
        random_user_agent: false,
    };

    console.log('normal_search_test()');

    await se_scraper.scrape(scrape_options, normal_search_test_case);
}
|
||||||
|
|
||||||
|
// we test with a callback function to our handler
|
||||||
|
/**
 * Callback handed to se_scraper.scrape() by normal_search_test(): validates the
 * response of a regular Google search.
 *
 * Checks the response envelope (content type, status code), that every keyword
 * of normal_search_keywords was scraped, the shape of each result page and of
 * each SERP entry, and that ranks count up continuously from 1.
 *
 * @param {*} err - Error reported by the scraper, if any.
 * @param {Object} response - Scraper response; `headers`, `statusCode` and `results` are read.
 */
function normal_search_test_case(err, response) {
    if (err) {
        // NOTE(review): a scraper error is only logged and does not fail the test — confirm this is intended.
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');

    // Checked up front: inside the loop this assertion would never run for an
    // empty results object, letting a completely empty response pass.
    assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

    // The expected rank is never reset, so it keeps counting across pages and keywords.
    let total_rank = 1;

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() yields NaN for non-numeric keys and NaN is still a
            // number, so the value has to be checked with Number.isInteger.
            assert.isOk(Number.isInteger(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN for an unparsable date; a typeof check cannot catch that.
            assert.isOk(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            for (const res of obj.results) {
                assert.containsAllKeys(res, ['link', 'title', 'rank'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank ist wrong');
            }
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Gibberish keyword scraped by no_results_test(); test_case_no_results() expects
// zero SERP entries and no_results === true for it.
const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk',];
|
||||||
|
|
||||||
|
/**
 * Scrape a single Google result page (headless browser) for every entry of
 * keywords_no_results and pass the outcome to test_case_no_results.
 *
 * @returns {Promise<undefined>} Settles once the awaited se_scraper.scrape() call settles.
 */
async function no_results_test() {
    const chrome_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36';

    // Property order is kept exactly as before.
    const scrape_options = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: chrome_user_agent,
        random_user_agent: false,
    };

    console.log('no_results_test()');

    await se_scraper.scrape(scrape_options, test_case_no_results);
}
|
||||||
|
|
||||||
|
// we test with a callback function to our handler
|
||||||
|
/**
 * Callback handed to se_scraper.scrape() by no_results_test(): validates the
 * response for a keyword that is expected to yield no search results.
 *
 * Checks the response envelope, that every keyword of keywords_no_results was
 * scraped, and that each page reports an empty result list, no_results === true
 * and an empty num_results.
 *
 * @param {*} err - Error reported by the scraper, if any.
 * @param {Object} response - Scraper response; `headers`, `statusCode` and `results` are read.
 */
function test_case_no_results(err, response) {
    if (err) {
        // NOTE(review): a scraper error is only logged and does not fail the test — confirm this is intended.
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');

    // The former `results = response.results;` was dropped: it assigned an
    // undeclared (implicit global) variable that was never read afterwards.

    // Checked up front: inside the loop this assertion would never run for an
    // empty results object.
    assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() yields NaN for non-numeric keys and NaN is still a
            // number, so the value has to be checked with Number.isInteger.
            assert.isOk(Number.isInteger(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no results should be true');
            assert.isEmpty(obj.num_results, 'no results should be a empty string');
            // Date.parse() returns NaN for an unparsable date; a typeof check cannot catch that.
            assert.isOk(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
        }
    }
}
|
||||||
|
|
||||||
|
// Deliberately misspelled keyword scraped by effective_query_test();
// test_case_effective_query() expects the reported effective_query to differ from it.
const effective_query_keywords = ['mount evverrest'];
|
||||||
|
|
||||||
|
/**
 * Scrape a single Google result page (headless browser) for every entry of
 * effective_query_keywords and pass the outcome to test_case_effective_query.
 *
 * @returns {Promise<undefined>} Settles once the awaited se_scraper.scrape() call settles.
 */
async function effective_query_test() {
    const chrome_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36';

    // Property order is kept exactly as before.
    const scrape_options = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: effective_query_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: chrome_user_agent,
        random_user_agent: false,
    };

    console.log('effective_query_test()');

    await se_scraper.scrape(scrape_options, test_case_effective_query);
}
|
||||||
|
|
||||||
|
// we test with a callback function to our handler
|
||||||
|
/**
 * Callback handed to se_scraper.scrape() by effective_query_test(): validates
 * the response for a misspelled keyword.
 *
 * Checks the response envelope, that every keyword of effective_query_keywords
 * was scraped, that each page carries a non-empty effective_query different
 * from the keyword, and the usual result-page fields.
 *
 * @param {*} err - Error reported by the scraper, if any.
 * @param {Object} response - Scraper response; `headers`, `statusCode` and `results` are read.
 */
function test_case_effective_query(err, response) {
    if (err) {
        // NOTE(review): a scraper error is only logged and does not fail the test — confirm this is intended.
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');

    // The former `results = response.results;` was dropped: it assigned an
    // undeclared (implicit global) variable that was never read afterwards.

    // Checked up front: inside the loop this assertion would never run for an
    // empty results object.
    assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // parseInt() yields NaN for non-numeric keys and NaN is still a
            // number, so the value has to be checked with Number.isInteger.
            assert.isOk(Number.isInteger(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

            // effective query must be different to the original keyword
            assert.isOk(obj.effective_query, 'effective query must be ok');
            assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
            assert(obj.effective_query !== query, 'effective query must be different from keyword');

            assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN for an unparsable date; a typeof check cannot catch that.
            assert.isOk(!Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Entry point: run the three Google test scenarios strictly one after another.
(async () => {
    await normal_search_test();
    await no_results_test();
    await effective_query_test();
})().catch((err) => {
    // The promise used to be left floating; a rejected run is now reported
    // and reflected in the process exit code.
    console.error(err);
    process.exitCode = 1;
});
|
Loading…
x
Reference in New Issue
Block a user