From 62b3b688b4c9315e19afaa29cf6e854c8245778b Mon Sep 17 00:00:00 2001
From: Nikolai Tschacher
Date: Thu, 7 Mar 2019 13:16:12 +0100
Subject: [PATCH] minor fixes

---
Review notes (this text sits between the "---" marker and the diffstat and is
not part of the commit message):
 * Line breaks, indentation and blank context lines were reconstructed from a
   whitespace-collapsed copy of this patch. Run `git apply --check` first and
   adjust context whitespace if it does not match the target tree.
 * test/test_queryargs_google.js was revised in review: a scrape error now
   fails the run instead of only being logged; the keyword coverage check
   runs even when `results` is empty; page numbers and timestamps are
   checked with isNotNaN (NaN is a number, so the old checks could not
   fail); 'snippet' was added to the required SERP keys; two assertion
   messages were corrected. The blob id on that file's `index` line predates
   these edits.

 index.js                      |  4 +-
 src/modules/google.js         |  2 +-
 src/modules/se_scraper.js     |  5 +-
 src/node_scraper.js           | 11 +++--
 test/test_queryargs_google.js | 95 +++++++++++++++++++++++++++++++++++
 5 files changed, 110 insertions(+), 7 deletions(-)
 create mode 100644 test/test_queryargs_google.js

diff --git a/index.js b/index.js
index 0ccdd05..3c5276c 100644
--- a/index.js
+++ b/index.js
@@ -25,7 +25,7 @@ exports.scrape = async function(user_config, callback) {
         compress: false, // compress
         debug: false,
         verbose: true,
-        keywords: ['scrapeulous.com'],
+        keywords: ['search engine scraping scrapeulous.com'],
         // whether to start the browser in headless mode
         headless: true,
         // the number of pages to scrape for each keyword
@@ -53,7 +53,7 @@ exports.scrape = async function(user_config, callback) {
             timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
             monitor: false,
             concurrency: Cluster.CONCURRENCY_BROWSER,
-            maxConcurrency: 2,
+            maxConcurrency: 1,
         }
     };
 
diff --git a/src/modules/google.js b/src/modules/google.js
index bb93ed6..0bb7eaa 100644
--- a/src/modules/google.js
+++ b/src/modules/google.js
@@ -538,7 +538,7 @@ const GOOGLE_HL = {
 };
 
 
- class GoogleScraper extends Scraper {
+class GoogleScraper extends Scraper {
 
     constructor(...args) {
         super(...args);
diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js
index a917549..75ff7d0 100644
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@@ -39,7 +39,10 @@ module.exports = class Scraper {
     }
 
     async run({page, data}) {
-        this.page = page;
+
+        if (page) {
+            this.page = page;
+        }
 
         let do_continue = await this.load_search_engine();
 
diff --git a/src/node_scraper.js b/src/node_scraper.js
index 1f12780..0ef1abc 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -133,7 +133,7 @@ module.exports.handler = async function handler (event, context, callback) {
                 pluggable: pluggable,
                 page: page,
             });
-            results = obj.run({page: page});
+            results = obj.run({});
             num_requests = obj.num_requests;
             metadata = obj.metadata;
         }
@@ -259,8 +259,13 @@ module.exports.handler = async function handler (event, context, callback) {
         });
     }
 
-    metadata.id = `${config.job_name} ${config.chunk_lines}`;
-    metadata.chunk_lines = config.chunk_lines;
+    if (config.chunk_lines) {
+        metadata.chunk_lines = config.chunk_lines;
+        if (config.job_name) {
+            metadata.id = `${config.job_name} ${config.chunk_lines}`;
+        }
+    }
+
     metadata.elapsed_time = timeDelta.toString();
     metadata.ms_per_keyword = ms_per_request.toString();
     metadata.num_requests = num_requests;
diff --git a/test/test_queryargs_google.js b/test/test_queryargs_google.js
new file mode 100644
index 0000000..b9f569f
--- /dev/null
+++ b/test/test_queryargs_google.js
@@ -0,0 +1,95 @@
+const se_scraper = require('./../index.js');
+const assert = require('chai').assert;
+
+const normal_search_keywords = ['apple juice'];
+
+// Scrapes Google for normal_search_keywords with custom query arguments
+// (gl, hl, start, num) and hands the outcome to queryargs_search_test_case.
+async function queryargs_search_test() {
+    let config = {
+        search_engine: 'google',
+        compress: false,
+        debug: true,
+        verbose: true,
+        keywords: normal_search_keywords,
+        keyword_file: '',
+        num_pages: 2,
+        headless: true,
+        output_file: '',
+        block_assets: true,
+        // use specific search engine parameters for various search engines
+        google_settings: {
+            google_domain: 'google.com',
+            gl: 'fr', // The gl parameter determines the Google country to use for the query.
+            hl: 'fr', // The hl parameter determines the Google UI language to return results.
+            start: 30, // Determines the results offset to use, defaults to 0.
+            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
+        },
+    };
+
+    console.log('queryargs_search_test()');
+    await se_scraper.scrape(config, queryargs_search_test_case);
+}
+
+// Callback passed to se_scraper.scrape(): validates the response envelope and
+// every SERP entry. A scrape error is rethrown so the run fails loudly.
+function queryargs_search_test_case(err, response) {
+
+    if (err) {
+        throw err;
+    } else {
+        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
+        assert.equal(response.statusCode, 200, 'status code must be 200');
+        assert.equal(response.metadata.num_requests, 2);
+
+        // checked outside the loop so an empty result set fails as well
+        assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
+
+        for (let query in response.results) {
+            let total_rank = 1;
+
+            for (let page_number in response.results[query]) {
+
+                assert.isNotNaN(parseInt(page_number, 10), 'page_number must be numeric');
+
+                let obj = response.results[query][page_number];
+
+                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
+
+                assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
+                assert.equal(obj.no_results, false, 'no results should be false');
+                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
+                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
+                assert.isNotNaN(Date.parse(obj.time), 'time should be a valid date');
+
+                for (let res of obj.results) {
+
+                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
+
+                    assert.isOk(res.link, 'link must be ok');
+                    assert.typeOf(res.link, 'string', 'link must be string');
+                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
+
+                    assert.isOk(res.visible_link, 'visible_link must be ok');
+                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
+                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
+
+                    assert.isOk(res.title, 'title must be ok');
+                    assert.typeOf(res.title, 'string', 'title must be string');
+                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
+
+                    assert.isOk(res.snippet, 'snippet must be ok');
+                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
+                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
+
+                    assert.isNumber(res.rank, 'rank must be integer');
+                    assert.equal(res.rank, total_rank++, 'rank is wrong');
+                }
+            }
+        }
+    }
+}
+
+(async () => {
+    await queryargs_search_test();
+})();