mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-21 01:57:55 +02:00
minor fixes
This commit is contained in:
parent
7b52b4e62f
commit
62b3b688b4
4
index.js
4
index.js
@ -25,7 +25,7 @@ exports.scrape = async function(user_config, callback) {
|
|||||||
compress: false, // compress
|
compress: false, // compress
|
||||||
debug: false,
|
debug: false,
|
||||||
verbose: true,
|
verbose: true,
|
||||||
keywords: ['scrapeulous.com'],
|
keywords: ['search engine scraping scrapeulous.com'],
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: true,
|
headless: true,
|
||||||
// the number of pages to scrape for each keyword
|
// the number of pages to scrape for each keyword
|
||||||
@ -53,7 +53,7 @@ exports.scrape = async function(user_config, callback) {
|
|||||||
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||||
monitor: false,
|
monitor: false,
|
||||||
concurrency: Cluster.CONCURRENCY_BROWSER,
|
concurrency: Cluster.CONCURRENCY_BROWSER,
|
||||||
maxConcurrency: 2,
|
maxConcurrency: 1,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -538,7 +538,7 @@ const GOOGLE_HL = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
class GoogleScraper extends Scraper {
|
class GoogleScraper extends Scraper {
|
||||||
|
|
||||||
constructor(...args) {
|
constructor(...args) {
|
||||||
super(...args);
|
super(...args);
|
||||||
|
@ -39,7 +39,10 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async run({page, data}) {
|
async run({page, data}) {
|
||||||
this.page = page;
|
|
||||||
|
if (page) {
|
||||||
|
this.page = page;
|
||||||
|
}
|
||||||
|
|
||||||
let do_continue = await this.load_search_engine();
|
let do_continue = await this.load_search_engine();
|
||||||
|
|
||||||
|
@ -133,7 +133,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
pluggable: pluggable,
|
pluggable: pluggable,
|
||||||
page: page,
|
page: page,
|
||||||
});
|
});
|
||||||
results = obj.run({page: page});
|
results = obj.run({});
|
||||||
num_requests = obj.num_requests;
|
num_requests = obj.num_requests;
|
||||||
metadata = obj.metadata;
|
metadata = obj.metadata;
|
||||||
}
|
}
|
||||||
@ -259,8 +259,13 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
metadata.id = `${config.job_name} ${config.chunk_lines}`;
|
if (config.chunk_lines) {
|
||||||
metadata.chunk_lines = config.chunk_lines;
|
metadata.chunk_lines = config.chunk_lines;
|
||||||
|
if (config.job_name) {
|
||||||
|
metadata.id = `${config.job_name} ${config.chunk_lines}`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
metadata.elapsed_time = timeDelta.toString();
|
metadata.elapsed_time = timeDelta.toString();
|
||||||
metadata.ms_per_keyword = ms_per_request.toString();
|
metadata.ms_per_keyword = ms_per_request.toString();
|
||||||
metadata.num_requests = num_requests;
|
metadata.num_requests = num_requests;
|
||||||
|
91
test/test_queryargs_google.js
Normal file
91
test/test_queryargs_google.js
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
const se_scraper = require('./../index.js');
|
||||||
|
var assert = require('chai').assert;
|
||||||
|
|
||||||
|
const normal_search_keywords = ['apple juice'];
|
||||||
|
|
||||||
|
/**
 * Runs a Google scrape that exercises the engine-specific query arguments
 * (`google_settings`) and hands the outcome to `queryargs_search_test_case`.
 *
 * @returns {Promise<void>} settles once `se_scraper.scrape` has settled.
 */
async function queryargs_search_test() {
    // Query arguments that are specific to the Google search engine.
    const google_query_args = {
        google_domain: 'google.com',
        gl: 'fr', // Google country to use for the query.
        hl: 'fr', // Google UI language to return results in.
        start: 30, // Results offset, defaults to 0.
        num: 100, // Number of results to show, defaults to 10. Maximum is 100.
    };

    const scrape_config = {
        search_engine: 'google',
        compress: false,
        debug: true,
        verbose: true,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: true,
        output_file: '',
        block_assets: true,
        // use specific search engine parameters for various search engines
        google_settings: google_query_args,
    };

    console.log('queryargs_search_test()');

    // The callback receives (err, response) and performs all assertions.
    await se_scraper.scrape(scrape_config, queryargs_search_test_case);
}
|
||||||
|
|
||||||
|
// we test with a callback function to our handler
|
||||||
|
/**
 * Callback passed to `se_scraper.scrape`: validates the scrape response.
 *
 * Checks the response envelope (content type, status code, request count),
 * that every requested keyword is present in `response.results`, and that each
 * result page and each SERP entry on it has the expected keys and value types.
 *
 * @param {*} err - Error reported by the scraper, if any. It is only logged.
 * @param {Object} response - Scrape response with `headers`, `statusCode`,
 *     `metadata` and `results` (keyword -> page number -> page object).
 * @throws AssertionError (from chai) when any expectation is not met.
 */
function queryargs_search_test_case(err, response) {
    if (err) {
        // Scraper-level failures are reported but not rethrown.
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');
    assert.equal(response.metadata.num_requests, 2);

    // Checked once, outside the loop: inside the loop over `response.results`
    // this assertion would never run for an empty result set.
    assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

    for (const query of Object.keys(response.results)) {
        // Ranks are expected to keep counting across the pages of one keyword.
        let total_rank = 1;

        for (const page_number of Object.keys(response.results[query])) {
            // NaN is of type number, so test for an integer explicitly.
            assert.isTrue(
                Number.isInteger(Number.parseInt(page_number, 10)),
                'page_number must be numeric'
            );

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 90, 'results must have at least 90 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse returns NaN (still a number) for unparsable input.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            for (const res of obj.results) {
                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Entry point: run the query-args test. A rejection is logged and turned into
// a non-zero exit code instead of being left as a floating promise.
(async () => {
    try {
        await queryargs_search_test();
    } catch (err) {
        console.error(err);
        process.exitCode = 1;
    }
})();
|
Loading…
x
Reference in New Issue
Block a user