Search engine settings can now be passed as a JSON string and are parsed into an object

This commit is contained in:
Nikolai Tschacher 2019-03-07 15:50:36 +01:00
parent 62b3b688b4
commit dd1f36076e
6 changed files with 677 additions and 1483 deletions

File diff suppressed because it is too large Load Diff

View File

@ -48,6 +48,7 @@ exports.scrape = async function(user_config, callback) {
// check if headless chrome escapes common detection techniques // check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging // this is a quick test and should be used for debugging
test_evasion: false, test_evasion: false,
apply_evasion_techniques: true,
// settings for puppeteer-cluster // settings for puppeteer-cluster
puppeteer_cluster_config: { puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes timeout: 30 * 60 * 1000, // max timeout set to 30 minutes

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.2.8", "version": "1.2.10",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",

17
run.js
View File

@ -12,13 +12,15 @@ let config = {
search_engine: 'google', search_engine: 'google',
// use specific search engine parameters for various search engines // use specific search engine parameters for various search engines
google_settings: { // google_settings: {
google_domain: 'google.com', // google_domain: 'google.com',
gl: 'us', // The gl parameter determines the Google country to use for the query. // gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'us', // The hl parameter determines the Google UI language to return results. // hl: 'us', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0. // start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. // num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
}, // },
google_settings: '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}',
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters // https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
bing_settings: { bing_settings: {
@ -64,6 +66,7 @@ let config = {
// check if headless chrome escapes common detection techniques // check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging // this is a quick test and should be used for debugging
test_evasion: false, test_evasion: false,
apply_evasion_techniques: false,
// log ip address data // log ip address data
log_ip_address: false, log_ip_address: false,
// log http headers // log http headers

View File

@ -36,6 +36,19 @@ module.exports = class Scraper {
this.num_requests = 0; this.num_requests = 0;
// keep track of the keywords searched // keep track of the keywords searched
this.num_keywords = 0; this.num_keywords = 0;
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
if (typeof settings === 'string') {
try {
settings = JSON.parse(settings);
this.config[`${this.config.search_engine}_settings`] = settings;
} catch (e) {
console.error(e);
}
}
}
} }
async run({page, data}) { async run({page, data}) {
@ -64,8 +77,10 @@ module.exports = class Scraper {
*/ */
async load_search_engine() { async load_search_engine() {
// prevent detection by evading common detection techniques if (this.config.apply_evasion_techniques === true) {
await evadeChromeHeadlessDetection(this.page); // prevent detection by evading common detection techniques
await evadeChromeHeadlessDetection(this.page);
}
// block some assets to speed up scraping // block some assets to speed up scraping
if (this.config.block_assets === true) { if (this.config.block_assets === true) {
@ -223,7 +238,6 @@ module.exports = class Scraper {
let settings = this.config[`${this.config.search_engine}_settings`]; let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) { if (settings) {
for (var key in settings) { for (var key in settings) {
baseUrl += `${key}=${settings[key]}&` baseUrl += `${key}=${settings[key]}&`
} }

View File

@ -133,7 +133,7 @@ module.exports.handler = async function handler (event, context, callback) {
pluggable: pluggable, pluggable: pluggable,
page: page, page: page,
}); });
results = obj.run({}); results = await obj.run({});
num_requests = obj.num_requests; num_requests = obj.num_requests;
metadata = obj.metadata; metadata = obj.metadata;
} }
@ -310,7 +310,7 @@ function parseEventData(config) {
} }
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent', const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work']; 'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work', 'apply_evasion_techniques'];
for (b of booleans) { for (b of booleans) {
config[b] = _bool(config[b]); config[b] = _bool(config[b]);