Search engine settings can now be passed as a JSON string and are parsed into an object

This commit is contained in:
Nikolai Tschacher 2019-03-07 15:50:36 +01:00
parent 62b3b688b4
commit dd1f36076e
6 changed files with 677 additions and 1483 deletions

File diff suppressed because it is too large Load Diff

View File

@ -48,6 +48,7 @@ exports.scrape = async function(user_config, callback) {
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// settings for puppeteer-cluster
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.2.8",
"version": "1.2.10",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

17
run.js
View File

@ -12,13 +12,15 @@ let config = {
search_engine: 'google',
// use specific search engine parameters for various search engines
google_settings: {
google_domain: 'google.com',
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'us', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
// google_settings: {
// google_domain: 'google.com',
// gl: 'us', // The gl parameter determines the Google country to use for the query.
// hl: 'us', // The hl parameter determines the Google UI language to return results.
// start: 0, // Determines the results offset to use, defaults to 0.
// num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
// },
google_settings: '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}',
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
bing_settings: {
@ -64,6 +66,7 @@ let config = {
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: false,
// log ip address data
log_ip_address: false,
// log http headers

View File

@ -36,6 +36,19 @@ module.exports = class Scraper {
this.num_requests = 0;
// keep track of the keywords searched
this.num_keywords = 0;
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
if (typeof settings === 'string') {
try {
settings = JSON.parse(settings);
this.config[`${this.config.search_engine}_settings`] = settings;
} catch (e) {
console.error(e);
}
}
}
}
async run({page, data}) {
@ -64,8 +77,10 @@ module.exports = class Scraper {
*/
async load_search_engine() {
// prevent detection by evading common detection techniques
await evadeChromeHeadlessDetection(this.page);
if (this.config.apply_evasion_techniques === true) {
// prevent detection by evading common detection techniques
await evadeChromeHeadlessDetection(this.page);
}
// block some assets to speed up scraping
if (this.config.block_assets === true) {
@ -223,7 +238,6 @@ module.exports = class Scraper {
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
for (var key in settings) {
baseUrl += `${key}=${settings[key]}&`
}

View File

@ -133,7 +133,7 @@ module.exports.handler = async function handler (event, context, callback) {
pluggable: pluggable,
page: page,
});
results = obj.run({});
results = await obj.run({});
num_requests = obj.num_requests;
metadata = obj.metadata;
}
@ -310,7 +310,7 @@ function parseEventData(config) {
}
const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work', 'apply_evasion_techniques'];
for (b of booleans) {
config[b] = _bool(config[b]);