Mirror of https://github.com/NikolaiT/se-scraper.git, synced 2025-02-02 17:49:15 +01:00
can now parse args from string to json
This commit is contained in:
parent 62b3b688b4
commit dd1f36076e
File diff suppressed because it is too large
index.js (1 line changed)
@@ -48,6 +48,7 @@ exports.scrape = async function(user_config, callback) {
         // check if headless chrome escapes common detection techniques
         // this is a quick test and should be used for debugging
         test_evasion: false,
+        apply_evasion_techniques: true,
         // settings for puppeteer-cluster
         puppeteer_cluster_config: {
             timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
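The new apply_evasion_techniques flag defaults to true in the library config and can be overridden by the caller. A minimal usage sketch; the keywords option and the (err, response) callback shape are assumptions taken from the project README, not from this diff:

// Minimal sketch of overriding the new apply_evasion_techniques flag.
const se_scraper = require('se-scraper');

let config = {
    search_engine: 'google',
    keywords: ['scraping scrapeulous.com'], // assumed option, see README
    apply_evasion_techniques: true,         // new flag added by this commit
};

se_scraper.scrape(config, (err, response) => {
    if (err) {
        console.error(err);
        return;
    }
    console.dir(response, { depth: null });
});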
package.json

@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.2.8",
+  "version": "1.2.10",
   "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
run.js (17 lines changed)
@@ -12,13 +12,15 @@ let config = {
     search_engine: 'google',
 
     // use specific search engine parameters for various search engines
-    google_settings: {
-        google_domain: 'google.com',
-        gl: 'us', // The gl parameter determines the Google country to use for the query.
-        hl: 'us', // The hl parameter determines the Google UI language to return results.
-        start: 0, // Determines the results offset to use, defaults to 0.
-        num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
-    },
+    // google_settings: {
+    //     google_domain: 'google.com',
+    //     gl: 'us', // The gl parameter determines the Google country to use for the query.
+    //     hl: 'us', // The hl parameter determines the Google UI language to return results.
+    //     start: 0, // Determines the results offset to use, defaults to 0.
+    //     num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
+    // },
+
+    google_settings: '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}',
 
     // https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
     bing_settings: {
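run.js now passes google_settings as a JSON string instead of an object. This is convenient when the settings arrive already serialized, for example from an environment variable or an event payload; the Scraper constructor (see the class hunk further down) parses the string back into an object. A small sketch of the equivalence; the GOOGLE_SETTINGS environment variable is a hypothetical source used only for illustration:

// Both forms end up as the same object once the constructor runs JSON.parse()
// on string settings.
const from_string = JSON.parse(
    process.env.GOOGLE_SETTINGS || '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}'
);
const from_object = { gl: 'tr', hl: 'tr', num: '50', start: '0' };

console.log(JSON.stringify(from_string) === JSON.stringify(from_object)); // true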
@@ -64,6 +66,7 @@ let config = {
     // check if headless chrome escapes common detection techniques
     // this is a quick test and should be used for debugging
     test_evasion: false,
+    apply_evasion_techniques: false,
     // log ip address data
    log_ip_address: false,
     // log http headers
@@ -36,6 +36,19 @@ module.exports = class Scraper {
         this.num_requests = 0;
         // keep track of the keywords searched
         this.num_keywords = 0;
+
+        let settings = this.config[`${this.config.search_engine}_settings`];
+        if (settings) {
+            if (typeof settings === 'string') {
+                try {
+                    settings = JSON.parse(settings);
+                    this.config[`${this.config.search_engine}_settings`] = settings;
+                } catch (e) {
+                    console.error(e);
+                }
+            }
+        }
+
     }
 
     async run({page, data}) {
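The constructor change above is the core of the commit: if the per-engine settings are given as a string, they are parsed into an object, and invalid JSON is logged and left untouched. A self-contained sketch of that normalization; the config literal below is hypothetical, while the parsing logic mirrors the diff:

// Standalone sketch of the normalization done in the Scraper constructor.
function normalize_settings(config) {
    let settings = config[`${config.search_engine}_settings`];
    if (settings && typeof settings === 'string') {
        try {
            config[`${config.search_engine}_settings`] = JSON.parse(settings);
        } catch (e) {
            console.error(e); // invalid JSON: keep the original string
        }
    }
    return config;
}

const config = normalize_settings({
    search_engine: 'google',
    google_settings: '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}',
});
console.log(config.google_settings.num); // '50'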
@@ -64,8 +77,10 @@ module.exports = class Scraper {
      */
     async load_search_engine() {
 
-        // prevent detection by evading common detection techniques
-        await evadeChromeHeadlessDetection(this.page);
+        if (this.config.apply_evasion_techniques === true) {
+            // prevent detection by evading common detection techniques
+            await evadeChromeHeadlessDetection(this.page);
+        }
 
         // block some assets to speed up scraping
         if (this.config.block_assets === true) {
@@ -223,7 +238,6 @@ module.exports = class Scraper {
         let settings = this.config[`${this.config.search_engine}_settings`];
 
         if (settings) {
-
             for (var key in settings) {
                 baseUrl += `${key}=${settings[key]}&`
             }
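The loop above is why the string-to-object parsing matters: for..in over a string iterates character indices, whereas for..in over the parsed object yields the intended query parameters. A small demonstration; the baseUrl value is an assumption used only for illustration:

// Same loop as in the hunk above, extracted into a helper for demonstration.
function buildUrl(baseUrl, settings) {
    for (var key in settings) {
        baseUrl += `${key}=${settings[key]}&`;
    }
    return baseUrl;
}

const raw = '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}';
console.log(buildUrl('https://www.google.com/search?', JSON.parse(raw)));
// https://www.google.com/search?gl=tr&hl=tr&num=50&start=0&
console.log(buildUrl('https://www.google.com/search?', raw));
// without parsing: ...?0={&1="&2=g&... (character indices, not parameters)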
@@ -133,7 +133,7 @@ module.exports.handler = async function handler (event, context, callback) {
                 pluggable: pluggable,
                 page: page,
             });
-            results = obj.run({});
+            results = await obj.run({});
             num_requests = obj.num_requests;
             metadata = obj.metadata;
         }
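The handler fix adds a missing await: Scraper.run() is async, so without await the handler would assign a pending Promise to results instead of the scraped data. A minimal illustration with a stand-in class:

// Stand-in class to illustrate the bug fixed by adding await.
class FakeScraper {
    async run() {
        return { results: ['...'] };
    }
}

const obj = new FakeScraper();

const wrong = obj.run({});              // a Promise, not the data
console.log(wrong instanceof Promise);  // true

(async () => {
    const right = await obj.run({});    // the fix: await resolves the Promise
    console.log(right);                 // { results: [ '...' ] }
})();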
@@ -310,7 +310,7 @@ function parseEventData(config) {
     }
 
     const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
-        'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work'];
+        'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work', 'apply_evasion_techniques'];
 
     for (b of booleans) {
         config[b] = _bool(config[b]);
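The new flag is also added to the list of keys that parseEventData() coerces to booleans. _bool() itself is not part of this diff; a plausible version of such a coercion helper (an assumption, not the library's actual implementation) might look like:

// Hypothetical coercion helper; the real _bool() is not shown in this diff.
function _bool(value) {
    if (typeof value === 'boolean') {
        return value;
    }
    if (typeof value === 'string') {
        return value.trim().toLowerCase() === 'true';
    }
    return undefined; // leave other types untouched
}

console.log(_bool('true'), _bool('false'), _bool(true)); // true false true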