removed some superfluous stuff

This commit is contained in:
Nikolai Tschacher 2019-07-02 18:04:01 +02:00
parent 5e8ff1cb34
commit 09c1255400
6 changed files with 26 additions and 54 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 307 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

2
package-lock.json generated
View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.3.8", "version": "1.3.11",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.3.10", "version": "1.3.12",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",

View File

@ -67,10 +67,9 @@ module.exports = class Scraper {
if (!do_continue) { if (!do_continue) {
console.error('Failed to load the search engine: load_search_engine()'); console.error('Failed to load the search engine: load_search_engine()');
return this.results; } else {
}
await this.scraping_loop(); await this.scraping_loop();
}
return { return {
'results': this.results, 'results': this.results,

View File

@ -65,11 +65,12 @@ function getScraper(search_engine, args) {
class ScrapeManager { class ScrapeManager {
constructor(config = {}) { constructor(config, context={}) {
this.cluster = null; this.cluster = null;
this.pluggable = null; this.pluggable = null;
this.scraper = null; this.scraper = null;
this.context = context;
this.config = { this.config = {
// the user agent to scrape with // the user agent to scrape with
@ -146,7 +147,14 @@ class ScrapeManager {
this.config[key] = config[key]; this.config[key] = config[key];
} }
this.config = parseEventData(this.config); if (config.sleep_range) {
// parse an array
config.sleep_range = eval(config.sleep_range);
if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
throw "sleep_range is not a valid array of two integers.";
}
}
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
@ -245,6 +253,7 @@ class ScrapeManager {
if (this.pluggable) { if (this.pluggable) {
launch_args.config = this.config; launch_args.config = this.config;
this.browser = await this.pluggable.start_browser(launch_args); this.browser = await this.pluggable.start_browser(launch_args);
this.page = await this.browser.newPage();
} else { } else {
// if no custom start_browser functionality was given // if no custom start_browser functionality was given
// use puppeteer-cluster for scraping // use puppeteer-cluster for scraping
@ -338,23 +347,18 @@ class ScrapeManager {
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`) `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
} }
if (this.config.do_work && this.pluggable) { if (this.pluggable) {
let res = await this.pluggable.do_work(page); this.scraper = getScraper(this.config.search_engine, {
results = res.results; config: this.config,
num_requests = res.num_requests; context: this.context,
} else { pluggable: this.pluggable,
// const page = await this.browser.newPage(); page: this.page,
// this.scraper = getScraper(this.config.search_engine, { });
// config: this.config,
// context: {},
// pluggable: pluggable,
// page: page,
// });
// results = await this.scraper.run({});
// num_requests = this.scraper.num_requests;
// metadata = this.scraper.metadata;
// }
let res = await this.scraper.run(this.page);
results = res.results;
num_requests = this.scraper.num_requests;
} else {
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine. // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
// https://github.com/GoogleChrome/puppeteer/issues/678 // https://github.com/GoogleChrome/puppeteer/issues/678
// The question is: Is it possible to set proxies per Page? Per Browser? // The question is: Is it possible to set proxies per Page? Per Browser?
@ -469,37 +473,6 @@ class ScrapeManager {
} }
} }
function parseEventData(config) {
function _bool(e) {
e = String(e);
if (typeof e.trim === "function") {
return e.trim().toLowerCase() === 'true';
} else {
return e.toLowerCase() === 'true';
}
}
const booleans = ['upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work', 'apply_evasion_techniques'];
for (b of booleans) {
config[b] = _bool(config[b]);
}
if (config.sleep_range) {
// parse an array
config.sleep_range = eval(config.sleep_range);
if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
throw "sleep_range is not a valid array of two integers.";
}
}
return config;
}
module.exports = { module.exports = {
ScrapeManager: ScrapeManager, ScrapeManager: ScrapeManager,
}; };