mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 17:47:49 +02:00
removed some superfluous stuff
This commit is contained in:
parent
5e8ff1cb34
commit
09c1255400
BIN
debug_se_scraper_google_apple tree.png
Normal file
BIN
debug_se_scraper_google_apple tree.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 307 KiB |
Binary file not shown.
After Width: | Height: | Size: 92 KiB |
2
package-lock.json
generated
2
package-lock.json
generated
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.3.8",
|
"version": "1.3.11",
|
||||||
"lockfileVersion": 1,
|
"lockfileVersion": 1,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.3.10",
|
"version": "1.3.12",
|
||||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -67,11 +67,10 @@ module.exports = class Scraper {
|
|||||||
|
|
||||||
if (!do_continue) {
|
if (!do_continue) {
|
||||||
console.error('Failed to load the search engine: load_search_engine()');
|
console.error('Failed to load the search engine: load_search_engine()');
|
||||||
return this.results;
|
} else {
|
||||||
|
await this.scraping_loop();
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.scraping_loop();
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'results': this.results,
|
'results': this.results,
|
||||||
'html_output': this.html_output,
|
'html_output': this.html_output,
|
||||||
|
@ -65,11 +65,12 @@ function getScraper(search_engine, args) {
|
|||||||
|
|
||||||
class ScrapeManager {
|
class ScrapeManager {
|
||||||
|
|
||||||
constructor(config = {}) {
|
constructor(config, context={}) {
|
||||||
|
|
||||||
this.cluster = null;
|
this.cluster = null;
|
||||||
this.pluggable = null;
|
this.pluggable = null;
|
||||||
this.scraper = null;
|
this.scraper = null;
|
||||||
|
this.context = context;
|
||||||
|
|
||||||
this.config = {
|
this.config = {
|
||||||
// the user agent to scrape with
|
// the user agent to scrape with
|
||||||
@ -146,7 +147,14 @@ class ScrapeManager {
|
|||||||
this.config[key] = config[key];
|
this.config[key] = config[key];
|
||||||
}
|
}
|
||||||
|
|
||||||
this.config = parseEventData(this.config);
|
if (config.sleep_range) {
|
||||||
|
// parse an array
|
||||||
|
config.sleep_range = eval(config.sleep_range);
|
||||||
|
|
||||||
|
if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
|
||||||
|
throw "sleep_range is not a valid array of two integers.";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
||||||
|
|
||||||
@ -245,6 +253,7 @@ class ScrapeManager {
|
|||||||
if (this.pluggable) {
|
if (this.pluggable) {
|
||||||
launch_args.config = this.config;
|
launch_args.config = this.config;
|
||||||
this.browser = await this.pluggable.start_browser(launch_args);
|
this.browser = await this.pluggable.start_browser(launch_args);
|
||||||
|
this.page = await this.browser.newPage();
|
||||||
} else {
|
} else {
|
||||||
// if no custom start_browser functionality was given
|
// if no custom start_browser functionality was given
|
||||||
// use puppeteer-cluster for scraping
|
// use puppeteer-cluster for scraping
|
||||||
@ -338,23 +347,18 @@ class ScrapeManager {
|
|||||||
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
|
`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.do_work && this.pluggable) {
|
if (this.pluggable) {
|
||||||
let res = await this.pluggable.do_work(page);
|
this.scraper = getScraper(this.config.search_engine, {
|
||||||
results = res.results;
|
config: this.config,
|
||||||
num_requests = res.num_requests;
|
context: this.context,
|
||||||
} else {
|
pluggable: this.pluggable,
|
||||||
// const page = await this.browser.newPage();
|
page: this.page,
|
||||||
// this.scraper = getScraper(this.config.search_engine, {
|
});
|
||||||
// config: this.config,
|
|
||||||
// context: {},
|
|
||||||
// pluggable: pluggable,
|
|
||||||
// page: page,
|
|
||||||
// });
|
|
||||||
// results = await this.scraper.run({});
|
|
||||||
// num_requests = this.scraper.num_requests;
|
|
||||||
// metadata = this.scraper.metadata;
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
let res = await this.scraper.run(this.page);
|
||||||
|
results = res.results;
|
||||||
|
num_requests = this.scraper.num_requests;
|
||||||
|
} else {
|
||||||
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
||||||
// https://github.com/GoogleChrome/puppeteer/issues/678
|
// https://github.com/GoogleChrome/puppeteer/issues/678
|
||||||
// The question is: Is it possible to set proxies per Page? Per Browser?
|
// The question is: Is it possible to set proxies per Page? Per Browser?
|
||||||
@ -469,37 +473,6 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseEventData(config) {
|
|
||||||
|
|
||||||
function _bool(e) {
|
|
||||||
e = String(e);
|
|
||||||
if (typeof e.trim === "function") {
|
|
||||||
return e.trim().toLowerCase() === 'true';
|
|
||||||
} else {
|
|
||||||
return e.toLowerCase() === 'true';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const booleans = ['upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
|
|
||||||
'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion', 'do_work', 'apply_evasion_techniques'];
|
|
||||||
|
|
||||||
for (b of booleans) {
|
|
||||||
config[b] = _bool(config[b]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (config.sleep_range) {
|
|
||||||
// parse an array
|
|
||||||
config.sleep_range = eval(config.sleep_range);
|
|
||||||
|
|
||||||
if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
|
|
||||||
throw "sleep_range is not a valid array of two integers.";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return config;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
ScrapeManager: ScrapeManager,
|
ScrapeManager: ScrapeManager,
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user