mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-08-14 07:48:21 +02:00
Fixed some errors and substantially improved the README
This commit is contained in:
34
index.js
34
index.js
@ -1,15 +1,16 @@
|
||||
const { Cluster } = require('./src/puppeteer-cluster/dist/index.js');
|
||||
const handler = require('./src/node_scraper.js');
|
||||
var fs = require('fs');
|
||||
var os = require("os");
|
||||
|
||||
exports.scrape = async function(config, callback) {
|
||||
exports.scrape = async function(user_config, callback) {
|
||||
|
||||
// options for scraping
|
||||
event = {
|
||||
let config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
// if random_user_agent is set to True, a random user agent is chosen
|
||||
random_user_agent: true,
|
||||
random_user_agent: false,
|
||||
// whether to select manual settings in visible mode
|
||||
set_manual_settings: false,
|
||||
// log ip address data
|
||||
@ -18,7 +19,7 @@ exports.scrape = async function(config, callback) {
|
||||
log_http_headers: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
sleep_range: '',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
compress: false, // compress
|
||||
@ -48,22 +49,27 @@ exports.scrape = async function(config, callback) {
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
// settings for puppeteer-cluster
|
||||
monitor: false,
|
||||
puppeteer_cluster_config: {
|
||||
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||
monitor: false,
|
||||
concurrency: Cluster.CONCURRENCY_BROWSER,
|
||||
maxConcurrency: 2,
|
||||
}
|
||||
};
|
||||
|
||||
// overwrite default config
|
||||
for (var key in config) {
|
||||
event[key] = config[key];
|
||||
for (var key in user_config) {
|
||||
config[key] = user_config[key];
|
||||
}
|
||||
|
||||
if (fs.existsSync(event.keyword_file)) {
|
||||
event.keywords = read_keywords_from_file(event.keyword_file);
|
||||
if (fs.existsSync(config.keyword_file)) {
|
||||
config.keywords = read_keywords_from_file(config.keyword_file);
|
||||
}
|
||||
|
||||
if (fs.existsSync(event.proxy_file)) {
|
||||
event.proxies = read_keywords_from_file(event.proxy_file);
|
||||
if (event.verbose) {
|
||||
console.log(`${event.proxies.length} proxies loaded.`);
|
||||
if (fs.existsSync(config.proxy_file)) {
|
||||
config.proxies = read_keywords_from_file(config.proxy_file);
|
||||
if (config.verbose) {
|
||||
console.log(`${config.proxies.length} proxies loaded.`);
|
||||
}
|
||||
}
|
||||
|
||||
@ -78,7 +84,7 @@ exports.scrape = async function(config, callback) {
|
||||
}
|
||||
}
|
||||
|
||||
await handler.handler(event, undefined, callback );
|
||||
await handler.handler(config, undefined, callback );
|
||||
};
|
||||
|
||||
function read_keywords_from_file(fname) {
|
||||
|
Reference in New Issue
Block a user