Fixed some errors and greatly improved the README

This commit is contained in:
Nikolai Tschacher
2019-02-28 15:34:25 +01:00
parent 089e410ec6
commit 79d32a315a
20 changed files with 8817 additions and 366 deletions

View File

@ -1,15 +1,16 @@
const { Cluster } = require('./src/puppeteer-cluster/dist/index.js');
const handler = require('./src/node_scraper.js');
var fs = require('fs');
var os = require("os");
exports.scrape = async function(config, callback) {
exports.scrape = async function(user_config, callback) {
// options for scraping
event = {
let config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
random_user_agent: false,
// whether to select manual settings in visible mode
set_manual_settings: false,
// log ip address data
@ -18,7 +19,7 @@ exports.scrape = async function(config, callback) {
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
sleep_range: '',
// which search engine to scrape
search_engine: 'google',
compress: false, // compress
@ -48,22 +49,27 @@ exports.scrape = async function(config, callback) {
// this is a quick test and should be used for debugging
test_evasion: false,
// settings for puppeteer-cluster
monitor: false,
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 2,
}
};
// overwrite default config
for (var key in config) {
event[key] = config[key];
for (var key in user_config) {
config[key] = user_config[key];
}
if (fs.existsSync(event.keyword_file)) {
event.keywords = read_keywords_from_file(event.keyword_file);
if (fs.existsSync(config.keyword_file)) {
config.keywords = read_keywords_from_file(config.keyword_file);
}
if (fs.existsSync(event.proxy_file)) {
event.proxies = read_keywords_from_file(event.proxy_file);
if (event.verbose) {
console.log(`${event.proxies.length} proxies loaded.`);
if (fs.existsSync(config.proxy_file)) {
config.proxies = read_keywords_from_file(config.proxy_file);
if (config.verbose) {
console.log(`${config.proxies.length} proxies loaded.`);
}
}
@ -78,7 +84,7 @@ exports.scrape = async function(config, callback) {
}
}
await handler.handler(event, undefined, callback );
await handler.handler(config, undefined, callback );
};
function read_keywords_from_file(fname) {