se-scraper/examples/test_cluster.js

88 lines
2.4 KiB
JavaScript
Raw Permalink Normal View History

const { Cluster } = require('../../puppeteer-cluster/dist/index.js');
var fs = require('fs');
var os = require("os");
const PROXY_FILE = '/home/nikolai/.proxies';
function read_items_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
});
return kws;
}
(async () => {
let browserArgs = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
let proxies = read_items_from_file(PROXY_FILE);
console.dir(proxies);
// each new call to workerInstance() will
// left pop() one element from this list
// maxConcurrency should be equal to perBrowserOptions.length
// the first browser config with home IP
let perBrowserOptions = [{
headless: false,
ignoreHTTPSErrors: true,
args: browserArgs
}];
for (var proxy of proxies) {
perBrowserOptions.push({
headless: false,
ignoreHTTPSErrors: true,
args: browserArgs.concat(`--proxy-server=${proxy}`)
})
}
const cluster = await Cluster.launch({
monitor: true,
timeout: 12 * 60 * 60 * 1000, // 12 hours in ms
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: perBrowserOptions.length,
puppeteerOptions: {
headless: false,
args: browserArgs,
ignoreHTTPSErrors: true,
},
perBrowserOptions: perBrowserOptions
});
// Event handler to be called in case of problems
cluster.on('taskerror', (err, data) => {
console.log(`Error crawling ${data}: ${err.message}`);
});
await cluster.task(async ({ page, data: url }) => {
await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000});
const pageTitle = await page.evaluate(() => document.title);
console.log(`Page title of ${url} is ${pageTitle}`);
console.log(await page.content());
});
for(var i = 0; i < perBrowserOptions.length; i++) {
await cluster.queue('http://ipinfo.io/json');
}
await cluster.idle();
await cluster.close();
})();