mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-01-05 20:58:48 +01:00
88 lines
2.4 KiB
JavaScript
88 lines
2.4 KiB
JavaScript
const { Cluster } = require('../../puppeteer-cluster/dist/index.js');
|
|
var fs = require('fs');
|
|
var os = require("os");
|
|
|
|
const PROXY_FILE = '/home/nikolai/.proxies';
|
|
|
|
/**
 * Read a text file and return its non-empty lines as a list of items.
 *
 * Lines are split on both Unix (`\n`) and Windows (`\r\n`) line endings,
 * regardless of the platform the script runs on, and every item is trimmed
 * so stray whitespace or carriage returns never end up in the result.
 *
 * @param {string} fname - Path of the file to read (decoded as UTF-8).
 * @returns {string[]} Trimmed, non-empty lines in file order.
 * @throws {Error} Whatever `fs.readFileSync` throws (e.g. file not found).
 */
function read_items_from_file(fname) {
    return fs.readFileSync(fname, 'utf8')
        // Do not rely on os.EOL: the file may come from another platform.
        .split(/\r?\n/)
        .map((line) => line.trim())
        // Drop blank and whitespace-only lines.
        .filter((line) => line.length > 0);
}
|
|
|
|
/**
 * Entry point: launch one visible browser without a proxy plus one browser
 * per proxy listed in PROXY_FILE, let each of them fetch
 * http://ipinfo.io/json, and print the page title and page content.
 *
 * Any failure is reported on stderr and turns into a non-zero exit code.
 */
(async () => {
    // Chromium command-line switches shared by every browser instance.
    const browserArgs = [
        '--disable-infobars',
        '--window-position=0,0',
        // Spelling fixed ("certifcate" is not a switch Chromium recognizes).
        '--ignore-certificate-errors',
        // NOTE(review): this switch normally takes a list of SPKI hashes as
        // its value; without one it is presumably a no-op - confirm or drop.
        '--ignore-certificate-errors-spki-list',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        // Chromium expects "width,height"; the former "1920x1080" form is
        // not the documented format.
        '--window-size=1920,1080',
        '--hide-scrollbars',
    ];

    const proxies = read_items_from_file(PROXY_FILE);

    console.dir(proxies);

    // Build the launch options for one browser; every browser gets its own
    // args array so the shared list above is never mutated.
    const makeBrowserOptions = (extraArgs) => ({
        headless: false,
        ignoreHTTPSErrors: true,
        args: browserArgs.concat(extraArgs),
    });

    // each new call to workerInstance() will
    // left pop() one element from this list
    // maxConcurrency should be equal to perBrowserOptions.length
    const perBrowserOptions = [
        // the first browser config with home IP
        makeBrowserOptions([]),
    ].concat(
        proxies.map((proxy) => makeBrowserOptions([`--proxy-server=${proxy}`]))
    );

    const cluster = await Cluster.launch({
        monitor: true,
        timeout: 12 * 60 * 60 * 1000, // 12 hours in ms
        concurrency: Cluster.CONCURRENCY_BROWSER,
        maxConcurrency: perBrowserOptions.length,
        puppeteerOptions: makeBrowserOptions([]),
        perBrowserOptions: perBrowserOptions,
    });

    try {
        // Event handler to be called in case of problems
        cluster.on('taskerror', (err, data) => {
            console.log(`Error crawling ${data}: ${err.message}`);
        });

        await cluster.task(async ({ page, data: url }) => {
            await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20000 });
            const pageTitle = await page.evaluate(() => document.title);
            console.log(`Page title of ${url} is ${pageTitle}`);
            console.log(await page.content());
        });

        // Queue one job per browser configuration.
        for (let i = 0; i < perBrowserOptions.length; i++) {
            await cluster.queue('http://ipinfo.io/json');
        }

        await cluster.idle();
    } finally {
        // Always shut the browsers down, even when setup or queuing failed.
        await cluster.close();
    }
})().catch((err) => {
    // Do not leave the promise floating: report the failure and signal it
    // through the exit code.
    console.error(err);
    process.exitCode = 1;
});
|