refactor(cluster): use custom concurrency for puppeteer-cluster

This commit is contained in:
HugoPoi 2019-12-20 19:44:59 +01:00
parent 301695cd2b
commit 8f40057534
6 changed files with 69 additions and 9 deletions

3
.gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "src/puppeteer-cluster"]
path = src/puppeteer-cluster
url = https://github.com/NikolaiT/puppeteer-cluster

8
package-lock.json generated
View File

@ -1350,6 +1350,14 @@
"ws": "^6.1.0"
}
},
"puppeteer-cluster": {
"version": "0.18.0",
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz",
"integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==",
"requires": {
"debug": "^4.1.1"
}
},
"puppeteer-extra": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",

View File

@ -5,7 +5,6 @@
"homepage": "https://scrapeulous.com/",
"main": "index.js",
"scripts": {
"postinstall": "cd src/puppeteer-cluster && npm install && npm run build",
"test": "mocha test/static_tests/"
},
"keywords": [
@ -27,6 +26,7 @@
"got": "^9.6.0",
"lodash": "^4.17.14",
"puppeteer": "^2.0.0",
"puppeteer-cluster": "^0.18.0",
"puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.378",

View File

@ -0,0 +1,54 @@
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
const debug = require('debug')('se-scraper:CustomConcurrency');
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
const BROWSER_TIMEOUT = 5000;
/**
 * Concurrency implementation for puppeteer-cluster that launches one browser
 * per worker, each with its own launch options.
 *
 * Launch options are taken from `this.options.perBrowserOptions`: every call
 * to `workerInstance()` removes the first entry of that array and uses it to
 * launch (and later relaunch) that worker's browser.
 */
class CustomConcurrency extends Browser {
    /** Nothing to set up: browsers are launched lazily per worker. */
    async init() {}

    /** Nothing to tear down here: each worker closes its own browser. */
    async close() {}

    /**
     * Launches a browser for a new worker and returns the worker handle.
     *
     * @returns {Promise<{jobInstance: Function, close: Function, repair: Function}>}
     *   `jobInstance` opens a fresh incognito context and page for a job,
     *   `close` shuts the browser down, and `repair` relaunches it.
     */
    async workerInstance() {
        // shift() consumes the entry, so each worker gets the next set of options.
        // NOTE(review): when the list is exhausted this is undefined and
        // launch() is called without options — confirm that is intended.
        const options = this.options.perBrowserOptions.shift();
        let chrome = await this.puppeteer.launch(options);

        return {
            jobInstance: async () => {
                // Keep page/context local to the job so that this job's close()
                // always closes the context it created, not a later one.
                let context;
                let page;
                await timeoutExecute(BROWSER_TIMEOUT, (async () => {
                    context = await chrome.createIncognitoBrowserContext();
                    page = await context.newPage();
                })());

                return {
                    resources: {
                        page,
                    },
                    close: async () => {
                        await timeoutExecute(BROWSER_TIMEOUT, context.close());
                    },
                };
            },

            close: async () => {
                await chrome.close();
            },

            repair: async () => {
                debug('Starting repair');
                try {
                    // will probably fail, but just in case the repair was not necessary
                    await chrome.close();
                } catch (e) {
                    // Best effort only: record the failure instead of hiding it.
                    debug('Closing browser during repair failed: %O', e);
                }

                // just relaunch as there is only one page per browser
                chrome = await this.puppeteer.launch(options);
            },
        };
    }
}
module.exports = CustomConcurrency;

View File

@ -6,6 +6,7 @@ const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const { Cluster } = require('puppeteer-cluster');
const UserAgent = require('user-agents');
const google = require('./modules/google.js');
@ -13,7 +14,7 @@ const bing = require('./modules/bing.js');
const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js');
const duckduckgo = require('./modules/duckduckgo.js');
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
const CustomConcurrencyImpl = require('./concurrency-implementation');
const MAX_ALLOWED_BROWSERS = 6;
@ -292,10 +293,11 @@ class ScrapeManager {
this.cluster = await Cluster.launch({
monitor: this.config.puppeteer_cluster_config.monitor,
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
concurrency: this.config.puppeteer_cluster_config.concurrency,
concurrency: CustomConcurrencyImpl,
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
puppeteerOptions: launch_args,
perBrowserOptions: perBrowserOptions,
puppeteerOptions: {
perBrowserOptions: perBrowserOptions
}
});
this.cluster.on('taskerror', (err, data) => {

@ -1 +0,0 @@
Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8