forked from extern/se-scraper
refactor(cluster): use custom concurrency for puppeteer-cluster
This commit is contained in:
parent
301695cd2b
commit
8f40057534
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +0,0 @@
|
|||||||
[submodule "src/puppeteer-cluster"]
|
|
||||||
path = src/puppeteer-cluster
|
|
||||||
url = https://github.com/NikolaiT/puppeteer-cluster
|
|
8
package-lock.json
generated
8
package-lock.json
generated
@ -1350,6 +1350,14 @@
|
|||||||
"ws": "^6.1.0"
|
"ws": "^6.1.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"puppeteer-cluster": {
|
||||||
|
"version": "0.18.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz",
|
||||||
|
"integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==",
|
||||||
|
"requires": {
|
||||||
|
"debug": "^4.1.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"puppeteer-extra": {
|
"puppeteer-extra": {
|
||||||
"version": "2.1.3",
|
"version": "2.1.3",
|
||||||
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
||||||
|
@ -5,7 +5,6 @@
|
|||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"postinstall": "cd src/puppeteer-cluster && npm install && npm run build",
|
|
||||||
"test": "mocha test/static_tests/"
|
"test": "mocha test/static_tests/"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
@ -27,6 +26,7 @@
|
|||||||
"got": "^9.6.0",
|
"got": "^9.6.0",
|
||||||
"lodash": "^4.17.14",
|
"lodash": "^4.17.14",
|
||||||
"puppeteer": "^2.0.0",
|
"puppeteer": "^2.0.0",
|
||||||
|
"puppeteer-cluster": "^0.18.0",
|
||||||
"puppeteer-extra": "^2.1.3",
|
"puppeteer-extra": "^2.1.3",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||||
"user-agents": "^1.0.378",
|
"user-agents": "^1.0.378",
|
||||||
|
54
src/concurrency-implementation.js
Normal file
54
src/concurrency-implementation.js
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
|
||||||
|
const debug = require('debug')('se-scraper:CustomConcurrency');
|
||||||
|
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
|
||||||
|
|
||||||
|
// Time limit handed to timeoutExecute when creating and when closing a per-job
// browser context (presumably milliseconds; confirm against puppeteer-cluster's util).
const BROWSER_TIMEOUT = 5000;
|
||||||
|
|
||||||
|
/**
 * Concurrency implementation for puppeteer-cluster in which every worker
 * launches its own browser with its own launch options.
 *
 * The launch options for a worker are removed from the front of
 * `this.options.perBrowserOptions`, so each entry is used by one worker.
 * Every job runs on a new page inside a new incognito browser context
 * of its worker's browser.
 */
class CustomConcurrency extends Browser {

    /** No-op. */
    async init() {}

    /** No-op. */
    async close() {}

    /**
     * Launches a browser for one worker and returns the worker's handlers.
     *
     * @returns {Promise<{jobInstance: Function, close: Function, repair: Function}>}
     *   `jobInstance` creates the per-job page, `close` closes the browser,
     *   `repair` closes (best effort) and relaunches the browser.
     */
    async workerInstance() {
        // shift() consumes the entry, so the next worker gets the next options
        // object. If the list is exhausted this is undefined and is passed
        // to launch() as-is.
        const options = this.options.perBrowserOptions.shift();
        let chrome = await this.puppeteer.launch(options);

        return {
            jobInstance: async () => {
                // Kept local to this job so that a context creation which
                // finishes after its timeout cannot overwrite the handles
                // of a later job running on the same worker.
                let context;
                let page;

                await timeoutExecute(BROWSER_TIMEOUT, (async () => {
                    context = await chrome.createIncognitoBrowserContext();
                    page = await context.newPage();
                })());

                return {
                    resources: {
                        page,
                    },

                    close: async () => {
                        await timeoutExecute(BROWSER_TIMEOUT, context.close());
                    },
                };
            },

            close: async () => {
                await chrome.close();
            },

            repair: async () => {
                debug('Starting repair');
                try {
                    // will probably fail, but just in case the repair was not necessary
                    await chrome.close();
                } catch (e) {
                    // Best effort only: report the failure and carry on with the relaunch.
                    debug('Closing the browser before relaunch failed: %O', e);
                }

                // just relaunch as there is only one page per browser
                chrome = await this.puppeteer.launch(options);
            },
        };
    }
}
|
||||||
|
|
||||||
|
module.exports = CustomConcurrency;
|
@ -6,6 +6,7 @@ const _ = require('lodash');
|
|||||||
const { createLogger, format, transports } = require('winston');
|
const { createLogger, format, transports } = require('winston');
|
||||||
const { combine, timestamp, printf } = format;
|
const { combine, timestamp, printf } = format;
|
||||||
const debug = require('debug')('se-scraper:ScrapeManager');
|
const debug = require('debug')('se-scraper:ScrapeManager');
|
||||||
|
const { Cluster } = require('puppeteer-cluster');
|
||||||
|
|
||||||
const UserAgent = require('user-agents');
|
const UserAgent = require('user-agents');
|
||||||
const google = require('./modules/google.js');
|
const google = require('./modules/google.js');
|
||||||
@ -13,7 +14,7 @@ const bing = require('./modules/bing.js');
|
|||||||
const yandex = require('./modules/yandex.js');
|
const yandex = require('./modules/yandex.js');
|
||||||
const infospace = require('./modules/infospace.js');
|
const infospace = require('./modules/infospace.js');
|
||||||
const duckduckgo = require('./modules/duckduckgo.js');
|
const duckduckgo = require('./modules/duckduckgo.js');
|
||||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
const CustomConcurrencyImpl = require('./concurrency-implementation');
|
||||||
|
|
||||||
const MAX_ALLOWED_BROWSERS = 6;
|
const MAX_ALLOWED_BROWSERS = 6;
|
||||||
|
|
||||||
@ -292,10 +293,11 @@ class ScrapeManager {
|
|||||||
this.cluster = await Cluster.launch({
|
this.cluster = await Cluster.launch({
|
||||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||||
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
||||||
concurrency: this.config.puppeteer_cluster_config.concurrency,
|
concurrency: CustomConcurrencyImpl,
|
||||||
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
|
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
|
||||||
puppeteerOptions: launch_args,
|
puppeteerOptions: {
|
||||||
perBrowserOptions: perBrowserOptions,
|
perBrowserOptions: perBrowserOptions
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
this.cluster.on('taskerror', (err, data) => {
|
this.cluster.on('taskerror', (err, data) => {
|
||||||
|
@ -1 +0,0 @@
|
|||||||
Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8
|
|
Loading…
Reference in New Issue
Block a user