mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 15:13:13 +01:00
refactor(cluster): use custom concurrency for puppeteer-cluster
This commit is contained in:
parent
301695cd2b
commit
8f40057534
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +0,0 @@
|
||||
[submodule "src/puppeteer-cluster"]
|
||||
path = src/puppeteer-cluster
|
||||
url = https://github.com/NikolaiT/puppeteer-cluster
|
8
package-lock.json
generated
8
package-lock.json
generated
@ -1350,6 +1350,14 @@
|
||||
"ws": "^6.1.0"
|
||||
}
|
||||
},
|
||||
"puppeteer-cluster": {
|
||||
"version": "0.18.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz",
|
||||
"integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==",
|
||||
"requires": {
|
||||
"debug": "^4.1.1"
|
||||
}
|
||||
},
|
||||
"puppeteer-extra": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
||||
|
@ -5,7 +5,6 @@
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"postinstall": "cd src/puppeteer-cluster && npm install && npm run build",
|
||||
"test": "mocha test/static_tests/"
|
||||
},
|
||||
"keywords": [
|
||||
@ -27,6 +26,7 @@
|
||||
"got": "^9.6.0",
|
||||
"lodash": "^4.17.14",
|
||||
"puppeteer": "^2.0.0",
|
||||
"puppeteer-cluster": "^0.18.0",
|
||||
"puppeteer-extra": "^2.1.3",
|
||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||
"user-agents": "^1.0.378",
|
||||
|
54
src/concurrency-implementation.js
Normal file
54
src/concurrency-implementation.js
Normal file
@ -0,0 +1,54 @@
|
||||
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
|
||||
const debug = require('debug')('se-scraper:CustomConcurrency');
|
||||
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
|
||||
|
||||
const BROWSER_TIMEOUT = 5000;
|
||||
|
||||
/**
 * Custom puppeteer-cluster concurrency implementation built on the library's
 * `Browser` concurrency class.
 *
 * Each worker launches its own browser, taking its launch options from the
 * front of `this.options.perBrowserOptions`. Every job then runs in a fresh
 * incognito browser context with a single new page.
 */
class CustomConcurrency extends Browser {

    /** Nothing to set up globally: browsers are launched per worker. */
    async init() {}

    /** Nothing to tear down globally: each worker closes its own browser. */
    async close() {}

    /**
     * Launches the browser for one worker and returns the worker handle.
     *
     * NOTE: this removes the first entry of `this.options.perBrowserOptions`
     * (the array is mutated). Once the array is empty, `shift()` yields
     * `undefined` and `launch` is called with `undefined`.
     *
     * @returns {Promise<{jobInstance: Function, close: Function, repair: Function}>}
     *   `jobInstance` creates the per-job page, `close` shuts the browser down,
     *   and `repair` relaunches the browser with the same options.
     */
    async workerInstance() {
        const options = this.options.perBrowserOptions.shift();
        let chrome = await this.puppeteer.launch(options);

        return {
            jobInstance: async () => {
                // Scoped per job (not per worker) so that this job's close()
                // always releases the context created right here, even if
                // another job instance is created before close() runs.
                let context;
                let page;

                await timeoutExecute(BROWSER_TIMEOUT, (async () => {
                    context = await chrome.createIncognitoBrowserContext();
                    page = await context.newPage();
                })());

                return {
                    resources: {
                        page,
                    },

                    close: async () => {
                        await timeoutExecute(BROWSER_TIMEOUT, context.close());
                    },
                };
            },

            close: async () => {
                await chrome.close();
            },

            repair: async () => {
                debug('Starting repair');
                try {
                    // will probably fail, but just in case the repair was not necessary
                    await chrome.close();
                } catch (e) {
                    // Best effort only: record the failure instead of hiding it.
                    debug('Ignoring error while closing browser during repair: %O', e);
                }

                // just relaunch as there is only one page per browser
                chrome = await this.puppeteer.launch(options);
            },
        };
    }
}
|
||||
|
||||
module.exports = CustomConcurrency;
|
@ -6,6 +6,7 @@ const _ = require('lodash');
|
||||
const { createLogger, format, transports } = require('winston');
|
||||
const { combine, timestamp, printf } = format;
|
||||
const debug = require('debug')('se-scraper:ScrapeManager');
|
||||
const { Cluster } = require('puppeteer-cluster');
|
||||
|
||||
const UserAgent = require('user-agents');
|
||||
const google = require('./modules/google.js');
|
||||
@ -13,7 +14,7 @@ const bing = require('./modules/bing.js');
|
||||
const yandex = require('./modules/yandex.js');
|
||||
const infospace = require('./modules/infospace.js');
|
||||
const duckduckgo = require('./modules/duckduckgo.js');
|
||||
const { Cluster } = require('./puppeteer-cluster/dist/index.js');
|
||||
const CustomConcurrencyImpl = require('./concurrency-implementation');
|
||||
|
||||
const MAX_ALLOWED_BROWSERS = 6;
|
||||
|
||||
@ -292,10 +293,11 @@ class ScrapeManager {
|
||||
this.cluster = await Cluster.launch({
|
||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
||||
concurrency: this.config.puppeteer_cluster_config.concurrency,
|
||||
concurrency: CustomConcurrencyImpl,
|
||||
maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency,
|
||||
puppeteerOptions: launch_args,
|
||||
perBrowserOptions: perBrowserOptions,
|
||||
puppeteerOptions: {
|
||||
perBrowserOptions: perBrowserOptions
|
||||
}
|
||||
});
|
||||
|
||||
this.cluster.on('taskerror', (err, data) => {
|
||||
|
@ -1 +0,0 @@
|
||||
Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8
|
Loading…
Reference in New Issue
Block a user