diff --git a/.gitmodules b/.gitmodules index caa6108..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "src/puppeteer-cluster"] - path = src/puppeteer-cluster - url = https://github.com/NikolaiT/puppeteer-cluster diff --git a/package-lock.json b/package-lock.json index 671d279..e6e4d33 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1350,6 +1350,14 @@ "ws": "^6.1.0" } }, + "puppeteer-cluster": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.18.0.tgz", + "integrity": "sha512-sMhK7foa+gq8khtgvKar6xwIjmPFS92ZrDsnpMHcQMX2Q8hWmVSViHG7RhuAOmbfIbS7Ya+lViXlx1xtgUjarQ==", + "requires": { + "debug": "^4.1.1" + } + }, "puppeteer-extra": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz", diff --git a/package.json b/package.json index e4c4bad..662d902 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,6 @@ "homepage": "https://scrapeulous.com/", "main": "index.js", "scripts": { - "postinstall": "cd src/puppeteer-cluster && npm install && npm run build", "test": "mocha test/static_tests/" }, "keywords": [ @@ -27,6 +26,7 @@ "got": "^9.6.0", "lodash": "^4.17.14", "puppeteer": "^2.0.0", + "puppeteer-cluster": "^0.18.0", "puppeteer-extra": "^2.1.3", "puppeteer-extra-plugin-stealth": "^2.2.2", "user-agents": "^1.0.378", diff --git a/src/concurrency-implementation.js b/src/concurrency-implementation.js new file mode 100644 index 0000000..d06650e --- /dev/null +++ b/src/concurrency-implementation.js @@ -0,0 +1,54 @@ +const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency'); +const debug = require('debug')('se-scraper:CustomConcurrency'); +const { timeoutExecute } = require('puppeteer-cluster/dist/util'); + +const BROWSER_TIMEOUT = 5000; + +class CustomConcurrency extends Browser { + + async init() {} + async close() {} + + async workerInstance() { + const options = this.options.perBrowserOptions.shift(); + let chrome = await this.puppeteer.launch(options); + let page; + let context; + + return { + jobInstance: async () => { + await timeoutExecute(BROWSER_TIMEOUT, (async () => { + context = await chrome.createIncognitoBrowserContext(); + page = await context.newPage(); + })()); + + return { + resources: { + page, + }, + + close: async () => { + await timeoutExecute(BROWSER_TIMEOUT, context.close()); + }, + }; + }, + + close: async () => { + await chrome.close(); + }, + + repair: async () => { + debug('Starting repair'); + try { + // will probably fail, but just in case the repair was not necessary + await chrome.close(); + } catch (e) {} + + // just relaunch as there is only one page per browser + chrome = await this.puppeteer.launch(options); + }, + }; + } +}; + +module.exports = CustomConcurrency; \ No newline at end of file diff --git a/src/node_scraper.js b/src/node_scraper.js index 2f51169..c42e95e 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -6,6 +6,7 @@ const _ = require('lodash'); const { createLogger, format, transports } = require('winston'); const { combine, timestamp, printf } = format; const debug = require('debug')('se-scraper:ScrapeManager'); +const { Cluster } = require('puppeteer-cluster'); const UserAgent = require('user-agents'); const google = require('./modules/google.js'); @@ -13,7 +14,7 @@ const bing = require('./modules/bing.js'); const yandex = require('./modules/yandex.js'); const infospace = require('./modules/infospace.js'); const duckduckgo = require('./modules/duckduckgo.js'); -const { Cluster } = require('./puppeteer-cluster/dist/index.js'); +const CustomConcurrencyImpl = require('./concurrency-implementation'); const MAX_ALLOWED_BROWSERS = 6; @@ -292,10 +293,11 @@ class ScrapeManager { this.cluster = await Cluster.launch({ monitor: this.config.puppeteer_cluster_config.monitor, timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes - concurrency: this.config.puppeteer_cluster_config.concurrency, + concurrency: CustomConcurrencyImpl, maxConcurrency: this.config.puppeteer_cluster_config.maxConcurrency, - puppeteerOptions: launch_args, - perBrowserOptions: perBrowserOptions, + puppeteerOptions: { + perBrowserOptions: perBrowserOptions + } }); this.cluster.on('taskerror', (err, data) => { diff --git a/src/puppeteer-cluster b/src/puppeteer-cluster deleted file mode 160000 index f333cd0..0000000 --- a/src/puppeteer-cluster +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f333cd0ffc7d5dbbc1b7df255986c13f353672d8