From 1694ee92d0c7d4e821bfd8bfa78cb9f8c17474c0 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Fri, 8 Nov 2019 16:21:16 +0100 Subject: [PATCH] updated to puppeteeer 2.0 --- package-lock.json | 38 +++++++++++++++---------------------- package.json | 4 ++-- src/modules/yandex.js | 19 ++++++++++++------- test/static_tests/yandex.js | 38 +++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 32 deletions(-) diff --git a/package-lock.json b/package-lock.json index 7004e98..1648a56 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.5.0", + "version": "1.5.7", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -297,7 +297,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "requires": { "buffer-from": "^1.0.0", @@ -308,7 +308,7 @@ "dependencies": { "readable-stream": { "version": "2.3.6", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", "requires": { "core-util-is": "~1.0.0", @@ -322,7 +322,7 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "requires": { "safe-buffer": "~5.1.0" @@ -564,7 +564,7 @@ }, "es6-promisify": { "version": "5.0.0", - "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "requires": { "es6-promise": "^4.0.3" @@ -795,9 +795,9 @@ } }, "glob": { - "version": "7.1.4", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.4.tgz", - "integrity": "sha512-hkLPepehmnKk41pUGm3sYxoFs/umurYfYJCerbXEyFIWcAzvpipAgVkBqqT9RBKMGjnq6kMuyYwha6csxbiM1A==", + "version": "7.1.6", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz", + "integrity": "sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==", "requires": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", @@ -889,9 +889,9 @@ } }, "https-proxy-agent": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz", - "integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==", + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz", + "integrity": "sha512-+ML2Rbh6DAuee7d07tYGEKOEi2voWPUGan+ExdPbPW6Z3svq+JCqr0v8WmKPOkz1vOVykPCBSuobe7G8GJUtVg==", "requires": { "agent-base": "^4.3.0", "debug": "^3.1.0" @@ -1511,13 +1511,13 @@ } }, "puppeteer": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.20.0.tgz", - "integrity": "sha512-bt48RDBy2eIwZPrkgbcwHtb51mj2nKvHOPMaSH2IsWiv7lOG9k9zhaRzpDZafrk05ajMc3cu+lSQYYOfH2DkVQ==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-2.0.0.tgz", + "integrity": "sha512-t3MmTWzQxPRP71teU6l0jX47PHXlc4Z52sQv4LJQSZLq1ttkKS2yGM3gaI57uQwZkNaoGd0+HPPMELZkcyhlqA==", "requires": { "debug": "^4.1.0", "extract-zip": "^1.6.6", - "https-proxy-agent": "^2.2.1", + "https-proxy-agent": "^3.0.0", "mime": "^2.0.3", "progress": "^2.0.1", "proxy-from-env": "^1.0.0", @@ -1525,14 +1525,6 @@ "ws": "^6.1.0" } }, - "puppeteer-cluster": { - "version": "0.13.0", - "resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.13.0.tgz", - "integrity": "sha512-en9F6cHkj1tLucFz9q3BtrvVKxGxIR1cWZgcpKyjXJUElBbNahaUErrz7jGa6edVQJfqTrdF40mkDqIOZNJUhg==", - "requires": { - "debug": "^4.1.1" - } - }, "puppeteer-extra": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz", diff --git a/package.json b/package.json index 9504f6a..e476299 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.5.6", + "version": "1.5.7", "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo", "homepage": "https://scrapeulous.com/", "main": "index.js", @@ -26,7 +26,7 @@ "express": "^4.17.1", "got": "^9.6.0", "lodash": "^4.17.14", - "puppeteer": "^1.20.0", + "puppeteer": "^2.0.0", "puppeteer-extra": "^2.1.3", "puppeteer-extra-plugin-stealth": "^2.2.2", "user-agents": "^1.0.378" diff --git a/src/modules/yandex.js b/src/modules/yandex.js index c74b77b..890bbfa 100644 --- a/src/modules/yandex.js +++ b/src/modules/yandex.js @@ -22,19 +22,24 @@ class YandexScraper extends Scraper { try { if (item) { - let linkElement = item.querySelector('a.link'); + let linkElement = item.querySelector('h2 a.link'); if (linkElement) { obj.link = linkElement.getAttribute('href'); obj.title = linkElement.innerText; + } - let label = linkElement.querySelector('.organic__subtitle .label'); - if (label) { - let labelText = label.innerText; - if (labelText.trim() === 'ad') { - obj.is_ad = true; - } + let label = item.querySelector('.organic__subtitle .label'); + + if (label) { + let labelText = label.innerText; + + if (labelText) { + labelText = labelText.trim().toLowerCase(); + console.log(labelText); + let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio']; + obj.is_ad = ad_labels.includes(labelText); } } diff --git a/test/static_tests/yandex.js b/test/static_tests/yandex.js index fce5816..9cb806c 100644 --- a/test/static_tests/yandex.js +++ b/test/static_tests/yandex.js @@ -29,6 +29,11 @@ async function yandex_ads() { yandex_search_with_ads2( await scraper.scrape(scrape_config) ); + scrape_config.keywords = ['купить деревянные окна']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html'); + + yandex_search_with_ads3( await scraper.scrape(scrape_config) ); + await scraper.quit(); } @@ -83,6 +88,39 @@ function yandex_search_with_ads2(response) { } +function yandex_search_with_ads3(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + // console.dir(obj.results, {depth: null, colors: true}); + + assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + // at least 4 ads + let cnt = 0; + obj.results.forEach((res) => { + if (res.is_ad) { + cnt++; + } + }); + + assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results'); + + confirm_results_ok(obj); + } + } +} + + function confirm_results_ok(obj) { for (let res of obj.results) {