updated to puppeteeer 2.0

This commit is contained in:
Nikolai Tschacher 2019-11-08 16:21:16 +01:00
parent da69913272
commit 1694ee92d0
4 changed files with 67 additions and 32 deletions

38
package-lock.json generated
View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.5.0",
"version": "1.5.7",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -297,7 +297,7 @@
},
"concat-stream": {
"version": "1.6.2",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "^1.0.0",
@ -308,7 +308,7 @@
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
@ -322,7 +322,7 @@
},
"string_decoder": {
"version": "1.1.1",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
@ -564,7 +564,7 @@
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "^4.0.3"
@ -795,9 +795,9 @@
}
},
"glob": {
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.4.tgz",
"integrity": "sha512-hkLPepehmnKk41pUGm3sYxoFs/umurYfYJCerbXEyFIWcAzvpipAgVkBqqT9RBKMGjnq6kMuyYwha6csxbiM1A==",
"version": "7.1.6",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz",
"integrity": "sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==",
"requires": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
@ -889,9 +889,9 @@
}
},
"https-proxy-agent": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
"integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==",
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
"integrity": "sha512-+ML2Rbh6DAuee7d07tYGEKOEi2voWPUGan+ExdPbPW6Z3svq+JCqr0v8WmKPOkz1vOVykPCBSuobe7G8GJUtVg==",
"requires": {
"agent-base": "^4.3.0",
"debug": "^3.1.0"
@ -1511,13 +1511,13 @@
}
},
"puppeteer": {
"version": "1.20.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.20.0.tgz",
"integrity": "sha512-bt48RDBy2eIwZPrkgbcwHtb51mj2nKvHOPMaSH2IsWiv7lOG9k9zhaRzpDZafrk05ajMc3cu+lSQYYOfH2DkVQ==",
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-2.0.0.tgz",
"integrity": "sha512-t3MmTWzQxPRP71teU6l0jX47PHXlc4Z52sQv4LJQSZLq1ttkKS2yGM3gaI57uQwZkNaoGd0+HPPMELZkcyhlqA==",
"requires": {
"debug": "^4.1.0",
"extract-zip": "^1.6.6",
"https-proxy-agent": "^2.2.1",
"https-proxy-agent": "^3.0.0",
"mime": "^2.0.3",
"progress": "^2.0.1",
"proxy-from-env": "^1.0.0",
@ -1525,14 +1525,6 @@
"ws": "^6.1.0"
}
},
"puppeteer-cluster": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.13.0.tgz",
"integrity": "sha512-en9F6cHkj1tLucFz9q3BtrvVKxGxIR1cWZgcpKyjXJUElBbNahaUErrz7jGa6edVQJfqTrdF40mkDqIOZNJUhg==",
"requires": {
"debug": "^4.1.1"
}
},
"puppeteer-extra": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.5.6",
"version": "1.5.7",
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
"homepage": "https://scrapeulous.com/",
"main": "index.js",
@ -26,7 +26,7 @@
"express": "^4.17.1",
"got": "^9.6.0",
"lodash": "^4.17.14",
"puppeteer": "^1.20.0",
"puppeteer": "^2.0.0",
"puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.378"

View File

@ -22,19 +22,24 @@ class YandexScraper extends Scraper {
try {
if (item) {
let linkElement = item.querySelector('a.link');
let linkElement = item.querySelector('h2 a.link');
if (linkElement) {
obj.link = linkElement.getAttribute('href');
obj.title = linkElement.innerText;
}
let label = linkElement.querySelector('.organic__subtitle .label');
if (label) {
let labelText = label.innerText;
if (labelText.trim() === 'ad') {
obj.is_ad = true;
}
let label = item.querySelector('.organic__subtitle .label');
if (label) {
let labelText = label.innerText;
if (labelText) {
labelText = labelText.trim().toLowerCase();
console.log(labelText);
let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio'];
obj.is_ad = ad_labels.includes(labelText);
}
}

View File

@ -29,6 +29,11 @@ async function yandex_ads() {
yandex_search_with_ads2( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['купить деревянные окна'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');
yandex_search_with_ads3( await scraper.scrape(scrape_config) );
await scraper.quit();
}
@ -83,6 +88,39 @@ function yandex_search_with_ads2(response) {
}
function yandex_search_with_ads3(response) {
assert.equal(response.metadata.num_requests, 1);
for (let query in response.results) {
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
// console.dir(obj.results, {depth: null, colors: true});
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
// at least 4 ads
let cnt = 0;
obj.results.forEach((res) => {
if (res.is_ad) {
cnt++;
}
});
assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');
confirm_results_ok(obj);
}
}
}
function confirm_results_ok(obj) {
for (let res of obj.results) {