mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-07 16:23:58 +01:00
updated to puppeteer 2.0
This commit is contained in:
parent
da69913272
commit
1694ee92d0
38
package-lock.json
generated
38
package-lock.json
generated
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.5.0",
|
||||
"version": "1.5.7",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
@ -297,7 +297,7 @@
|
||||
},
|
||||
"concat-stream": {
|
||||
"version": "1.6.2",
|
||||
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
||||
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
|
||||
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
|
||||
"requires": {
|
||||
"buffer-from": "^1.0.0",
|
||||
@ -308,7 +308,7 @@
|
||||
"dependencies": {
|
||||
"readable-stream": {
|
||||
"version": "2.3.6",
|
||||
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
|
||||
"requires": {
|
||||
"core-util-is": "~1.0.0",
|
||||
@ -322,7 +322,7 @@
|
||||
},
|
||||
"string_decoder": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
||||
"requires": {
|
||||
"safe-buffer": "~5.1.0"
|
||||
@ -564,7 +564,7 @@
|
||||
},
|
||||
"es6-promisify": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
||||
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
|
||||
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
|
||||
"requires": {
|
||||
"es6-promise": "^4.0.3"
|
||||
@ -795,9 +795,9 @@
|
||||
}
|
||||
},
|
||||
"glob": {
|
||||
"version": "7.1.4",
|
||||
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.4.tgz",
|
||||
"integrity": "sha512-hkLPepehmnKk41pUGm3sYxoFs/umurYfYJCerbXEyFIWcAzvpipAgVkBqqT9RBKMGjnq6kMuyYwha6csxbiM1A==",
|
||||
"version": "7.1.6",
|
||||
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz",
|
||||
"integrity": "sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==",
|
||||
"requires": {
|
||||
"fs.realpath": "^1.0.0",
|
||||
"inflight": "^1.0.4",
|
||||
@ -889,9 +889,9 @@
|
||||
}
|
||||
},
|
||||
"https-proxy-agent": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.2.tgz",
|
||||
"integrity": "sha512-c8Ndjc9Bkpfx/vCJueCPy0jlP4ccCCSNDp8xwCZzPjKJUm+B+u9WX2x98Qx4n1PiMNTWo3D7KK5ifNV/yJyRzg==",
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz",
|
||||
"integrity": "sha512-+ML2Rbh6DAuee7d07tYGEKOEi2voWPUGan+ExdPbPW6Z3svq+JCqr0v8WmKPOkz1vOVykPCBSuobe7G8GJUtVg==",
|
||||
"requires": {
|
||||
"agent-base": "^4.3.0",
|
||||
"debug": "^3.1.0"
|
||||
@ -1511,13 +1511,13 @@
|
||||
}
|
||||
},
|
||||
"puppeteer": {
|
||||
"version": "1.20.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.20.0.tgz",
|
||||
"integrity": "sha512-bt48RDBy2eIwZPrkgbcwHtb51mj2nKvHOPMaSH2IsWiv7lOG9k9zhaRzpDZafrk05ajMc3cu+lSQYYOfH2DkVQ==",
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-2.0.0.tgz",
|
||||
"integrity": "sha512-t3MmTWzQxPRP71teU6l0jX47PHXlc4Z52sQv4LJQSZLq1ttkKS2yGM3gaI57uQwZkNaoGd0+HPPMELZkcyhlqA==",
|
||||
"requires": {
|
||||
"debug": "^4.1.0",
|
||||
"extract-zip": "^1.6.6",
|
||||
"https-proxy-agent": "^2.2.1",
|
||||
"https-proxy-agent": "^3.0.0",
|
||||
"mime": "^2.0.3",
|
||||
"progress": "^2.0.1",
|
||||
"proxy-from-env": "^1.0.0",
|
||||
@ -1525,14 +1525,6 @@
|
||||
"ws": "^6.1.0"
|
||||
}
|
||||
},
|
||||
"puppeteer-cluster": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.13.0.tgz",
|
||||
"integrity": "sha512-en9F6cHkj1tLucFz9q3BtrvVKxGxIR1cWZgcpKyjXJUElBbNahaUErrz7jGa6edVQJfqTrdF40mkDqIOZNJUhg==",
|
||||
"requires": {
|
||||
"debug": "^4.1.1"
|
||||
}
|
||||
},
|
||||
"puppeteer-extra": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-2.1.3.tgz",
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.5.6",
|
||||
"version": "1.5.7",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
@ -26,7 +26,7 @@
|
||||
"express": "^4.17.1",
|
||||
"got": "^9.6.0",
|
||||
"lodash": "^4.17.14",
|
||||
"puppeteer": "^1.20.0",
|
||||
"puppeteer": "^2.0.0",
|
||||
"puppeteer-extra": "^2.1.3",
|
||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||
"user-agents": "^1.0.378"
|
||||
|
@ -22,19 +22,24 @@ class YandexScraper extends Scraper {
|
||||
try {
|
||||
if (item) {
|
||||
|
||||
let linkElement = item.querySelector('a.link');
|
||||
let linkElement = item.querySelector('h2 a.link');
|
||||
|
||||
if (linkElement) {
|
||||
obj.link = linkElement.getAttribute('href');
|
||||
obj.title = linkElement.innerText;
|
||||
}
|
||||
|
||||
let label = linkElement.querySelector('.organic__subtitle .label');
|
||||
|
||||
if (label) {
|
||||
let labelText = label.innerText;
|
||||
if (labelText.trim() === 'ad') {
|
||||
obj.is_ad = true;
|
||||
}
|
||||
let label = item.querySelector('.organic__subtitle .label');
|
||||
|
||||
if (label) {
|
||||
let labelText = label.innerText;
|
||||
|
||||
if (labelText) {
|
||||
labelText = labelText.trim().toLowerCase();
|
||||
console.log(labelText);
|
||||
let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio'];
|
||||
obj.is_ad = ad_labels.includes(labelText);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -29,6 +29,11 @@ async function yandex_ads() {
|
||||
|
||||
yandex_search_with_ads2( await scraper.scrape(scrape_config) );
|
||||
|
||||
scrape_config.keywords = ['купить деревянные окна'];
|
||||
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/yandex3.html');
|
||||
|
||||
yandex_search_with_ads3( await scraper.scrape(scrape_config) );
|
||||
|
||||
await scraper.quit();
|
||||
}
|
||||
|
||||
@ -83,6 +88,39 @@ function yandex_search_with_ads2(response) {
|
||||
}
|
||||
|
||||
|
||||
function yandex_search_with_ads3(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
// console.dir(obj.results, {depth: null, colors: true});
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'num_results',], 'not all keys are in the object');
|
||||
assert.isAtLeast(obj.results.length, 14, 'results must have at least 14 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
// at least 4 ads
|
||||
let cnt = 0;
|
||||
obj.results.forEach((res) => {
|
||||
if (res.is_ad) {
|
||||
cnt++;
|
||||
}
|
||||
});
|
||||
|
||||
assert.isAtLeast(cnt, 4, 'there should be at least 4 ads in the results');
|
||||
|
||||
confirm_results_ok(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function confirm_results_ok(obj) {
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
Loading…
Reference in New Issue
Block a user