Parsing ads now works for DuckDuckGo, Google, and Bing. Tested.

This commit is contained in:
Nikolai Tschacher 2019-07-07 19:38:28 +02:00
parent bbebe3ce60
commit a413cb54ef
12 changed files with 211 additions and 178 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 343 KiB

After

Width:  |  Height:  |  Size: 331 KiB

View File

@ -2,25 +2,19 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => { (async () => {
let browser_config = { let browser_config = {
debug_level: 2, debug_level: 1,
output_file: 'examples/results/data.json',
test_evasion: false, test_evasion: false,
headless: false, headless: true,
block_assets: false, block_assets: false,
random_user_agent: true, random_user_agent: false,
log_http_headers: false,
html_output: false,
}; };
let scrape_job = { let scrape_job = {
search_engine: 'google', search_engine: 'bing',
keywords: ['cloud service'], keywords: ['auto verkaufen'],
num_pages: 1, num_pages: 1,
// add some cool google search settings
google_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'en', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 10, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
}; };
var scraper = new se_scraper.ScrapeManager(browser_config); var scraper = new se_scraper.ScrapeManager(browser_config);

View File

@ -1,124 +1,104 @@
{ {
"cloud service": { "buy used car": {
"1": { "1": {
"time": "Sat, 06 Jul 2019 19:33:03 GMT", "time": "Sun, 07 Jul 2019 16:04:09 GMT",
"num_results": "About 2,720,000,000 results (0.53 seconds) ", "num_results": "About 5,330,000,000 results (0.65 seconds) ",
"no_results": false, "no_results": false,
"effective_query": "", "effective_query": "",
"top_ads": [ "top_ads": [],
"bottom_ads": [],
"places": [
{ {
"ad_visible_url": "www.ibm.com/de-de/cloud", "heading": "Approved Automotive",
"ads_link": "/aclk?sa=l&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABAAGgJ3cw&sig=AOD64_0cI3jZ1rhFR8yEf5YtReD8f2PBlQ&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgXEAE&adurl=", "rating": "2.7 (3) · Used car dealerClosed ⋅ Opens 8AM Mon",
"ads_link_target": "https://www.ibm.com/de-de/cloud/solutions", "contact": "Independence, KS · (620) 331-6223",
"title": "Die IBM Cloud | Mehr Sicherheit für Ihre Daten", "hours": "Closed ⋅ Opens 8AM Mon"
"snippet": "Die IBM Cloud ist die Cloud für smarte Unternehmen. Warum erfahren Sie hier! Mit der IBM Cloud erschließen Sie Ihrem Unternehmen neue Umsatzströme aus Ihren Daten. Cloud-Migration."
}
],
"bottom_ads": [
{
"ad_visible_url": "www.hpe.com/Cloud/Service",
"ads_link": "/aclk?sa=L&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABADGgJ3cw&sig=AOD64_2VKnwqa309cs9KfVrY2KSK-J3T9w&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgYEAE&adurl=",
"ads_link_target": "https://www.hpe.com/de/de/services.html",
"title": "HPE Cloud Service | HPE besuchen und mehr erfahren",
"snippet": "Hilfe beim Erstellen einer Roadmap abgestimmt auf Ihre Herausforderungen! Vereinfachter IT-Zyklus. Schnellere Innovationen. Optimierte Infrastruktur. Bestes Partner-Ökosystem. Dienstleistungen: Advisory Services, Professional Services, Operational Sevices, Cloud Services, Applications Services."
}, },
{ {
"ad_visible_url": "w3.usa.siemens.com/", "heading": "Romans Motor Company",
"ads_link": "/aclk?sa=l&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABAGGgJ3cw&sig=AOD64_3tMHCSFikvffpErcgjGyDahhVwWQ&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgZEAE&adurl=", "rating": "4.4 (38) · Chevrolet dealerClosed ⋅ Opens 8AM Mon\"Great place to get your car worked on.\" \"Great place to get your car worked on.\" ",
"ads_link_target": "https://w3.usa.siemens.com/buildingtechnologies/us/en/Smart_Buildings/digital-services/Pages/analytic-services.aspx?stc=ussi100083&sp_source=ussi100083", "contact": "Independence, KS · (620) 331-4700\"Great place to get your car worked on.\" ",
"title": "Analytic Services from Siemens | Making Your Building Smarter", "hours": "Closed ⋅ Opens 8AM Mon"
"snippet": "Siemens Uses the Latest Analytical Tools Along with Cloud-Based Services to Identify. and Solve Potential Problems Before They Affect Your Entire Organization." },
{
"heading": "Perl on Eleventh",
"rating": "No reviews · Used car dealerClosed ⋅ Opens 9AM Mon",
"contact": "Coffeyville, KS · (620) 251-4050",
"hours": "Closed ⋅ Opens 9AM Mon"
} }
], ],
"places": [],
"results": [ "results": [
{ {
"link": "https://www.webopedia.com/TERM/C/cloud_services.html", "link": "https://www.cars.com/shopping/",
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.html", "title": "Used Cars for Sale Online Near Me | Cars.comhttps://www.cars.com/shopping/Cached",
"snippet": "", "snippet": "Car-Buying Advice. First-timers and veterans shopping new or used cars: know what to remind yourself to do, ask and learn, from verifying the condition to ...",
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html", "visible_link": "https://www.cars.com/shopping/",
"date": "", "date": "",
"rank": 1 "rank": 1
}, },
{ {
"link": "https://www.webopedia.com/TERM/C/cloud_services.html", "link": "https://www.carfax.com/cars-for-sale",
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.html", "title": "Used Cars for Sale | with Free CARFAXhttps://www.carfax.com/cars-for-saleCachedSimilar",
"snippet": "", "snippet": "A FREE CARFAX report comes with every used car and truck for sale on Carfax.com. ... Find out how much a car is really worth before you buy it with the all-new ...",
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html", "visible_link": "https://www.carfax.com/cars-for-sale",
"date": "", "date": "",
"rank": 2 "rank": 2
}, },
{ {
"link": "https://www.webopedia.com/TERM/C/cloud_services.html", "link": "https://www.autotrader.com/",
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.htmlCached", "title": "New Cars, Used Cars - Find Cars for Sale and Reviews at Autotraderhttps://www.autotrader.com/Cached",
"snippet": "cloud service. A cloud service is any service made available to users on demand via the Internet from a cloud computing provider's servers as opposed to being provided from a company's own on-premises servers.", "snippet": "With millions of cars, finding your next new car or used car and the car ... Buying a car is a big deal -- and saving money is a crucial aspect of any new - or used ...",
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html", "visible_link": "https://www.autotrader.com/",
"date": "", "date": "",
"rank": 3 "rank": 3
}, },
{ {
"link": "https://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/", "link": "https://www.carmax.com/cars",
"title": "What is a Cloud Service? - Skyhigh Networkshttps://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/CachedSimilar", "title": "Used Cars for Sale - CarMaxhttps://www.carmax.com/carsCachedSimilar",
"snippet": "The cloud has been around for many years, yet there is still confusion as to what exactly is considered a cloud service. Read on for the definitive answer.", "snippet": "Search for new and used cars at carmax.com. Use our car ... 130 Best Used Cars for 2019: Ranked by Price and Type ... What is the best car to buy in 2019?",
"visible_link": "https://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/", "visible_link": "https://www.carmax.com/cars",
"date": "", "date": "",
"rank": 4 "rank": 4
}, },
{ {
"link": "https://searchitchannel.techtarget.com/definition/cloud-services", "link": "https://www.enterprisecarsales.com/usedcars-buy",
"title": "What is cloud services? - Definition from WhatIs.com - SearchITChannelhttps://searchitchannel.techtarget.com/definition/cloud-servicesCached", "title": "Buy Used Cars - Enterprise Car Saleshttps://www.enterprisecarsales.com/usedcars-buyCached",
"snippet": "Dec 6, 2016 - Cloud services is an umbrella term that may refer to a variety of resources provided over the internet, or to professional services that support the ...", "snippet": "Looking to buy a used car? You know Enterprise for our exceptional customer service on rental cars, and you'll experience that same level of service when you ...",
"visible_link": "https://searchitchannel.techtarget.com/definition/cloud-services", "visible_link": "https://www.enterprisecarsales.com/usedcars-buy",
"date": "Dec 6, 2016 - ", "date": "",
"rank": 5 "rank": 5
}, },
{ {
"link": "https://azure.microsoft.com/en-us/overview/what-is-cloud-computing/", "link": "https://www.truecar.com/used-cars-for-sale/",
"title": "What Is Cloud Computing? A Beginner's Guide | Microsoft Azurehttps://azure.microsoft.com/en-us/overview/what-is-cloud-computing/CachedSimilar", "title": "Used Cars For Sale: 1,006,922 Used & Pre-Owned Cars | TrueCarhttps://www.truecar.com/used-cars-for-sale/CachedSimilar",
"snippet": "Simply put, cloud computing is the delivery of computing services—including servers, storage, databases, networking, software, analytics, and intelligence—over ...", "snippet": "Buy With Confidence. Get the best used car buying experience when you purchase from a TrueCar Certified Dealer who is dedicated to great service, and ...",
"visible_link": "https://azure.microsoft.com/en-us/overview/what-is-cloud-computing/", "visible_link": "https://www.truecar.com/used-cars-for-sale/",
"date": "", "date": "",
"rank": 6 "rank": 6
}, },
{ {
"link": "https://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbs", "link": "https://www.truecar.com/",
"title": "The 50 Best Cloud Services for SMBs | PCMag.comhttps://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbsCached", "title": "TrueCar: Car Prices, Owner Reviews & Inventory | New & Used Carshttps://www.truecar.com/CachedSimilar",
"snippet": "Oct 23, 2017 - To help you make better decisions about what to buy, we're listing the best cloud services for SMBs, covering topics from project management ...", "snippet": "Shop for new and used cars and trucks. ... Sam's Club, American Express and Chase, giving members who use TrueCar a superior car-buying experience.",
"visible_link": "https://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbs", "visible_link": "https://www.truecar.com/",
"date": "Oct 23, 2017 - ", "date": "",
"rank": 7 "rank": 7
}, },
{ {
"link": "https://en.wikipedia.org/wiki/Cloud_computing", "link": "https://www.enterprisecarsales.com/list/buy-a-car-1",
"title": "Cloud computing - Wikipediahttps://en.wikipedia.org/wiki/Cloud_computingCachedSimilar", "title": "Buy Used Cars, Find Used Vehicles for Sale - Enterprise Car Saleshttps://www.enterprisecarsales.com/list/buy-a-car-1CachedSimilar",
"snippet": "Jump to Software as a service (SaaS) - In the software as a service (SaaS) model, users gain access to application software and databases. Cloud ...", "snippet": "Browse our vehicle inventory to find reliable used cars for sale right now at Enterprise Car Sales.",
"visible_link": "https://en.wikipedia.org/wiki/Cloud_computing", "visible_link": "https://www.enterprisecarsales.com/list/buy-a-car-1",
"date": "Jump to Software as a service (SaaS) - ", "date": "",
"rank": 8 "rank": 8
}, },
{ {
"link": "https://www.techopedia.com/definition/29017/cloud-services", "link": "https://www.edmunds.com/used-cars-for-sale/",
"title": "What are Cloud Services? - Definition from Techopediahttps://www.techopedia.com/definition/29017/cloud-servicesCachedSimilar", "title": "Get the Best Deals on Used Cars For Sale Near You - Shop Used ...https://www.edmunds.com/used-cars-for-sale/CachedSimilar",
"snippet": "Cloud services refer to any IT services that are provisioned and accessed from a cloud computing provider. This is a broad term that incorporates all delivery and ...", "snippet": "Get the best prices on great used cars, trucks and SUVs for sale near you with Edmunds. We have over 5 million cheap ... Buy used with confidence on Edmunds ...",
"visible_link": "https://www.techopedia.com/definition/29017/cloud-services", "visible_link": "https://www.edmunds.com/used-cars-for-sale/",
"date": "", "date": "",
"rank": 9 "rank": 9
},
{
"link": "https://www.techradar.com/news/best-cloud-computing-service",
"title": "Best cloud computing services of 2019 | TechRadarhttps://www.techradar.com/news/best-cloud-computing-serviceCached",
"snippet": "4 days ago - Additionally, cloud services aren't simply about services or resources, but about providing fully fledged IT systems you can use as if you were ...",
"visible_link": "https://www.techradar.com/news/best-cloud-computing-service",
"date": "4 days ago - ",
"rank": 10
},
{
"link": "https://aws.amazon.com/what-is-cloud-computing/",
"title": "What is Cloud Computing - Amazon Web Serviceshttps://aws.amazon.com/what-is-cloud-computing/CachedSimilar",
"snippet": "Whether you are using it to run applications that share photos to millions of mobile users or to support business critical operations, a cloud services platform ...",
"visible_link": "https://aws.amazon.com/what-is-cloud-computing/",
"date": "",
"rank": 11
} }
] ]
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.3.13", "version": "1.3.14",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",

View File

@ -22,9 +22,9 @@ class BingScraper extends Scraper {
const ads = []; const ads = [];
$('.b_ad .sb_add').each((i, element) => { $('.b_ad .sb_add').each((i, element) => {
ads.push({ ads.push({
ad_visible_url: $(element).find('.b_adurl cite').text(), visible_link: $(element).find('.b_adurl cite').text(),
ads_link: $(element).find('h2 a').attr('href'), tracking_link: $(element).find('h2 a').attr('href'),
ads_link_target: $(element).find('h2 link').attr('href'), link: $(element).find('link').attr('href'),
title: $(element).find('h2 a').text(), title: $(element).find('h2 a').text(),
snippet: $(element).find('.b_caption').text(), snippet: $(element).find('.b_caption').text(),
}) })

View File

@ -9,7 +9,7 @@ class DuckduckgoScraper extends Scraper {
// perform queries // perform queries
const results = []; const results = [];
$('.result__body').each((i, link) => { $('#links .result__body').each((i, link) => {
results.push({ results.push({
link: $(link).find('.result__title .result__a').attr('href'), link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(), title: $(link).find('.result__title .result__a').text(),
@ -22,8 +22,8 @@ class DuckduckgoScraper extends Scraper {
const ads = []; const ads = [];
$('.results--ads.has-ad').each((i, element) => { $('.results--ads.has-ad').each((i, element) => {
ads.push({ ads.push({
ad_visible_url: $(element).find('.result__url').text(), visible_link: $(element).find('.result__url').text(),
ads_link: $(element).find('.result__title .result__a').attr('href'), tracking_link: $(element).find('.result__title .result__a').attr('href'),
title: $(element).find('.result__title .result__a').text(), title: $(element).find('.result__title .result__a').text(),
snippet: $(element).find('.result__snippet').text(), snippet: $(element).find('.result__snippet').text(),
}) })

View File

@ -24,29 +24,33 @@ class GoogleScraper extends Scraper {
}) })
}); });
// parse top ads // parse ads
const top_ads = []; let parseAds = (storage, selector) => {
$('#tads .ads-ad').each((i, element) => { $(selector).each((i, element) => {
top_ads.push({ let obj = {
ad_visible_url: $(element).find('.ads-visurl cite').text(), visible_link: $(element).find('.ads-visurl cite').text(),
ads_link: $(element).find('a:first-child').attr('href'), tracking_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'), link: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(), title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(), snippet: $(element).find('.ads-creative').text(),
links: [],
};
$(element).find('ul li a').each((i, el) => {
obj.links.push({
tracking_link: $(el).attr('data-arwt'),
link: $(el).attr('href'),
title: $(el).text(),
}) })
}); });
storage.push(obj);
});
};
// parse bottom ads const top_ads = [];
const bottomads = []; const bottomads = [];
$('#tadsb .ads-ad').each((i, element) => {
bottomads.push({ parseAds(top_ads, '#tads .ads-ad');
ad_visible_url: $(element).find('.ads-visurl cite').text(), parseAds(bottomads, '#tadsb .ads-ad');
ads_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
})
});
// parse google places // parse google places
const places = []; const places = [];
@ -143,7 +147,7 @@ class GoogleScraper extends Scraper {
} }
async wait_for_results() { async wait_for_results() {
await this.page.waitForSelector('#fbarcnt', { timeout: this.STANDARD_TIMEOUT }); await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
} }
async detected() { async detected() {

View File

@ -116,6 +116,7 @@ module.exports = class Scraper {
if (this.config.log_http_headers === true) { if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page); this.metadata.http_headers = await meta.get_http_headers(this.page);
log(this.config, 1, this.metadata.http_headers);
} }
if (this.config.log_ip_address === true) { if (this.config.log_ip_address === true) {

View File

@ -233,21 +233,21 @@ function test_case_ads_test(response) {
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object'); assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
for (let res of obj.ads) { for (let res of obj.ads) {
assert.isOk(res.ads_link, 'link must be ok'); assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.ads_link, 'string', 'link must be string'); assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars'); assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ads_link_target, 'link must be ok'); // assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.ads_link_target, 'string', 'link must be string'); // assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars'); // assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ad_visible_url, 'visible_link must be ok'); assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string'); assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars'); assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok'); assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string'); assert.typeOf(res.title, 'string', 'title must be string');
@ -257,14 +257,10 @@ function test_case_ads_test(response) {
assert.typeOf(res.snippet, 'string', 'snippet must be string'); assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
} }
} }
} }
} }
describe('Bing', function(){ describe('Bing', function(){
this.timeout(30000); this.timeout(30000);
it('normal search', normal_search_test); it('normal search', normal_search_test);

View File

@ -1,10 +1,7 @@
const se_scraper = require('./../index.js'); const se_scraper = require('./../index.js');
var assert = require('chai').assert; const chai = require('chai');
chai.use(require('chai-string'));
/* const assert = chai.assert;
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow']; const normal_search_keywords = ['apple tree', 'weather tomorrow'];
@ -12,12 +9,9 @@ async function normal_search_test() {
let config = { let config = {
compress: false, compress: false,
debug_level: 1, debug_level: 1,
keyword_file: '',
headless: true, headless: true,
output_file: '', block_assets: false,
block_assets: true, random_user_agent: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
}; };
let scrape_config = { let scrape_config = {
@ -83,12 +77,9 @@ async function effective_query_test() {
let config = { let config = {
compress: false, compress: false,
debug_level: 1, debug_level: 1,
keyword_file: '',
headless: true, headless: true,
output_file: '',
block_assets: true, block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', random_user_agent: true,
random_user_agent: false,
}; };
let scrape_config = { let scrape_config = {
@ -129,7 +120,72 @@ function test_case_effective_query(response) {
} }
} }
(async () => { const ads_keywords = ['cloud services', 'buy shoes'];
await normal_search_test();
await effective_query_test(); async function ads_test() {
})(); let config = {
compress: false,
debug_level: 1,
headless: true,
block_assets: false,
random_user_agent: false,
};
let scrape_config = {
search_engine: 'duckduckgo',
keywords: ads_keywords,
num_pages: 1,
};
console.log('ads_test()');
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
}
/**
 * Validates the scrape response produced for the ads keywords.
 *
 * Checks that exactly two requests were made, that every keyword in
 * `ads_keywords` is present in the results, and that every scraped page
 * carries organic results, a parseable timestamp and at least two
 * well-formed ad objects.
 *
 * @param {object} response - Object with `metadata.num_requests` and a
 *     `results` map of keyword -> page number -> page object.
 * @throws {Error} An assertion error as soon as any check fails.
 */
function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    // Checked once, before iterating: inside the loop this assertion was
    // silently skipped whenever `response.results` was empty.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    // [property name, label used in the assertion messages, minimum length]
    const required_ad_fields = [
        ['tracking_link', 'link', 5],
        ['visible_link', 'visible_link', 5],
        ['title', 'title', 10],
        ['snippet', 'snippet', 10],
    ];

    for (const pages of Object.values(response.results)) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() yields NaN for non-numeric keys and NaN is still of
            // type "number", so test for NaN explicitly (and pass the radix).
            assert.isNotNaN(Number.parseInt(page_number, 10), 'page_number must be numeric');

            assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');
            assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');

            // Date.parse() returns NaN for an unparseable string; a typeOf
            // 'number' check would accept that, so reject NaN directly.
            assert.isNotNaN(Date.parse(obj.time), 'time should be a valid date');

            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');

            for (const res of obj.ads) {
                for (const [field, label, min_length] of required_ad_fields) {
                    assert.isOk(res[field], `${label} must be ok`);
                    assert.typeOf(res[field], 'string', `${label} must be string`);
                    assert.isAtLeast(res[field].length, min_length, `${label} must have at least ${min_length} chars`);
                }
            }
        }
    }
}
// Mocha suite for the Duckduckgo scraper; each entry below registers one test case.
describe('Duckduckgo', function () {
    // Kept as a regular function (not an arrow) because the timeout is set via `this`.
    this.timeout(30000);

    const test_cases = [
        ['normal search', normal_search_test],
        ['effective query', effective_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, run] of test_cases) {
        it(title, run);
    }
});

View File

@ -221,7 +221,7 @@ function check_html_output_test_case( response ) {
} }
} }
const ads_keywords = ['cloud services', 'buy shoes']; const ads_keywords = ['cloud services', 'auto kaufen'];
async function ads_test() { async function ads_test() {
let config = { let config = {
@ -229,7 +229,7 @@ async function ads_test() {
debug_level: 1, debug_level: 1,
headless: true, headless: true,
block_assets: false, block_assets: false,
random_user_agent: true, random_user_agent: false, // dont try to trick google with ads
}; };
let scrape_config = { let scrape_config = {
@ -263,22 +263,21 @@ function test_case_ads_test(response) {
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
assert.isAtLeast(obj.top_ads.length, 1, 'top_ads must have at least 1 SERP object'); assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');
assert.isAtLeast(obj.bottom_ads.length, 1, 'bottom_ads must have at least 1 SERP object');
for (let res of obj.top_ads) { for (let res of obj.top_ads) {
assert.isOk(res.ads_link, 'link must be ok'); assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.ads_link, 'string', 'link must be string'); assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars'); assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ads_link_target, 'link must be ok'); assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.ads_link_target, 'string', 'link must be string'); assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars'); assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ad_visible_url, 'visible_link must be ok'); assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string'); assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars'); assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok'); assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string'); assert.typeOf(res.title, 'string', 'title must be string');
@ -287,21 +286,22 @@ function test_case_ads_test(response) {
assert.isOk(res.snippet, 'snippet must be ok'); assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string'); assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'snippet must be array');
} }
for (let res of obj.bottom_ads) { for (let res of obj.bottom_ads) {
assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ads_link, 'link must be ok'); assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.ads_link, 'string', 'link must be string'); assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars'); assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.ads_link_target, 'link must be ok'); assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.ads_link_target, 'string', 'link must be string'); assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars'); assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.ad_visible_url, 'visible_link must be ok');
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok'); assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string'); assert.typeOf(res.title, 'string', 'title must be string');
@ -310,6 +310,8 @@ function test_case_ads_test(response) {
assert.isOk(res.snippet, 'snippet must be ok'); assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string'); assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.typeOf(res.links, 'array', 'snippet must be array');
} }
} }
@ -322,5 +324,5 @@ describe('Google', function(){
it('no results', no_results_test); it('no results', no_results_test);
it('effective query', effective_query_test); it('effective query', effective_query_test);
it('html output query', html_output_query_test); it('html output query', html_output_query_test);
it('finds ads', ads_test); it('ads', ads_test);
}); });