mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-01-12 23:58:11 +01:00
parsing ads works for duckduckgo, google, bing. tested.
This commit is contained in:
parent
bbebe3ce60
commit
a413cb54ef
BIN
debug_se_scraper_google_buy used car.png
Normal file
BIN
debug_se_scraper_google_buy used car.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 182 KiB |
Binary file not shown.
Before Width: | Height: | Size: 343 KiB After Width: | Height: | Size: 331 KiB |
@ -2,25 +2,19 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
|
||||
(async () => {
|
||||
let browser_config = {
|
||||
debug_level: 2,
|
||||
output_file: 'examples/results/data.json',
|
||||
debug_level: 1,
|
||||
test_evasion: false,
|
||||
headless: false,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
random_user_agent: false,
|
||||
log_http_headers: false,
|
||||
html_output: false,
|
||||
};
|
||||
|
||||
let scrape_job = {
|
||||
search_engine: 'google',
|
||||
keywords: ['cloud service'],
|
||||
search_engine: 'bing',
|
||||
keywords: ['auto verkaufen'],
|
||||
num_pages: 1,
|
||||
// add some cool google search settings
|
||||
google_settings: {
|
||||
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||
hl: 'en', // The hl parameter determines the Google UI language to return results.
|
||||
start: 0, // Determines the results offset to use, defaults to 0.
|
||||
num: 10, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
},
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
|
@ -1,124 +1,104 @@
|
||||
{
|
||||
"cloud service": {
|
||||
"buy used car": {
|
||||
"1": {
|
||||
"time": "Sat, 06 Jul 2019 19:33:03 GMT",
|
||||
"num_results": "About 2,720,000,000 results (0.53 seconds) ",
|
||||
"time": "Sun, 07 Jul 2019 16:04:09 GMT",
|
||||
"num_results": "About 5,330,000,000 results (0.65 seconds) ",
|
||||
"no_results": false,
|
||||
"effective_query": "",
|
||||
"top_ads": [
|
||||
"top_ads": [],
|
||||
"bottom_ads": [],
|
||||
"places": [
|
||||
{
|
||||
"ad_visible_url": "www.ibm.com/de-de/cloud",
|
||||
"ads_link": "/aclk?sa=l&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABAAGgJ3cw&sig=AOD64_0cI3jZ1rhFR8yEf5YtReD8f2PBlQ&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgXEAE&adurl=",
|
||||
"ads_link_target": "https://www.ibm.com/de-de/cloud/solutions",
|
||||
"title": "Die IBM Cloud | Mehr Sicherheit für Ihre Daten",
|
||||
"snippet": "Die IBM Cloud ist die Cloud für smarte Unternehmen. Warum erfahren Sie hier! Mit der IBM Cloud erschließen Sie Ihrem Unternehmen neue Umsatzströme aus Ihren Daten. Cloud-Migration."
|
||||
}
|
||||
],
|
||||
"bottom_ads": [
|
||||
{
|
||||
"ad_visible_url": "www.hpe.com/Cloud/Service",
|
||||
"ads_link": "/aclk?sa=L&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABADGgJ3cw&sig=AOD64_2VKnwqa309cs9KfVrY2KSK-J3T9w&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgYEAE&adurl=",
|
||||
"ads_link_target": "https://www.hpe.com/de/de/services.html",
|
||||
"title": "HPE Cloud Service | HPE besuchen und mehr erfahren",
|
||||
"snippet": "Hilfe beim Erstellen einer Roadmap abgestimmt auf Ihre Herausforderungen! Vereinfachter IT-Zyklus. Schnellere Innovationen. Optimierte Infrastruktur. Bestes Partner-Ökosystem. Dienstleistungen: Advisory Services, Professional Services, Operational Sevices, Cloud Services, Applications Services."
|
||||
"heading": "Approved Automotive",
|
||||
"rating": "2.7 (3) · Used car dealerClosed ⋅ Opens 8AM Mon",
|
||||
"contact": "Independence, KS · (620) 331-6223",
|
||||
"hours": "Closed ⋅ Opens 8AM Mon"
|
||||
},
|
||||
{
|
||||
"ad_visible_url": "w3.usa.siemens.com/",
|
||||
"ads_link": "/aclk?sa=l&ai=DChcSEwjJ8uy7hKHjAhUJhtUKHSqmA_MYABAGGgJ3cw&sig=AOD64_3tMHCSFikvffpErcgjGyDahhVwWQ&rct=j&q=&ved=2ahUKEwig4ea7hKHjAhVgTRUIHfv-CQUQ0Qx6BAgZEAE&adurl=",
|
||||
"ads_link_target": "https://w3.usa.siemens.com/buildingtechnologies/us/en/Smart_Buildings/digital-services/Pages/analytic-services.aspx?stc=ussi100083&sp_source=ussi100083",
|
||||
"title": "Analytic Services from Siemens | Making Your Building Smarter",
|
||||
"snippet": "Siemens Uses the Latest Analytical Tools Along with Cloud-Based Services to Identify. and Solve Potential Problems Before They Affect Your Entire Organization."
|
||||
"heading": "Romans Motor Company",
|
||||
"rating": "4.4 (38) · Chevrolet dealerClosed ⋅ Opens 8AM Mon\"Great place to get your car worked on.\" \"Great place to get your car worked on.\" ",
|
||||
"contact": "Independence, KS · (620) 331-4700\"Great place to get your car worked on.\" ",
|
||||
"hours": "Closed ⋅ Opens 8AM Mon"
|
||||
},
|
||||
{
|
||||
"heading": "Perl on Eleventh",
|
||||
"rating": "No reviews · Used car dealerClosed ⋅ Opens 9AM Mon",
|
||||
"contact": "Coffeyville, KS · (620) 251-4050",
|
||||
"hours": "Closed ⋅ Opens 9AM Mon"
|
||||
}
|
||||
],
|
||||
"places": [],
|
||||
"results": [
|
||||
{
|
||||
"link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"snippet": "",
|
||||
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"link": "https://www.cars.com/shopping/",
|
||||
"title": "Used Cars for Sale Online Near Me | Cars.comhttps://www.cars.com/shopping/Cached",
|
||||
"snippet": "Car-Buying Advice. First-timers and veterans shopping new or used cars: know what to remind yourself to do, ask and learn, from verifying the condition to ...",
|
||||
"visible_link": "https://www.cars.com/shopping/",
|
||||
"date": "",
|
||||
"rank": 1
|
||||
},
|
||||
{
|
||||
"link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"snippet": "",
|
||||
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"link": "https://www.carfax.com/cars-for-sale",
|
||||
"title": "Used Cars for Sale | with Free CARFAXhttps://www.carfax.com/cars-for-saleCachedSimilar",
|
||||
"snippet": "A FREE CARFAX report comes with every used car and truck for sale on Carfax.com. ... Find out how much a car is really worth before you buy it with the all-new ...",
|
||||
"visible_link": "https://www.carfax.com/cars-for-sale",
|
||||
"date": "",
|
||||
"rank": 2
|
||||
},
|
||||
{
|
||||
"link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"title": "What is Cloud Service? Webopedia Definitionhttps://www.webopedia.com/TERM/C/cloud_services.htmlCached",
|
||||
"snippet": "cloud service. A cloud service is any service made available to users on demand via the Internet from a cloud computing provider's servers as opposed to being provided from a company's own on-premises servers.",
|
||||
"visible_link": "https://www.webopedia.com/TERM/C/cloud_services.html",
|
||||
"link": "https://www.autotrader.com/",
|
||||
"title": "New Cars, Used Cars - Find Cars for Sale and Reviews at Autotraderhttps://www.autotrader.com/Cached",
|
||||
"snippet": "With millions of cars, finding your next new car or used car and the car ... Buying a car is a big deal -- and saving money is a crucial aspect of any new - or used ...",
|
||||
"visible_link": "https://www.autotrader.com/",
|
||||
"date": "",
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "https://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/",
|
||||
"title": "What is a Cloud Service? - Skyhigh Networkshttps://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/CachedSimilar",
|
||||
"snippet": "The cloud has been around for many years, yet there is still confusion as to what exactly is considered a cloud service. Read on for the definitive answer.",
|
||||
"visible_link": "https://www.skyhighnetworks.com/cloud-security-blog/what-is-a-cloud-service/",
|
||||
"link": "https://www.carmax.com/cars",
|
||||
"title": "Used Cars for Sale - CarMaxhttps://www.carmax.com/carsCachedSimilar",
|
||||
"snippet": "Search for new and used cars at carmax.com. Use our car ... 130 Best Used Cars for 2019: Ranked by Price and Type ... What is the best car to buy in 2019?",
|
||||
"visible_link": "https://www.carmax.com/cars",
|
||||
"date": "",
|
||||
"rank": 4
|
||||
},
|
||||
{
|
||||
"link": "https://searchitchannel.techtarget.com/definition/cloud-services",
|
||||
"title": "What is cloud services? - Definition from WhatIs.com - SearchITChannelhttps://searchitchannel.techtarget.com/definition/cloud-servicesCached",
|
||||
"snippet": "Dec 6, 2016 - Cloud services is an umbrella term that may refer to a variety of resources provided over the internet, or to professional services that support the ...",
|
||||
"visible_link": "https://searchitchannel.techtarget.com/definition/cloud-services",
|
||||
"date": "Dec 6, 2016 - ",
|
||||
"link": "https://www.enterprisecarsales.com/usedcars-buy",
|
||||
"title": "Buy Used Cars - Enterprise Car Saleshttps://www.enterprisecarsales.com/usedcars-buyCached",
|
||||
"snippet": "Looking to buy a used car? You know Enterprise for our exceptional customer service on rental cars, and you'll experience that same level of service when you ...",
|
||||
"visible_link": "https://www.enterprisecarsales.com/usedcars-buy",
|
||||
"date": "",
|
||||
"rank": 5
|
||||
},
|
||||
{
|
||||
"link": "https://azure.microsoft.com/en-us/overview/what-is-cloud-computing/",
|
||||
"title": "What Is Cloud Computing? A Beginner's Guide | Microsoft Azurehttps://azure.microsoft.com/en-us/overview/what-is-cloud-computing/CachedSimilar",
|
||||
"snippet": "Simply put, cloud computing is the delivery of computing services—including servers, storage, databases, networking, software, analytics, and intelligence—over ...",
|
||||
"visible_link": "https://azure.microsoft.com/en-us/overview/what-is-cloud-computing/",
|
||||
"link": "https://www.truecar.com/used-cars-for-sale/",
|
||||
"title": "Used Cars For Sale: 1,006,922 Used & Pre-Owned Cars | TrueCarhttps://www.truecar.com/used-cars-for-sale/CachedSimilar",
|
||||
"snippet": "Buy With Confidence. Get the best used car buying experience when you purchase from a TrueCar Certified Dealer who is dedicated to great service, and ...",
|
||||
"visible_link": "https://www.truecar.com/used-cars-for-sale/",
|
||||
"date": "",
|
||||
"rank": 6
|
||||
},
|
||||
{
|
||||
"link": "https://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbs",
|
||||
"title": "The 50 Best Cloud Services for SMBs | PCMag.comhttps://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbsCached",
|
||||
"snippet": "Oct 23, 2017 - To help you make better decisions about what to buy, we're listing the best cloud services for SMBs, covering topics from project management ...",
|
||||
"visible_link": "https://www.pcmag.com/article/345308/20-of-the-best-cloud-services-for-smbs",
|
||||
"date": "Oct 23, 2017 - ",
|
||||
"link": "https://www.truecar.com/",
|
||||
"title": "TrueCar: Car Prices, Owner Reviews & Inventory | New & Used Carshttps://www.truecar.com/CachedSimilar",
|
||||
"snippet": "Shop for new and used cars and trucks. ... Sam's Club, American Express and Chase, giving members who use TrueCar a superior car-buying experience.",
|
||||
"visible_link": "https://www.truecar.com/",
|
||||
"date": "",
|
||||
"rank": 7
|
||||
},
|
||||
{
|
||||
"link": "https://en.wikipedia.org/wiki/Cloud_computing",
|
||||
"title": "Cloud computing - Wikipediahttps://en.wikipedia.org/wiki/Cloud_computingCachedSimilar",
|
||||
"snippet": "Jump to Software as a service (SaaS) - In the software as a service (SaaS) model, users gain access to application software and databases. Cloud ...",
|
||||
"visible_link": "https://en.wikipedia.org/wiki/Cloud_computing",
|
||||
"date": "Jump to Software as a service (SaaS) - ",
|
||||
"link": "https://www.enterprisecarsales.com/list/buy-a-car-1",
|
||||
"title": "Buy Used Cars, Find Used Vehicles for Sale - Enterprise Car Saleshttps://www.enterprisecarsales.com/list/buy-a-car-1CachedSimilar",
|
||||
"snippet": "Browse our vehicle inventory to find reliable used cars for sale right now at Enterprise Car Sales.",
|
||||
"visible_link": "https://www.enterprisecarsales.com/list/buy-a-car-1",
|
||||
"date": "",
|
||||
"rank": 8
|
||||
},
|
||||
{
|
||||
"link": "https://www.techopedia.com/definition/29017/cloud-services",
|
||||
"title": "What are Cloud Services? - Definition from Techopediahttps://www.techopedia.com/definition/29017/cloud-servicesCachedSimilar",
|
||||
"snippet": "Cloud services refer to any IT services that are provisioned and accessed from a cloud computing provider. This is a broad term that incorporates all delivery and ...",
|
||||
"visible_link": "https://www.techopedia.com/definition/29017/cloud-services",
|
||||
"link": "https://www.edmunds.com/used-cars-for-sale/",
|
||||
"title": "Get the Best Deals on Used Cars For Sale Near You - Shop Used ...https://www.edmunds.com/used-cars-for-sale/CachedSimilar",
|
||||
"snippet": "Get the best prices on great used cars, trucks and SUVs for sale near you with Edmunds. We have over 5 million cheap ... Buy used with confidence on Edmunds ...",
|
||||
"visible_link": "https://www.edmunds.com/used-cars-for-sale/",
|
||||
"date": "",
|
||||
"rank": 9
|
||||
},
|
||||
{
|
||||
"link": "https://www.techradar.com/news/best-cloud-computing-service",
|
||||
"title": "Best cloud computing services of 2019 | TechRadarhttps://www.techradar.com/news/best-cloud-computing-serviceCached",
|
||||
"snippet": "4 days ago - Additionally, cloud services aren't simply about services or resources, but about providing fully fledged IT systems you can use as if you were ...",
|
||||
"visible_link": "https://www.techradar.com/news/best-cloud-computing-service",
|
||||
"date": "4 days ago - ",
|
||||
"rank": 10
|
||||
},
|
||||
{
|
||||
"link": "https://aws.amazon.com/what-is-cloud-computing/",
|
||||
"title": "What is Cloud Computing - Amazon Web Serviceshttps://aws.amazon.com/what-is-cloud-computing/CachedSimilar",
|
||||
"snippet": "Whether you are using it to run applications that share photos to millions of mobile users or to support business critical operations, a cloud services platform ...",
|
||||
"visible_link": "https://aws.amazon.com/what-is-cloud-computing/",
|
||||
"date": "",
|
||||
"rank": 11
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.3.13",
|
||||
"version": "1.3.14",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -22,9 +22,9 @@ class BingScraper extends Scraper {
|
||||
const ads = [];
|
||||
$('.b_ad .sb_add').each((i, element) => {
|
||||
ads.push({
|
||||
ad_visible_url: $(element).find('.b_adurl cite').text(),
|
||||
ads_link: $(element).find('h2 a').attr('href'),
|
||||
ads_link_target: $(element).find('h2 link').attr('href'),
|
||||
visible_link: $(element).find('.b_adurl cite').text(),
|
||||
tracking_link: $(element).find('h2 a').attr('href'),
|
||||
link: $(element).find('link').attr('href'),
|
||||
title: $(element).find('h2 a').text(),
|
||||
snippet: $(element).find('.b_caption').text(),
|
||||
})
|
||||
|
@ -9,7 +9,7 @@ class DuckduckgoScraper extends Scraper {
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result__body').each((i, link) => {
|
||||
$('#links .result__body').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
@ -22,8 +22,8 @@ class DuckduckgoScraper extends Scraper {
|
||||
const ads = [];
|
||||
$('.results--ads.has-ad').each((i, element) => {
|
||||
ads.push({
|
||||
ad_visible_url: $(element).find('.result__url').text(),
|
||||
ads_link: $(element).find('.result__title .result__a').attr('href'),
|
||||
visible_link: $(element).find('.result__url').text(),
|
||||
tracking_link: $(element).find('.result__title .result__a').attr('href'),
|
||||
title: $(element).find('.result__title .result__a').text(),
|
||||
snippet: $(element).find('.result__snippet').text(),
|
||||
})
|
||||
|
@ -24,29 +24,33 @@ class GoogleScraper extends Scraper {
|
||||
})
|
||||
});
|
||||
|
||||
// parse top ads
|
||||
const top_ads = [];
|
||||
$('#tads .ads-ad').each((i, element) => {
|
||||
top_ads.push({
|
||||
ad_visible_url: $(element).find('.ads-visurl cite').text(),
|
||||
ads_link: $(element).find('a:first-child').attr('href'),
|
||||
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
|
||||
// parse ads
|
||||
let parseAds = (storage, selector) => {
|
||||
$(selector).each((i, element) => {
|
||||
let obj = {
|
||||
visible_link: $(element).find('.ads-visurl cite').text(),
|
||||
tracking_link: $(element).find('a:first-child').attr('href'),
|
||||
link: $(element).find('a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('a h3').text(),
|
||||
snippet: $(element).find('.ads-creative').text(),
|
||||
links: [],
|
||||
};
|
||||
$(element).find('ul li a').each((i, el) => {
|
||||
obj.links.push({
|
||||
tracking_link: $(el).attr('data-arwt'),
|
||||
link: $(el).attr('href'),
|
||||
title: $(el).text(),
|
||||
})
|
||||
});
|
||||
storage.push(obj);
|
||||
});
|
||||
};
|
||||
|
||||
// parse bottom ads
|
||||
const top_ads = [];
|
||||
const bottomads = [];
|
||||
$('#tadsb .ads-ad').each((i, element) => {
|
||||
bottomads.push({
|
||||
ad_visible_url: $(element).find('.ads-visurl cite').text(),
|
||||
ads_link: $(element).find('a:first-child').attr('href'),
|
||||
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('a h3').text(),
|
||||
snippet: $(element).find('.ads-creative').text(),
|
||||
})
|
||||
});
|
||||
|
||||
parseAds(top_ads, '#tads .ads-ad');
|
||||
parseAds(bottomads, '#tadsb .ads-ad');
|
||||
|
||||
// parse google places
|
||||
const places = [];
|
||||
@ -143,7 +147,7 @@ class GoogleScraper extends Scraper {
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#fbarcnt', { timeout: this.STANDARD_TIMEOUT });
|
||||
await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
|
@ -116,6 +116,7 @@ module.exports = class Scraper {
|
||||
|
||||
if (this.config.log_http_headers === true) {
|
||||
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
||||
log(this.config, 1, this.metadata.http_headers);
|
||||
}
|
||||
|
||||
if (this.config.log_ip_address === true) {
|
||||
|
@ -233,21 +233,21 @@ function test_case_ads_test(response) {
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP objects');
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.ads_link, 'link must be ok');
|
||||
assert.typeOf(res.ads_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars');
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ads_link_target, 'link must be ok');
|
||||
assert.typeOf(res.ads_link_target, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars');
|
||||
// assert.isOk(res.link, 'link must be ok');
|
||||
// assert.typeOf(res.link, 'string', 'link must be string');
|
||||
// assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ad_visible_url, 'visible_link must be ok');
|
||||
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
@ -257,14 +257,10 @@ function test_case_ads_test(response) {
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
describe('Bing', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search', normal_search_test);
|
||||
|
@ -1,10 +1,7 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
var assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
|
||||
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
|
||||
|
||||
@ -12,12 +9,9 @@ async function normal_search_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
@ -83,12 +77,9 @@ async function effective_query_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
keyword_file: '',
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
@ -129,7 +120,72 @@ function test_case_effective_query(response) {
|
||||
}
|
||||
}
|
||||
|
||||
(async () => {
|
||||
await normal_search_test();
|
||||
await effective_query_test();
|
||||
})();
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
|
||||
async function ads_test() {
|
||||
let config = {
|
||||
compress: false,
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
search_engine: 'duckduckgo',
|
||||
keywords: ads_keywords,
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
console.log('ads_test()');
|
||||
test_case_ads_test( await se_scraper.scrape(config, scrape_config) );
|
||||
}
|
||||
|
||||
function test_case_ads_test(response) {
|
||||
assert.equal(response.metadata.num_requests, 2);
|
||||
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||
|
||||
for (let page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time', 'effective_query', 'ads'], 'not all keys are in the object');
|
||||
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');
|
||||
|
||||
for (let res of obj.ads) {
|
||||
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
describe('Duckduckgo', function(){
|
||||
this.timeout(30000);
|
||||
it('normal search', normal_search_test);
|
||||
it('effective query', effective_query_test);
|
||||
it('finds ads', ads_test);
|
||||
});
|
@ -221,7 +221,7 @@ function check_html_output_test_case( response ) {
|
||||
}
|
||||
}
|
||||
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
const ads_keywords = ['cloud services', 'auto kaufen'];
|
||||
|
||||
async function ads_test() {
|
||||
let config = {
|
||||
@ -229,7 +229,7 @@ async function ads_test() {
|
||||
debug_level: 1,
|
||||
headless: true,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
random_user_agent: false, // dont try to trick google with ads
|
||||
};
|
||||
|
||||
let scrape_config = {
|
||||
@ -263,22 +263,21 @@ function test_case_ads_test(response) {
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
assert.isAtLeast(obj.top_ads.length, 1, 'top_ads must have at least 1 SERP object');
|
||||
assert.isAtLeast(obj.bottom_ads.length, 1, 'bottom_ads must have at least 1 SERP object');
|
||||
assert(obj.top_ads.length >= 1 || obj.bottom_ads.length >= 1, 'top_ads or bottom_ads must have at least 1 SERP object');
|
||||
|
||||
for (let res of obj.top_ads) {
|
||||
|
||||
assert.isOk(res.ads_link, 'link must be ok');
|
||||
assert.typeOf(res.ads_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars');
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ads_link_target, 'link must be ok');
|
||||
assert.typeOf(res.ads_link_target, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars');
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ad_visible_url, 'visible_link must be ok');
|
||||
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
@ -287,21 +286,22 @@ function test_case_ads_test(response) {
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'snippet must be array');
|
||||
}
|
||||
|
||||
for (let res of obj.bottom_ads) {
|
||||
assert.isOk(res.tracking_link, 'link must be ok');
|
||||
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ads_link, 'link must be ok');
|
||||
assert.typeOf(res.ads_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars');
|
||||
assert.isOk(res.visible_link, 'link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ads_link_target, 'link must be ok');
|
||||
assert.typeOf(res.ads_link_target, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.ads_link_target.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.ad_visible_url, 'visible_link must be ok');
|
||||
assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');
|
||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
@ -310,6 +310,8 @@ function test_case_ads_test(response) {
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.typeOf(res.links, 'array', 'snippet must be array');
|
||||
}
|
||||
|
||||
}
|
||||
@ -322,5 +324,5 @@ describe('Google', function(){
|
||||
it('no results', no_results_test);
|
||||
it('effective query', effective_query_test);
|
||||
it('html output query', html_output_query_test);
|
||||
it('finds ads', ads_test);
|
||||
it('ads', ads_test);
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user