Parsing ads is now supported for Google, Bing and DuckDuckGo

This commit is contained in:
Nikolai Tschacher 2019-07-06 21:42:13 +02:00
parent 09c1255400
commit bbebe3ce60
14 changed files with 339 additions and 1568 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 307 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 343 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 92 KiB

After

Width:  |  Height:  |  Size: 91 KiB

View File

@ -4,19 +4,22 @@ const se_scraper = require('./../src/node_scraper.js');
let browser_config = {
debug_level: 2,
output_file: 'examples/results/data.json',
test_evasion: true,
test_evasion: false,
headless: false,
block_assets: false,
random_user_agent: true,
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
keywords: ['cloud service'],
num_pages: 1,
// add some cool google search settings
google_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'en', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
num: 10, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
};

File diff suppressed because it is too large Load Diff

BIN
headless-test-result.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.3.12",
"version": "1.3.13",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

View File

@ -18,6 +18,18 @@ class BingScraper extends Scraper {
})
});
// parse bing ads
const ads = [];
$('.b_ad .sb_add').each((i, element) => {
ads.push({
ad_visible_url: $(element).find('.b_adurl cite').text(),
ads_link: $(element).find('h2 a').attr('href'),
ads_link_target: $(element).find('h2 link').attr('href'),
title: $(element).find('h2 a').text(),
snippet: $(element).find('.b_caption').text(),
})
});
// 'Including results for', 'Einschließlich Ergebnisse'
let no_results = this.no_results(
['There are no results', 'Es gibt keine Ergebnisse'],
@ -41,6 +53,7 @@ class BingScraper extends Scraper {
effective_query: effective_query,
num_results: $('#b_content .sb_count').text(),
results: cleaned,
ads: ads,
}
}

View File

@ -19,6 +19,16 @@ class DuckduckgoScraper extends Scraper {
});
});
const ads = [];
$('.results--ads.has-ad').each((i, element) => {
ads.push({
ad_visible_url: $(element).find('.result__url').text(),
ads_link: $(element).find('.result__title .result__a').attr('href'),
title: $(element).find('.result__title .result__a').text(),
snippet: $(element).find('.result__snippet').text(),
})
});
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
const cleaned = [];
@ -33,7 +43,8 @@ class DuckduckgoScraper extends Scraper {
return {
time: (new Date()).toUTCString(),
effective_query: effective_query,
results: cleaned
results: cleaned,
ads: ads,
}
}

View File

@ -13,7 +13,6 @@ class GoogleScraper extends Scraper {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#center_col .g').each((i, link) => {
results.push({
@ -25,6 +24,41 @@ class GoogleScraper extends Scraper {
})
});
// parse top ads
const top_ads = [];
$('#tads .ads-ad').each((i, element) => {
top_ads.push({
ad_visible_url: $(element).find('.ads-visurl cite').text(),
ads_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
})
});
// parse bottom ads
const bottomads = [];
$('#tadsb .ads-ad').each((i, element) => {
bottomads.push({
ad_visible_url: $(element).find('.ads-visurl cite').text(),
ads_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
})
});
// parse google places
const places = [];
$('.rllt__link').each((i, element) => {
places.push({
heading: $(element).find('[role="heading"] span').text(),
rating: $(element).find('.rllt__details div:first-child').text(),
contact: $(element).find('.rllt__details div:nth-child(2)').text(),
hours: $(element).find('.rllt__details div:nth-child(3)').text(),
})
});
// 'Ergebnisse für', 'Showing results for'
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
@ -51,8 +85,12 @@ class GoogleScraper extends Scraper {
num_results: $('#resultStats').text(),
no_results: no_results,
effective_query: effective_query,
results: cleaned
top_ads: top_ads,
bottom_ads: bottomads,
places: places,
results: cleaned,
}
}
async load_start_page() {
@ -105,7 +143,7 @@ class GoogleScraper extends Scraper {
}
async wait_for_results() {
await this.page.waitForSelector('#center_col .g', { timeout: this.STANDARD_TIMEOUT });
await this.page.waitForSelector('#fbarcnt', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {

View File

@ -197,7 +197,7 @@ module.exports = class Scraper {
let html = await this.page.content();
if (this.config.html_output) {
this.html_output[keyword][page_num] = html;
this.html_output[keyword][this.page_num] = html;
}
let parsed = this.parse(html);

View File

@ -104,6 +104,8 @@ class ScrapeManager {
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to also passthru all the html output of the serp pages
html_output: false,
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
@ -357,7 +359,10 @@ class ScrapeManager {
let res = await this.scraper.run(this.page);
results = res.results;
metadata = this.scraper.metadata;
num_requests = this.scraper.num_requests;
html_output = this.scraper.html_output;
} else {
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
// https://github.com/GoogleChrome/puppeteer/issues/678

View File

@ -1,6 +1,7 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
@ -189,8 +190,85 @@ function test_case_effective_query(response) {
}
}
(async () => {
await normal_search_test();
await no_results_test();
await effective_query_test();
})();
const ads_keywords = ['cloud services', 'buy shoes'];
/**
 * Test body for the "finds ads" case: runs a one-page Bing scrape for every
 * entry in `ads_keywords` and hands the resolved response to
 * `test_case_ads_test` for validation.
 *
 * @returns {Promise<void>} settles once the scrape finished and was validated.
 */
async function ads_test() {
    // Options for the browser / scraper runtime.
    const browser_options = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    // Description of what should be scraped.
    const job = {
        search_engine: 'bing',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');

    const response = await se_scraper.scrape(browser_options, job);
    test_case_ads_test(response);
}
/**
 * Validates the response of the Bing ads scrape.
 *
 * Expects exactly two requests, an entry in `response.results` for every
 * keyword in `ads_keywords`, and for every scraped page a well-formed SERP
 * object carrying at least two fully populated ads.
 *
 * @param {object} response - value resolved by `se_scraper.scrape()`; read here
 *     as `response.metadata.num_requests` and
 *     `response.results[query][page_number]`.
 * @throws {Error} assertion error from the first check that does not hold.
 */
function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    // Checked once up front: inside the per-query loop this would never run
    // for an empty result set, letting a completely failed scrape pass.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // NaN has type 'number', so a type check alone cannot reject a
            // non-numeric key; test for NaN explicitly.
            assert.isFalse(Number.isNaN(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() yields NaN (still a 'number') for unparseable input.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');

            for (const res of obj.ads) {
                assert.isOk(res.ads_link, 'link must be ok');
                assert.typeOf(res.ads_link, 'string', 'link must be string');
                assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.ads_link_target, 'link target must be ok');
                assert.typeOf(res.ads_link_target, 'string', 'link target must be string');
                assert.isAtLeast(res.ads_link_target.length, 5, 'link target must have at least 5 chars');

                assert.isOk(res.ad_visible_url, 'visible_link must be ok');
                assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
                assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }
        }
    }
}
// Registers the Bing test cases. The suite callback is a regular function
// (not an arrow function) because it configures the timeout through `this`.
describe('Bing', function () {
    this.timeout(30000);

    // [title, test body] pairs, registered in this order.
    const cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, run] of cases) {
        it(title, run);
    }
});

View File

@ -82,7 +82,7 @@ function normal_search_test_case(response) {
}
}
const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk',];
const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
async function no_results_test() {
let config = {
@ -159,7 +159,6 @@ async function effective_query_test() {
function test_case_effective_query(response) {
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
@ -222,10 +221,106 @@ function check_html_output_test_case( response ) {
}
}
const ads_keywords = ['cloud services', 'buy shoes'];
/**
 * Test body for the "finds ads" case: runs a one-page Google scrape for every
 * entry in `ads_keywords` and hands the resolved response to
 * `test_case_ads_test` for validation.
 *
 * @returns {Promise<void>} settles once the scrape finished and was validated.
 */
async function ads_test() {
    // Options for the browser / scraper runtime.
    const browser_options = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    // Description of what should be scraped.
    const job = {
        search_engine: 'google',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');

    const response = await se_scraper.scrape(browser_options, job);
    test_case_ads_test(response);
}
/**
 * Validates the response of the Google ads scrape.
 *
 * Expects exactly two requests, an entry in `response.results` for every
 * keyword in `ads_keywords`, and for every scraped page a well-formed SERP
 * object with at least one fully populated ad in both `top_ads` and
 * `bottom_ads`.
 *
 * @param {object} response - value resolved by `se_scraper.scrape()`; read here
 *     as `response.metadata.num_requests` and
 *     `response.results[query][page_number]`.
 * @throws {Error} assertion error from the first check that does not hold.
 */
function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    // Checked once up front: inside the per-query loop this would never run
    // for an empty result set, letting a completely failed scrape pass.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        for (const page_number in response.results[query]) {
            // NaN has type 'number', so a type check alone cannot reject a
            // non-numeric key; test for NaN explicitly.
            assert.isFalse(Number.isNaN(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() yields NaN (still a 'number') for unparseable input.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert.isAtLeast(obj.top_ads.length, 1, 'top_ads must have at least 1 SERP object');
            assert.isAtLeast(obj.bottom_ads.length, 1, 'bottom_ads must have at least 1 SERP object');

            // Top and bottom ads share one shape, so both lists go through
            // the same checks; the section name is put into each message.
            for (const section of ['top_ads', 'bottom_ads']) {
                for (const res of obj[section]) {
                    assert.isOk(res.ads_link, `${section}: link must be ok`);
                    assert.typeOf(res.ads_link, 'string', `${section}: link must be string`);
                    assert.isAtLeast(res.ads_link.length, 5, `${section}: link must have at least 5 chars`);

                    assert.isOk(res.ads_link_target, `${section}: link target must be ok`);
                    assert.typeOf(res.ads_link_target, 'string', `${section}: link target must be string`);
                    assert.isAtLeast(res.ads_link_target.length, 5, `${section}: link target must have at least 5 chars`);

                    assert.isOk(res.ad_visible_url, `${section}: visible_link must be ok`);
                    assert.typeOf(res.ad_visible_url, 'string', `${section}: visible_link must be string`);
                    assert.isAtLeast(res.ad_visible_url.length, 5, `${section}: visible_link must have at least 5 chars`);

                    assert.isOk(res.title, `${section}: title must be ok`);
                    assert.typeOf(res.title, 'string', `${section}: title must be string`);
                    assert.isAtLeast(res.title.length, 10, `${section}: title must have at least 10 chars`);

                    assert.isOk(res.snippet, `${section}: snippet must be ok`);
                    assert.typeOf(res.snippet, 'string', `${section}: snippet must be string`);
                    assert.isAtLeast(res.snippet.length, 10, `${section}: snippet must have at least 10 chars`);
                }
            }
        }
    }
}
// Registers the Google test cases. The suite callback is a regular function
// (not an arrow function) because it configures the timeout through `this`.
describe('Google', function () {
    this.timeout(30000);

    // [title, test body] pairs, registered in this order.
    const cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['html output query', html_output_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, run] of cases) {
        it(title, run);
    }
});