mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-01-26 06:08:34 +01:00
Parsing ads is now supported for Google, Bing and DuckDuckGo
This commit is contained in:
parent
09c1255400
commit
bbebe3ce60
Binary file not shown.
Before Width: | Height: | Size: 307 KiB |
BIN
debug_se_scraper_google_cloud service.png
Normal file
BIN
debug_se_scraper_google_cloud service.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 343 KiB |
Binary file not shown.
Before Width: | Height: | Size: 92 KiB After Width: | Height: | Size: 91 KiB |
@ -4,19 +4,22 @@ const se_scraper = require('./../src/node_scraper.js');
|
||||
let browser_config = {
|
||||
debug_level: 2,
|
||||
output_file: 'examples/results/data.json',
|
||||
test_evasion: true,
|
||||
test_evasion: false,
|
||||
headless: false,
|
||||
block_assets: false,
|
||||
random_user_agent: true,
|
||||
};
|
||||
|
||||
let scrape_job = {
|
||||
search_engine: 'google',
|
||||
keywords: ['news', 'se-scraper'],
|
||||
keywords: ['cloud service'],
|
||||
num_pages: 1,
|
||||
// add some cool google search settings
|
||||
google_settings: {
|
||||
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||
hl: 'en', // The hl parameter determines the Google UI language to return results.
|
||||
start: 0, // Determines the results offset to use, defaults to 0.
|
||||
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
num: 10, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
},
|
||||
};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
BIN
headless-test-result.png
Normal file
BIN
headless-test-result.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 50 KiB |
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.3.12",
|
||||
"version": "1.3.13",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
@ -18,6 +18,18 @@ class BingScraper extends Scraper {
|
||||
})
|
||||
});
|
||||
|
||||
// parse bing ads
|
||||
const ads = [];
|
||||
$('.b_ad .sb_add').each((i, element) => {
|
||||
ads.push({
|
||||
ad_visible_url: $(element).find('.b_adurl cite').text(),
|
||||
ads_link: $(element).find('h2 a').attr('href'),
|
||||
ads_link_target: $(element).find('h2 link').attr('href'),
|
||||
title: $(element).find('h2 a').text(),
|
||||
snippet: $(element).find('.b_caption').text(),
|
||||
})
|
||||
});
|
||||
|
||||
// 'Including results for', 'Einschließlich Ergebnisse'
|
||||
let no_results = this.no_results(
|
||||
['There are no results', 'Es gibt keine Ergebnisse'],
|
||||
@ -41,6 +53,7 @@ class BingScraper extends Scraper {
|
||||
effective_query: effective_query,
|
||||
num_results: $('#b_content .sb_count').text(),
|
||||
results: cleaned,
|
||||
ads: ads,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,16 @@ class DuckduckgoScraper extends Scraper {
|
||||
});
|
||||
});
|
||||
|
||||
const ads = [];
|
||||
$('.results--ads.has-ad').each((i, element) => {
|
||||
ads.push({
|
||||
ad_visible_url: $(element).find('.result__url').text(),
|
||||
ads_link: $(element).find('.result__title .result__a').attr('href'),
|
||||
title: $(element).find('.result__title .result__a').text(),
|
||||
snippet: $(element).find('.result__snippet').text(),
|
||||
})
|
||||
});
|
||||
|
||||
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
|
||||
|
||||
const cleaned = [];
|
||||
@ -33,7 +43,8 @@ class DuckduckgoScraper extends Scraper {
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
effective_query: effective_query,
|
||||
results: cleaned
|
||||
results: cleaned,
|
||||
ads: ads,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,6 @@ class GoogleScraper extends Scraper {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#center_col .g').each((i, link) => {
|
||||
results.push({
|
||||
@ -25,6 +24,41 @@ class GoogleScraper extends Scraper {
|
||||
})
|
||||
});
|
||||
|
||||
// parse top ads
|
||||
const top_ads = [];
|
||||
$('#tads .ads-ad').each((i, element) => {
|
||||
top_ads.push({
|
||||
ad_visible_url: $(element).find('.ads-visurl cite').text(),
|
||||
ads_link: $(element).find('a:first-child').attr('href'),
|
||||
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('a h3').text(),
|
||||
snippet: $(element).find('.ads-creative').text(),
|
||||
})
|
||||
});
|
||||
|
||||
// parse bottom ads
|
||||
const bottomads = [];
|
||||
$('#tadsb .ads-ad').each((i, element) => {
|
||||
bottomads.push({
|
||||
ad_visible_url: $(element).find('.ads-visurl cite').text(),
|
||||
ads_link: $(element).find('a:first-child').attr('href'),
|
||||
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
|
||||
title: $(element).find('a h3').text(),
|
||||
snippet: $(element).find('.ads-creative').text(),
|
||||
})
|
||||
});
|
||||
|
||||
// parse google places
|
||||
const places = [];
|
||||
$('.rllt__link').each((i, element) => {
|
||||
places.push({
|
||||
heading: $(element).find('[role="heading"] span').text(),
|
||||
rating: $(element).find('.rllt__details div:first-child').text(),
|
||||
contact: $(element).find('.rllt__details div:nth-child(2)').text(),
|
||||
hours: $(element).find('.rllt__details div:nth-child(3)').text(),
|
||||
})
|
||||
});
|
||||
|
||||
// 'Ergebnisse für', 'Showing results for'
|
||||
let no_results = this.no_results(
|
||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||
@ -51,8 +85,12 @@ class GoogleScraper extends Scraper {
|
||||
num_results: $('#resultStats').text(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
results: cleaned
|
||||
top_ads: top_ads,
|
||||
bottom_ads: bottomads,
|
||||
places: places,
|
||||
results: cleaned,
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
@ -105,7 +143,7 @@ class GoogleScraper extends Scraper {
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#center_col .g', { timeout: this.STANDARD_TIMEOUT });
|
||||
await this.page.waitForSelector('#fbarcnt', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
|
@ -197,7 +197,7 @@ module.exports = class Scraper {
|
||||
let html = await this.page.content();
|
||||
|
||||
if (this.config.html_output) {
|
||||
this.html_output[keyword][page_num] = html;
|
||||
this.html_output[keyword][this.page_num] = html;
|
||||
}
|
||||
|
||||
let parsed = this.parse(html);
|
||||
|
@ -104,6 +104,8 @@ class ScrapeManager {
|
||||
num_pages: 1,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to also passthru all the html output of the serp pages
|
||||
html_output: false,
|
||||
// whether to prevent images, css, fonts and media from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
@ -357,7 +359,10 @@ class ScrapeManager {
|
||||
|
||||
let res = await this.scraper.run(this.page);
|
||||
results = res.results;
|
||||
metadata = this.scraper.metadata;
|
||||
num_requests = this.scraper.num_requests;
|
||||
html_output = this.scraper.html_output;
|
||||
|
||||
} else {
|
||||
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
||||
// https://github.com/GoogleChrome/puppeteer/issues/678
|
||||
|
@ -1,6 +1,7 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
var assert = require('chai').assert;
|
||||
|
||||
const chai = require('chai');
|
||||
chai.use(require('chai-string'));
|
||||
const assert = chai.assert;
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
@ -189,8 +190,85 @@ function test_case_effective_query(response) {
|
||||
}
|
||||
}
|
||||
|
||||
(async () => {
|
||||
await normal_search_test();
|
||||
await no_results_test();
|
||||
await effective_query_test();
|
||||
})();
|
||||
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
|
||||
/**
 * Scrapes Bing for the ad-heavy keywords in `ads_keywords` (one page each)
 * and passes the scraper response to `test_case_ads_test` for validation.
 */
async function ads_test() {
    const browser_settings = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    const job = {
        search_engine: 'bing',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');

    const response = await se_scraper.scrape(browser_settings, job);
    test_case_ads_test(response);
}
|
||||
|
||||
/**
 * Validates a Bing scrape response that is expected to contain ads.
 *
 * Asserts that exactly two requests were issued, that every keyword in
 * `ads_keywords` is present in the results, and that each scraped page
 * carries the regular SERP fields plus an `ads` array whose entries all
 * have a link, link target, visible url, title and snippet.
 *
 * @param {object} response - object returned by `se_scraper.scrape()`;
 *     reads `response.metadata.num_requests` and `response.results`.
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_ads_test(response) {
    // One ad text field: must be truthy, a string and reasonably long.
    const assert_ad_field = (value, label, min_length) => {
        assert.isOk(value, `${label} must be ok`);
        assert.typeOf(value, 'string', `${label} must be string`);
        assert.isAtLeast(value.length, min_length, `${label} must have at least ${min_length} chars`);
    };

    assert.equal(response.metadata.num_requests, 2);

    // Checked once and outside the loops: inside the loop this assertion
    // would never run (and thus never fail) for an empty result object.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (const [query, pages] of Object.entries(response.results)) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() returns NaN for non-numeric keys and NaN is still
            // of type 'number', so test for an integer explicitly.
            assert.isTrue(
                Number.isInteger(Number.parseInt(page_number, 10)),
                'page_number must be numeric'
            );

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() signals an invalid date with NaN, which a
            // typeof check cannot detect.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');

            for (const ad of obj.ads) {
                assert_ad_field(ad.ads_link, 'link', 5);
                assert_ad_field(ad.ads_link_target, 'link', 5);
                assert_ad_field(ad.ad_visible_url, 'visible_link', 5);
                assert_ad_field(ad.title, 'title', 10);
                assert_ad_field(ad.snippet, 'snippet', 10);
            }
        }
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Mocha suite for the Bing scraper. A regular function (not an arrow) is
// required so that `this` refers to the suite context for `this.timeout()`.
describe('Bing', function () {
    this.timeout(30000);

    // [title, test function] pairs, registered in declaration order.
    const cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, test_fn] of cases) {
        it(title, test_fn);
    }
});
|
||||
|
@ -82,7 +82,7 @@ function normal_search_test_case(response) {
|
||||
}
|
||||
}
|
||||
|
||||
const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk',];
|
||||
const keywords_no_results = ['fgskl34440abJAksfs4353534a3l34AVGFDFflkjaQQuBBdfk',];
|
||||
|
||||
async function no_results_test() {
|
||||
let config = {
|
||||
@ -159,7 +159,6 @@ async function effective_query_test() {
|
||||
function test_case_effective_query(response) {
|
||||
assert.equal(response.metadata.num_requests, 1);
|
||||
|
||||
results = response.results;
|
||||
for (let query in response.results) {
|
||||
|
||||
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
|
||||
@ -222,10 +221,106 @@ function check_html_output_test_case( response ) {
|
||||
}
|
||||
}
|
||||
|
||||
const ads_keywords = ['cloud services', 'buy shoes'];
|
||||
|
||||
/**
 * Scrapes Google for the ad-heavy keywords in `ads_keywords` (one page each)
 * and passes the scraper response to `test_case_ads_test` for validation.
 */
async function ads_test() {
    const browser_settings = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    const job = {
        search_engine: 'google',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');

    const response = await se_scraper.scrape(browser_settings, job);
    test_case_ads_test(response);
}
|
||||
|
||||
/**
 * Validates a Google scrape response that is expected to contain ads.
 *
 * Asserts that exactly two requests were issued, that every keyword in
 * `ads_keywords` is present in the results, and that each scraped page
 * carries the regular SERP fields plus `top_ads`, `bottom_ads` and
 * `places`. Every top and bottom ad must have a link, link target,
 * visible url, title and snippet.
 *
 * @param {object} response - object returned by `se_scraper.scrape()`;
 *     reads `response.metadata.num_requests` and `response.results`.
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_ads_test(response) {
    // One ad text field: must be truthy, a string and reasonably long.
    const assert_ad_field = (value, label, min_length) => {
        assert.isOk(value, `${label} must be ok`);
        assert.typeOf(value, 'string', `${label} must be string`);
        assert.isAtLeast(value.length, min_length, `${label} must have at least ${min_length} chars`);
    };

    // Top and bottom ads share the same shape, so they share one validator.
    const assert_ads = (ads) => {
        for (const ad of ads) {
            assert_ad_field(ad.ads_link, 'link', 5);
            assert_ad_field(ad.ads_link_target, 'link', 5);
            assert_ad_field(ad.ad_visible_url, 'visible_link', 5);
            assert_ad_field(ad.title, 'title', 10);
            assert_ad_field(ad.snippet, 'snippet', 10);
        }
    };

    assert.equal(response.metadata.num_requests, 2);

    // Checked once and outside the loops: inside the loop this assertion
    // would never run (and thus never fail) for an empty result object.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (const [query, pages] of Object.entries(response.results)) {
        for (const [page_number, obj] of Object.entries(pages)) {
            // parseInt() returns NaN for non-numeric keys and NaN is still
            // of type 'number', so test for an integer explicitly.
            assert.isTrue(
                Number.isInteger(Number.parseInt(page_number, 10)),
                'page_number must be numeric'
            );

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() signals an invalid date with NaN, which a
            // typeof check cannot detect.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert.isAtLeast(obj.top_ads.length, 1, 'top_ads must have at least 1 SERP object');
            assert.isAtLeast(obj.bottom_ads.length, 1, 'bottom_ads must have at least 1 SERP object');

            assert_ads(obj.top_ads);
            assert_ads(obj.bottom_ads);
        }
    }
}
|
||||
|
||||
// Mocha suite for the Google scraper. A regular function (not an arrow) is
// required so that `this` refers to the suite context for `this.timeout()`.
describe('Google', function () {
    this.timeout(30000);

    // [title, test function] pairs, registered in declaration order.
    const cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['html output query', html_output_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, test_fn] of cases) {
        it(title, test_fn);
    }
});
|
||||
|
Loading…
Reference in New Issue
Block a user