From 5e47c27c70e83f895ca21d54576bd9a3b9ad11ff Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Mon, 23 Sep 2019 23:38:38 +0200 Subject: [PATCH] too late to find a proper commit description --- examples/quickstart.js | 7 +- se-scraper.iml | 4 +- src/modules/bing.js | 140 ++++++++++++++++++++++++++------------ src/node_scraper.js | 5 +- test/static_tests/bing.js | 60 +++++++++++++++- test/test_bing.js | 8 ++- 6 files changed, 167 insertions(+), 57 deletions(-) diff --git a/examples/quickstart.js b/examples/quickstart.js index a8178bb..2545d80 100644 --- a/examples/quickstart.js +++ b/examples/quickstart.js @@ -4,14 +4,13 @@ const se_scraper = require('./../src/node_scraper.js'); let browser_config = { debug_level: 1, test_evasion: false, - log_http_headers: true, - log_ip_address: true, + log_http_headers: false, + log_ip_address: false, random_user_agent: false, apply_evasion_techniques: true, screen_output: false, - html_output: true, + html_output: false, clean_html_output: true, - compress: true, }; let scrape_job = { diff --git a/se-scraper.iml b/se-scraper.iml index 8021953..656287b 100644 --- a/se-scraper.iml +++ b/se-scraper.iml @@ -2,7 +2,9 @@ - + + + diff --git a/src/modules/bing.js b/src/modules/bing.js index e9a2452..0cda19a 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -3,52 +3,106 @@ const Scraper = require('./se_scraper'); class BingScraper extends Scraper { - parse(html) { - // load the page source into cheerio - const $ = cheerio.load(html); + async parse_async(html) { - // perform queries - const results = []; - $('#b_content #b_results .b_algo').each((i, link) => { - results.push({ - link: $(link).find('h2 a').attr('href'), - title: $(link).find('h2').text(), - snippet: $(link).find('.b_caption p').text(), - visible_link: $(link).find('cite').text(), - }) + let results = await this.page.evaluate(() => { + + let _text = (el, s) => { + let n = el.querySelector(s); + + if (n) { + return n.innerText; + } else { + return ''; + } + }; + + let _attr = (el, s, attr) => { + let n = el.querySelector(s); + + if (n) { + return n.getAttribute(attr); + } else { + return null; + } + }; + + let results = { + num_results: '', + no_results: false, + effective_query: '', + results: [], + ads: [], + right_side_ads: [], + }; + + let num_results_el = document.querySelector('#b_content .sb_count'); + + if (num_results_el) { + results.num_results = num_results_el.innerText; + } + + let organic_results = document.querySelectorAll('#b_content #b_results .b_algo'); + + organic_results.forEach((el) => { + + let serp_obj = { + link: _attr(el, 'h2 a', 'href'), + title: _text(el, 'h2'), + snippet: _text(el, '.b_caption p'), + visible_link: _text(el, 'cite'), + }; + + results.results.push(serp_obj); + }); + + // check if no results + results.no_results = (results.results.length === 0); + + // parse bing ads + let ads = document.querySelectorAll('#b_results .b_ad .sb_add'); + + ads.forEach((el) => { + + let ad_obj = { + title: _text(el, 'h2 a'), + snippet: _text(el, '.b_caption p'), + visible_link: _text(el, '.b_adurl cite'), + tracking_link: _attr(el, 'h2 a', 'href'), + }; + + results.ads.push(ad_obj); + }); + + // right side ads + let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add'); + + right_side_ads.forEach((el) => { + + let ad_obj = { + title: _text(el, 'h2 a'), + snippet: _text(el, '.b_caption p'), + visible_link: _text(el, '.b_adurl cite'), + tracking_link: _attr(el, 'h2 a', 'href'), + }; + + results.right_side_ads.push(ad_obj); + }); + + + let effective_query_el = document.querySelector('#sp_requery a'); + + if (effective_query_el) { + results.effective_query = effective_query_el.innerText; + } + + return results; }); - // parse bing ads - const ads = []; - $('.b_ad .sb_add').each((i, element) => { - ads.push({ - visible_link: $(element).find('.b_adurl cite').text(), - tracking_link: $(element).find('h2 a').attr('href'), - //link: $(element).find('link').attr('href'), - title: $(element).find('h2 a').text(), - snippet: $(element).find('.b_caption').text(), - }) - }); - - // 'Including results for', 'Einschließlich Ergebnisse' - let no_results = this.no_results( - ['There are no results', 'Es gibt keine Ergebnisse'], - $('#b_results').text() - ); - - let effective_query = $('#sp_requery a').first().text() || ''; - - const cleaned = this.clean_results(results, ['title', 'link']); - const ads_cleaned = this.clean_results(ads, ['title', 'visible_link', 'tracking_link']); - - return { - time: (new Date()).toUTCString(), - no_results: no_results, - effective_query: effective_query, - num_results: $('#b_content .sb_count').text(), - results: cleaned, - ads: ads_cleaned, - } + results.results = this.clean_results(results.results, ['title', 'link']); + results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']); + results.time = (new Date()).toUTCString(); + return results; } async load_start_page() { diff --git a/src/node_scraper.js b/src/node_scraper.js index faf973f..7238f5f 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -76,11 +76,10 @@ class ScrapeManager { log_http_headers: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. - sleep_range: undefined, + sleep_range: null, // which search engine to scrape search_engine: 'google', search_engine_name: 'google', - compress: false, // compress // whether debug information should be printed // level 0: print nothing // level 1: print most important info @@ -114,7 +113,7 @@ class ScrapeManager { // this module should export the functions: // get_browser, handle_metadata, close_browser //custom_func: resolve('examples/pluggable.js'), - custom_func: undefined, + custom_func: null, throw_on_detection: false, // use a proxy for all connections // example: 'socks5://78.94.172.42:1080' diff --git a/test/static_tests/bing.js b/test/static_tests/bing.js index bed1e16..ae0b127 100644 --- a/test/static_tests/bing.js +++ b/test/static_tests/bing.js @@ -34,6 +34,11 @@ async function bing_ads() { bing_search_with_ads3( await scraper.scrape(scrape_config) ); + scrape_config.keywords = ['service auto garage']; + scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html'); + + bing_search_with_ads4( await scraper.scrape(scrape_config) ); + await scraper.quit(); } @@ -52,7 +57,9 @@ function bing_search_with_ads(response) { assert.include(obj.num_results, '1’100’000', 'num results not included'); assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); - assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads'); + assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads'); + + assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads'); assert.equal(obj.no_results, false, 'no results should be false'); assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); @@ -79,7 +86,9 @@ function bing_search_with_ads2(response) { assert.include(obj.num_results, '44’300’000', 'num results not included'); assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects'); - assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads'); + assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads'); + + assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads'); assert.equal(obj.no_results, false, 'no results should be false'); assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); @@ -102,7 +111,7 @@ function bing_search_with_ads3(response) { let obj = response.results[query][page_number]; - assert.include(obj.num_results, '65.500.000 results', 'num results not included'); + assert.include(obj.num_results, '65.500.000 Results', 'num results not included'); assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects'); assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads'); @@ -117,6 +126,32 @@ function bing_search_with_ads3(response) { } } +function bing_search_with_ads4(response) { + assert.equal(response.metadata.num_requests, 1); + + for (let query in response.results) { + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included'); + assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object'); + assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects'); + assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads'); + + assert.equal(obj.no_results, false, 'no results should be false'); + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + confirm_results_ok(obj); + } + } +} + function confirm_results_ok(obj) { @@ -160,6 +195,25 @@ function confirm_results_ok(obj) { assert.typeOf(res.snippet, 'string', 'snippet must be string'); assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); } + + for (let res of obj.right_side_ads) { + + assert.isOk(res.tracking_link, 'link must be ok'); + assert.typeOf(res.tracking_link, 'string', 'link must be string'); + assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'link must be ok'); + assert.typeOf(res.visible_link, 'string', 'link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + } } describe('Bing', function(){ diff --git a/test/test_bing.js b/test/test_bing.js index b5faff1..f62c56a 100644 --- a/test/test_bing.js +++ b/test/test_bing.js @@ -73,9 +73,11 @@ function normal_search_test_case(response) { assert.typeOf(res.title, 'string', 'title must be string'); assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars'); - assert.isOk(res.snippet, 'snippet must be ok'); - assert.typeOf(res.snippet, 'string', 'snippet must be string'); - assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + if (res.snippet) { + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + } assert.isNumber(res.rank, 'rank must be integer'); assert.equal(res.rank, total_rank++, 'rank ist wrong');