NikolaiT/se-scraper
commit c60d0f3528 (parent 987e3d7342)

    clean test case for google is passing
TODO.txt (21 changed lines)
@@ -44,24 +44,7 @@ TODO:
 TODO:
-okay it's fucking time to make a generic scraping class like in GoogleScraper
+okay it's fucking time to make a generic scraping class like in GoogleScraper [done]
-
-i feel like history repeats
-
-class Scraper
-
-    constructor(options = {}) {
-
-    }
-
-    async load_search_engine() {}
-
-    async search_keyword() {}
-
-    async new_page() {}
-
-    async detected() {}
-
-then each search engine derives from this generic class
-
-some search engines do not need such an abstract class, because they are too complex
+write good test case for google
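The derivation pattern the old TODO sketched is what the hunks below rely on (GoogleScraper extends Scraper, and so on). A minimal standalone sketch of the idea; the base class shape follows the TODO stub, while ExampleEngineScraper and its method bodies are illustrative, not code from this repository:

    class Scraper {
        constructor(options = {}) {
            this.config = options;
        }
        async load_search_engine() {}
        async search_keyword() {}
        async new_page() {}
        async detected() {}
    }

    class ExampleEngineScraper extends Scraper {
        async search_keyword(keyword) {
            // engine-specific work would go here: open a page, type the
            // keyword, submit, then hand the HTML to a parse routine
            console.log(`searching "${keyword}" with config`, this.config);
        }
        async detected() {
            // engine-specific bot-detection check; always "clean" here
            return false;
        }
    }

    new ExampleEngineScraper({ verbose: true }).search_keyword('apple tree');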
package.json
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.1.8",
+  "version": "1.1.9",
   "description": "A simple module which uses puppeteer to scrape several search engines.",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
run.js (6 changed lines)
@@ -20,13 +20,13 @@ let config = {
     // this output is informational
     verbose: true,
     // an array of keywords to scrape
-    keywords: ['news'],
+    keywords: ['apple tree'],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // the number of pages to scrape for each keyword
-    num_pages: 2,
+    num_pages: 1,
     // whether to start the browser in headless mode
-    headless: true,
+    headless: false,
     // path to output file, data will be stored in JSON
     output_file: '',
     // whether to prevent images, css, fonts from being loaded
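For reference, run.js feeds a config like this into the same entry point the new tests exercise. A minimal sketch of the call, following the se_scraper.scrape(config, callback) usage from test/test_google.js below; the printed structure (results keyed by keyword, then page number) matches what the tests assert:

    const se_scraper = require('se-scraper');

    let config = {
        search_engine: 'google',
        keywords: ['apple tree'],
        num_pages: 1,
        headless: true,
    };

    se_scraper.scrape(config, (err, response) => {
        if (err) {
            console.error(err);
            return;
        }
        // response.results is keyed by keyword, then by page number
        console.log(JSON.stringify(response.results, null, 2));
    });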
@@ -19,9 +19,10 @@ class GoogleScraper extends Scraper {
             })
         });

+        // 'Ergebnisse für', 'Showing results for'
         let no_results = this.no_results(
             ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-            'No results found for', 'Ergebnisse für', 'Showing results for'],
+            'No results found for'],
             $('#main').text()
         );

@@ -109,7 +110,7 @@ class GoogleNewsOldScraper extends Scraper {

         let no_results = this.no_results(
             ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-            'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
+            'No results found for', 'did not match any news results'],
             $('#main').text()
         );

@@ -190,8 +191,7 @@ class GoogleImageScraper extends Scraper {
         });

         let no_results = this.no_results(
-            ['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
-            'Showing results for', 'Ergebnisse für'],
+            ['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for'],
             $('#main').text()
         );

@@ -297,7 +297,7 @@ class GoogleNewsScraper extends Scraper {

         let no_results = this.no_results(
             ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
-            'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
+            'No results found for', 'did not match any news results'],
             $('body').text()
         );
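The common thread in these four hunks: 'Ergebnisse für' and 'Showing results for' also appear on pages that do contain results, namely when Google substitutes a corrected query, so keeping them as needles made no_results flag valid pages. A tiny standalone illustration with made-up sample text:

    // The matching logic mirrors no_results(); the strings are made up.
    const needles_old = ['did not match any documents', 'Showing results for'];
    const needles_new = ['did not match any documents'];
    const serp_with_results = 'Showing results for apple tree ... About 20 results';

    console.log(needles_old.some(n => serp_with_results.includes(n))); // true, a false positive
    console.log(needles_new.some(n => serp_with_results.includes(n))); // false, correct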
@@ -176,8 +176,15 @@ module.exports = class Scraper {
     }

     no_results(needles, html) {
-        return !needles.map((needle) => { return html.indexOf(needle)})
-            .every((res) => { return res == -1});
+        for (let needle of needles) {
+            if (html.includes(needle)) {
+                if (this.config.debug) {
+                    console.log(`HTML contains needle ${needle}. no_results=true`);
+                }
+                return true;
+            }
+        }
+        return false;
     }

     parse(html) {
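Semantically the two versions agree: both report true as soon as any needle occurs in the page text; the rewrite adds the debug log and stops at the first hit instead of indexing every needle. A quick standalone check, with sample values and a stand-in config:

    // Sample values; `config` stands in for the scraper's config object.
    const config = { debug: true };
    const html = 'Your search did not match any documents.';
    const needles = ['did not match any documents', 'No results found for'];

    // old logic: true if any indexOf() call found a needle
    const old_way = !needles.map((needle) => html.indexOf(needle))
        .every((res) => res == -1);

    // new logic: early exit plus optional debug output
    function new_way(needles, html) {
        for (let needle of needles) {
            if (html.includes(needle)) {
                if (config.debug) console.log(`needle found: ${needle}`);
                return true;
            }
        }
        return false;
    }

    console.log(old_way, new_way(needles, html)); // true true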
test/test_google.js (new file, 205 lines)
@@ -0,0 +1,205 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

function sleep(ms) {
    return new Promise(resolve => {
        setTimeout(resolve, ms)
    })
}

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('normal_search_test()');
    await se_scraper.scrape(config, normal_search_test_case);
}

// we test with a callback function to our handler
function normal_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        let total_rank = 1;

        for (let query in response.results) {

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
                assert.equal(obj.no_results, false, 'no_results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk'];

async function no_results_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('no_results_test()');
    await se_scraper.scrape(config, test_case_no_results);
}

// we test with a callback function to our handler
function test_case_no_results(err, response) {
    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        for (let query in response.results) {

            assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

                assert(obj.results.length === 0, 'results must have 0 SERP objects');
                assert.equal(obj.no_results, true, 'no_results should be true');
                assert.isEmpty(obj.num_results, 'num_results should be an empty string');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

const effective_query_keywords = ['mount evverrest'];

async function effective_query_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: effective_query_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('effective_query_test()');
    await se_scraper.scrape(config, test_case_effective_query);
}

// we test with a callback function to our handler
function test_case_effective_query(err, response) {

    if (err) {
        console.error(err);
    } else {

        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        for (let query in response.results) {

            assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results'], 'not all keys are in the object');

                // the effective query must differ from the original keyword
                assert.isOk(obj.effective_query, 'effective query must be ok');
                assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
                assert(obj.effective_query !== query, 'effective query must be different from the keyword');

                assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
                assert.equal(obj.no_results, false, 'no_results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

(async () => {
    await normal_search_test();
    await no_results_test();
    await effective_query_test();
})();
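Although the header comment mentions mocha, the file drives itself through the closing async IIFE, so it runs directly with node test/test_google.js. A hypothetical mocha wrapper for the same three cases would look roughly like this; it is not part of the commit, and the timeout value is an assumption, since live scraping is slow:

    // Hypothetical mocha wrapper, assuming the IIFE above is removed first
    // so the cases do not also run on require.
    describe('se-scraper google', function () {
        this.timeout(300000); // assumed: live scraping runs need minutes
        it('parses normal search results', normal_search_test);
        it('detects pages without results', no_results_test);
        it('extracts the effective (corrected) query', effective_query_test);
    });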