From 394b567db64779b29d5de5358cc1752e42cbf641 Mon Sep 17 00:00:00 2001
From: HugoPoi
Date: Fri, 10 Jan 2020 09:35:24 +0100
Subject: [PATCH] test: add user_agent tests, add html_output tests

---
 package-lock.json   |   6 ++
 package.json        |   3 +-
 test/html_output.js |  96 +++++++++++++++++++++++++++++
 test/user_agent.js  | 144 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 248 insertions(+), 1 deletion(-)
 create mode 100644 test/html_output.js
 create mode 100644 test/user_agent.js

diff --git a/package-lock.json b/package-lock.json
index d4ea117..c5e9ae8 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2197,6 +2197,12 @@
       "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
       "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
     },
+    "ua-parser-js": {
+      "version": "0.7.21",
+      "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.21.tgz",
+      "integrity": "sha512-+O8/qh/Qj8CgC6eYBVBykMrNtp5Gebn4dlGD/kKXVkJNDwyrAwSIqwz8CDf+tsAIWVycKcku6gIXJ0qwx/ZXaQ==",
+      "dev": true
+    },
     "ultron": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
diff --git a/package.json b/package.json
index d110373..8533f13 100644
--- a/package.json
+++ b/package.json
@@ -39,6 +39,7 @@
     "express": "^4.17.1",
     "http-mitm-proxy": "^0.8.2",
     "key-cert": "^1.0.1",
-    "mocha": "^6.1.4"
+    "mocha": "^6.1.4",
+    "ua-parser-js": "^0.7.21"
   }
 }
diff --git a/test/html_output.js b/test/html_output.js
new file mode 100644
index 0000000..4829382
--- /dev/null
+++ b/test/html_output.js
@@ -0,0 +1,96 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('../');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here mount our fake engine in both http and https listen server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('html_output', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test html_output option
+         */
+        it('html_output single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                keywords: ['test keyword'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                logger: testLogger,
+                html_output: true,
+                //clean_html_output: false,
+                //clean_data_images: false,
+            });
+            await scraper.start();
+            const { results } = await scraper.scrape(scrape_job);
+            await scraper.quit();
+
+            assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');
+
+        });
+
+    });
+
+});
\ No newline at end of file
diff --git a/test/user_agent.js b/test/user_agent.js
new file mode 100644
index 0000000..8d7a3f3
--- /dev/null
+++ b/test/user_agent.js
@@ -0,0 +1,144 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+const UAParser = require('ua-parser-js');
+const _ = require('lodash');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('../');
+const Scraper = require('../src/modules/se_scraper');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.set('trust proxy', 'loopback');
+fakeSearchEngine.get('/test-user_agent', (req, res) => {
+    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
+    res.send(req.headers['user-agent']);
+});
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here mount our fake engine in both http and https listen server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('user_agent', function(){
+
+        class MockScraperTestUserAgent extends Scraper {
+
+            async load_start_page(){
+                return true;
+            }
+
+            async search_keyword(){
+                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
+            }
+
+            async parse_async(){
+                const bodyHandle = await this.page.$('body');
+                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
+            }
+        }
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test user_agent option
+         */
+        it('fixed user_agent', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestUserAgent,
+                keywords: ['javascript is hard'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                logger: testLogger,
+                user_agent: 'THIS IS A USERAGENT 42.0'
+            });
+            await scraper.start();
+
+            const { results } = await scraper.scrape(scrape_job);
+            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');
+
+            await scraper.quit();
+        });
+
+        /**
+         * Test random_user_agent option
+         * TODO generated user_agent should be different for each keyword
+         * TODO this test will sometimes fail because user_agent not very random :-(
+         */
+        it('random_user_agent', async function () {
+
+            const scrape_job = {
+                search_engine: MockScraperTestUserAgent,
+                keywords: ['news'],
+            };
+
+            const NUMBER_OF_EXEC = 10;
+
+            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
+                const scraper = new se_scraper.ScrapeManager({
+                    throw_on_detection: true,
+                    logger: testLogger,
+                    random_user_agent: true,
+                });
+                await scraper.start();
+                const { results: { news } } = await scraper.scrape(scrape_job);
+                await scraper.quit();
+                return news['1'];
+            });
+
+            uaList.forEach((userAgent) => {
+                const uaParsed = UAParser(userAgent);
+                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
+                assert(uaParsed.os.name, 'UserAgent should have a os name detected');
+            });
+
+            assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.3), 'Each user agent should appear less than 30% of the time' );
+
+        });
+
+    });
+
+});
\ No newline at end of file
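
A note for reviewers: the last assertion in test/user_agent.js compresses the "no user agent dominates" check into a single lodash chain. The sketch below unpacks what that chain computes; the user-agent strings are invented for illustration, and only the chain itself is taken from the patch.

    'use strict';
    const _ = require('lodash');

    // Stand-in for the uaList collected over the NUMBER_OF_EXEC scrape runs
    // (hypothetical values, for illustration only).
    const uaList = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/79.0.3945.88 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
    ];

    const [mostFrequentUa, occurrences] = _.chain(uaList)
        .countBy()          // { '<ua string>': number of occurrences, ... }
        .toPairs()          // [ ['<ua string>', occurrences], ... ]
        .sortBy(e => e[1])  // ascending by occurrence count
        .last()             // the pair with the highest count
        .value();

    console.log(mostFrequentUa, occurrences); // the Firefox string, 2

The test then requires occurrences < NUMBER_OF_EXEC * 0.3, i.e. even the most repeated randomly generated user agent must account for less than 30% of the runs, which is why the TODO above it warns the test can flake when the generator is not very random.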