se-scraper/test/modules/google.js

125 lines
5.3 KiB
JavaScript
Raw Normal View History

2020-01-07 09:48:46 +01:00
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
2020-01-07 09:48:46 +01:00
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const http = require('http');
const https = require('https');
const url = require('url');
2020-01-07 09:48:46 +01:00
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const debug = require('debug')('se-scraper:test');
2020-01-07 09:48:46 +01:00
const { GoogleScraper } = require('../../src/modules/google');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const fakeSearchEngine = express();
2020-01-07 09:48:46 +01:00
// Directory holding the recorded Google result pages served by the fake engine.
const googleMocksDir = path.join(__dirname, '../mocks/google');

/**
 * GET /search?q=<keyword>[&start=<offset>]
 *
 * Replies with the recorded page `<q>_page<N>.html`, where N is derived from the
 * `start` offset (start/10 + 1, defaulting to page 1 when `start` is missing or not numeric).
 */
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start/10) || 0) + 1;
    const mockFileName = req.query.q + '_page' + pageNumber + '.html';
    // Serve relative to `root` instead of joining the query into an absolute path:
    // a `q` containing `..` segments can then no longer reach files outside the mocks directory.
    res.sendFile(mockFileName, {root: googleMocksDir});
});

// Every other path is answered from the same mocks directory (resolved from the working
// directory); a request without extension falls back to the matching `.html` file.
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
describe('Module Google', function(){
let httpServerAndProxy;
let httpsServer;

/**
 * Starts the fake search engine on an http server (httpPort) and an https server (httpsPort).
 * The http server additionally answers HTTP CONNECT so it can be used as the browser proxy.
 */
before(async function(){
    /**
     * Starts `server` on `port`; resolves once it is listening and rejects if binding fails.
     * Bind failures are reported through the 'error' event, not through the listen callback,
     * so both have to be observed for the hook to fail cleanly instead of hanging.
     */
    const listen = (server, port) => new Promise((resolve, reject) => {
        server.once('error', reject);
        server.listen(port, () => {
            server.removeListener('error', reject);
            resolve();
        });
    });

    // Mount our fake engine on both an http and an https listening server
    httpServerAndProxy = http.createServer(fakeSearchEngine);
    httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

    /**
     * express doesn't handle the HTTP CONNECT method, so this implements a basic MITM http proxy:
     * our http server also acts as an http proxy and rewrites every http/s request to our fake engine.
     * A CONNECT to port 443 is tunnelled to the https server, anything else to the http server.
     */
    httpServerAndProxy.on('connect', (req, clientSocket, head) => {
        const parsedUrl = url.parse('http://' + req.url);
        const destPort = (parseInt(parsedUrl.port, 10) === 443) ? httpsPort : httpPort;
        const serverSocket = net.connect(destPort, 'localhost', () => {
            debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
            clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
                'Proxy-agent: Node.js-Proxy\r\n' +
                '\r\n');
            serverSocket.write(head);
            serverSocket.pipe(clientSocket);
            clientSocket.pipe(serverSocket);
        });
        // Attach the error handlers right away, not inside the connect callback: a failed
        // connection attempt emits 'error' before that callback would ever run, and an
        // 'error' event without listener crashes the whole test process.
        serverSocket.on('error', (err) => {
            console.error(err);
            clientSocket.destroy();
        });
        // The browser may drop its side of the tunnel at any time (e.g. when it is closed).
        clientSocket.on('error', (err) => {
            debug('proxy client socket error: %s', err.message);
            serverSocket.destroy();
        });
    });

    await listen(httpServerAndProxy, httpPort);
    await listen(httpsServer, httpsPort);
    debug('Fake http search engine servers started');
});
// Stop accepting connections on both fake engine servers once the suite is done.
after(function(){
    for (const server of [httpsServer, httpServerAndProxy]) {
        server.close();
    }
});
let browser;
let page;

// Every test gets its own browser and page, with all traffic routed through the fake engine proxy.
beforeEach(async function(){
    const launchOptions = {
        //dumpio: true,
        ignoreHTTPSErrors: true,
        args: [ `--proxy-server=http://localhost:${httpPort}` ],
    };

    debug('Start a new browser');
    browser = await puppeteer.launch(launchOptions);

    debug('Open a fresh page');
    page = await browser.newPage();
});
// Close the browser opened for the test that just ran.
afterEach(async function(){
    // Nothing to close when launching the browser failed; bailing out here keeps the
    // launch error as the only reported failure instead of adding a TypeError on top.
    if (!browser) {
        return;
    }
    // Drop the shared handles first so a later hook never sees an already closed browser.
    const browserToClose = browser;
    browser = undefined;
    page = undefined;
    await browserToClose.close();
});
2020-01-07 09:48:46 +01:00
// Scrapes a single keyword with the default page count and checks the parsed organic results.
it('one keyword one page', async function(){
    const scraperConfig = {
        search_engine_name: 'google',
        throw_on_detection: true,
        keywords: ['test keyword'],
        logger,
        scrape_from_file: '',
    };
    const googleScraper = new GoogleScraper({config: scraperConfig});
    googleScraper.STANDARD_TIMEOUT = 500;

    const {results, num_requests} = await googleScraper.run({page});
    const firstPage = results['test keyword']['1'];

    assert.strictEqual(num_requests, 1, 'Must do one request');
    assert.strictEqual(firstPage.results.length, 10, 'Must have 10 organic results parsed');
});
2020-01-07 09:48:46 +01:00
// Scrapes one keyword over three result pages and checks, for each page, the number of
// organic results and the title of the first one.
it('one keyword 3 pages', function () {
    const googleScraper = new GoogleScraper({
        config: {
            search_engine_name: 'google',
            throw_on_detection: true,
            keywords: ['test keyword'],
            logger,
            scrape_from_file: '',
            num_pages: 3,
        }
    });
    googleScraper.STANDARD_TIMEOUT = 500;
    return googleScraper.run({page}).then(({results, num_requests}) => {
        assert.strictEqual(num_requests, 3, 'Must do three requests');
        assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
        assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
        assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
        assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
        assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
        assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
    });
});
});