Mirror of https://github.com/NikolaiT/se-scraper.git (synced 2025-06-20 17:47:49 +02:00)

test: add bing module test

parent 392c43390e
commit 3ab8e46126
@ -123,12 +123,9 @@ class BingScraper extends Scraper {
        }
    }

    try {
        await this.page.goto(startUrl);
        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
    } catch (e) {
        return false;
    }
    return true;
}
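
The hunk above is the production change the new test exercises: start-page navigation is wrapped in a try/catch so a failed load reports false instead of throwing, and the selector wait takes its timeout from this.STANDARD_TIMEOUT, which the tests shrink to 500 ms. A minimal sketch of that pattern, with assumed names (ExampleScraper, loadStartPage); the surrounding method body is not fully shown in this hunk:

const puppeteer = require('puppeteer');

class ExampleScraper {
    constructor(page) {
        this.page = page;
        this.STANDARD_TIMEOUT = 10000; // instance property, so tests can override it
    }

    async loadStartPage(startUrl) {
        try {
            await this.page.goto(startUrl);
            // resolves once the search box appears, rejects after STANDARD_TIMEOUT ms
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false; // navigation and timeout errors become a testable boolean
        }
        return true;
    }
}

// Usage: a test can set scraper.STANDARD_TIMEOUT = 500 to fail fast, as the tests below do.
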
206	test/mocks/google/test keyword_page2.html	Normal file
File diff suppressed because one or more lines are too long

191	test/mocks/google/test keyword_page3.html	Normal file
File diff suppressed because one or more lines are too long

125	test/modules/bing.js	Normal file
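
Both suppressed diffs are single-line HTML captures of live result pages, checked in as fixtures for the fake engine defined in the test below. A quick sanity check that a fixture contains what a test expects; the selector and the page-1 filename are assumptions here, since neither appears in this diff:

const fs = require('fs');
const cheerio = require('cheerio');

// Assumed fixture path and selector; adjust to what the module actually parses.
const html = fs.readFileSync('test/mocks/bing/test keyword_page1.html', 'utf8');
const $ = cheerio.load(html);
console.log('organic results found:', $('#b_results .b_algo').length); // expect 10
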
@ -0,0 +1,125 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
// TODO add a test logger in place of default winston logger
const logger = require('winston');
const net = require('net');
const http = require('http');
const https = require('https');
const url = require('url');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');

const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');

const httpPort = 3012;
const httpsPort = httpPort + 1;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start/10) || 0) + 1;
    res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
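
The /search handler above maps each request to a fixture by the naming convention '<query>_page<N>.html', with N derived from Bing's start offset in steps of 10. The same lookup restated as a pure function (fixtureFor is illustrative, not part of the codebase):

const path = require('path');

// Illustrative re-statement of the fake engine's fixture lookup.
function fixtureFor(q, start) {
    // start arrives as a query-string value: '10' / 10 === 1 by numeric
    // coercion, and a missing start gives NaN, so (NaN || 0) falls back to 0.
    const pageNumber = ((start / 10) || 0) + 1;
    return path.join(__dirname, '../mocks/bing/' + q + '_page' + pageNumber + '.html');
}

// q='test keyword'              -> .../mocks/bing/test keyword_page1.html
// q='test keyword', start='10'  -> .../mocks/bing/test keyword_page2.html
// q='test keyword', start='20'  -> .../mocks/bing/test keyword_page3.html
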
describe('Module Bing', function(){

    let httpServerAndProxy, httpsServer;
    before(async function(){
        // Mount our fake engine on both an http and an https server
        httpServerAndProxy = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        /**
         * express doesn't handle the HTTP CONNECT method, so this implements a
         * basic MITM http proxy: our http server also acts as an http proxy and
         * rewrites all http/s requests to our fake engine
         */
        httpServerAndProxy.on('connect', (req, clientSocket, head) => {
            const parsedUrl = url.parse('http://' + req.url);
            const destPort = (parseInt(parsedUrl.port) === 443) ? httpsPort : httpPort;
            const serverSocket = net.connect(destPort, 'localhost', () => {
                debug('connection proxied askedHost=%s toPort=%s', parsedUrl.host, destPort);
                clientSocket.write('HTTP/1.1 200 Connection Established\r\n' +
                    'Proxy-agent: Node.js-Proxy\r\n' +
                    '\r\n');
                serverSocket.write(head);
                serverSocket.pipe(clientSocket);
                clientSocket.pipe(serverSocket);
                serverSocket.on('error', (err) => {
                    console.error(err);
                });
            });
        });

        await Promise.promisify(httpServerAndProxy.listen, {context: httpServerAndProxy})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });
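
This before hook is the heart of the setup: the browser below is launched with --proxy-server pointing at httpServerAndProxy, so plain http:// URLs hit the fake engine directly, while https:// URLs make the browser send an HTTP CONNECT, which the handler above answers with 200 and splices onto the local HTTPS server. Every hostname the scraper navigates to therefore resolves to the fixtures. A small way to poke the tunnel without a browser; the port assumes httpPort above, and the target host is arbitrary because this proxy always tunnels to localhost:

// Sketch: open a raw CONNECT tunnel through the proxy and check the handshake.
const http = require('http');

const req = http.request({
    host: 'localhost',
    port: 3012,               // httpPort above
    method: 'CONNECT',
    path: 'www.bing.com:443', // ignored by this proxy, which tunnels to localhost
});
req.on('connect', (res, socket) => {
    console.log('tunnel status:', res.statusCode); // expect 200
    socket.end();
});
req.end();
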
    after(function(){
        httpsServer.close();
        httpServerAndProxy.close();
    });

    let browser;
    let page;
    beforeEach(async function(){
        debug('Start a new browser');
        browser = await puppeteer.launch({
            //dumpio: true,
            ignoreHTTPSErrors: true,
            args: [ '--proxy-server=http://localhost:' + httpPort ]
        });
        debug('Open a fresh page');
        page = await browser.newPage();
    });

    afterEach(async function(){
        await browser.close();
    });
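
The tests below destructure {results, metadata, num_requests} from run()'s resolution value. The shape sketched here is inferred from those assertions alone, not from documented API; the contents of metadata, and any organic-result fields beyond title, are assumptions:

// Inferred, illustrative shape of what bingScraper.run({page}) resolves with:
const exampleRunResult = {
    num_requests: 1,       // one search request per scraped page
    metadata: {},          // engine metadata, not asserted in these tests
    results: {
        'test keyword': {  // keyed by keyword...
            '1': {         // ...then by page number as a string
                results: [ // the parsed organic results, 10 per fixture page
                    { title: '...' /* further fields assumed */ },
                ],
            },
        },
    },
};
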
    it('one keyword one page', function(){
        const bingScraper = new BingScraper({
            config: {
                search_engine_name: 'bing',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger,
                scrape_from_file: '',
            }
        });
        bingScraper.STANDARD_TIMEOUT = 500;
        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 1, 'Must do one request');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
        });
    });

    it('one keyword 3 pages', function () {
        const bingScraper = new BingScraper({
            config: {
                search_engine_name: 'bing',
                throw_on_detection: true,
                keywords: ['test keyword'],
                logger,
                scrape_from_file: '',
                num_pages: 3,
            }
        });
        bingScraper.STANDARD_TIMEOUT = 500;
        return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
            assert.strictEqual(num_requests, 3, 'Must do three requests');
            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
            assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
        });
    });

});
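
Neither test file names its runner, but the describe/it/before hooks match mocha's BDD interface, so presumably something like `npx mocha test/modules/bing.js` runs the suite. Setting `DEBUG=se-scraper:test` in the environment enables the debug() tracing used throughout, and uncommenting `dumpio: true` in the puppeteer launch options above would additionally stream the browser process stdio.

The remaining hunks update the existing Google module test to the same structure (the require path '../../src/modules/google' suggests it now lives under test/modules/ as well):
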
@ -1,22 +1,28 @@
+'use strict';
 const express = require('express');
 const puppeteer = require('puppeteer');
+// TODO add a test logger in place of default winston logger
 const logger = require('winston');
 const net = require('net');
 const http = require('http');
 const https = require('https');
 const url = require('url');
+const assert = require('assert');
+const path = require('path');
 const keyCert = require('key-cert');
 const Promise = require('bluebird');

 const debug = require('debug')('se-scraper:test');
-const { GoogleScraper } = require('../src/modules/google');
+const { GoogleScraper } = require('../../src/modules/google');

 const httpPort = 3012;
 const httpsPort = httpPort + 1;

 const fakeSearchEngine = express();
-fakeSearchEngine.get("/about", (req, res) => {
-    res.status(500).send("This is the About page");
+fakeSearchEngine.get('/search', (req, res, next) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
 });
 fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
@ -76,7 +82,7 @@ describe('Module Google', function(){
         await browser.close();
     });

-    it('one keyword', function(){
+    it('one keyword one page', function(){
         const googleScraper = new GoogleScraper({
             config: {
                 search_engine_name: 'google',
@ -84,14 +90,36 @@ describe('Module Google', function(){
                 keywords: ['test keyword'],
                 logger,
                 scrape_from_file: '',
-                google_settings: {
-                    //start_url: 'http://www.google.com/'
-                }
             }
         });
         googleScraper.STANDARD_TIMEOUT = 500;
-        return googleScraper.run({page});
+        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 1, 'Must do one request');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
+        });
     });
+
+    it('one keyword 3 pages', function () {
+        const googleScraper = new GoogleScraper({
+            config: {
+                search_engine_name: 'google',
+                throw_on_detection: true,
+                keywords: ['test keyword'],
+                logger,
+                scrape_from_file: '',
+                num_pages: 3,
+            }
+        });
+        googleScraper.STANDARD_TIMEOUT = 500;
+        return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
+            assert.strictEqual(num_requests, 3, 'Must do three requests');
+            assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
+            assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
+            assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
+            assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
+            assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
+            assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
+        });
+    });

 });