mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-25 00:53:46 +01:00
test(duckduckgo): implement tests for duckduckgo module
This commit is contained in:
parent
b685fb4def
commit
28332528ea
148
test/mocks/duckduckgo/index.html
Normal file
148
test/mocks/duckduckgo/index.html
Normal file
@ -0,0 +1,148 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
|
||||
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->
|
||||
|
||||
<head>
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
||||
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
|
||||
<meta name="HandheldFriendly" content="true"/>
|
||||
|
||||
<link rel="canonical" href="https://duckduckgo.com/">
|
||||
|
||||
<link rel="stylesheet" href="/s1847.css" type="text/css">
|
||||
|
||||
<link rel="stylesheet" href="/o1847.css" type="text/css">
|
||||
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
|
||||
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
|
||||
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
|
||||
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
|
||||
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
|
||||
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
|
||||
<link rel="manifest" href="/manifest.json"/>
|
||||
|
||||
<meta name="twitter:card" content="summary">
|
||||
<meta name="twitter:site" value="@duckduckgo">
|
||||
|
||||
<meta property="og:url" content="https://duckduckgo.com/" />
|
||||
<meta property="og:site_name" content="DuckDuckGo" />
|
||||
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">
|
||||
|
||||
|
||||
<title>DuckDuckGo — Privacy, simplified.</title>
|
||||
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />
|
||||
|
||||
|
||||
<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
|
||||
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
|
||||
|
||||
|
||||
</head>
|
||||
<body id="pg-index" class="page-index body--home">
|
||||
<script type="text/javascript">
|
||||
var settings_js_version = "/s2475.js",
|
||||
locale = "en_US";
|
||||
</script>
|
||||
<script type="text/javascript" src="/lib/l113.js"></script>
|
||||
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
|
||||
<script type="text/javascript" src="/util/u418.js"></script>
|
||||
<script type="text/javascript" src="/d2727.js"></script>
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript">
|
||||
DDG.page = new DDG.Pages.Home();
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
<div class="site-wrapper site-wrapper--home js-site-wrapper">
|
||||
|
||||
|
||||
<div class="header-wrap--home js-header-wrap">
|
||||
<div class="header--aside js-header-aside"></div>
|
||||
<div class="js-header-home-search header-wrap--home__search">
|
||||
<div class="logo-wrap--home">
|
||||
<a id="logo_homepage_link" class="logo_homepage" href="/about">
|
||||
About DuckDuckGo
|
||||
<span class="logo_homepage__tt">Duck it!</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
|
||||
<input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
|
||||
<input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
|
||||
<input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
|
||||
<div class="search__hidden js-search-hidden"></div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
<div id="" class="content-wrap--home">
|
||||
<div id="content_homepage" class="content--home">
|
||||
<div class="cw--c">
|
||||
<div class="logo-wrap--home">
|
||||
<a id="logo_homepage_link" class="logo_homepage" href="/about">
|
||||
About DuckDuckGo
|
||||
<span class="logo_homepage__tt">Duck it!</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div class="search-wrap--home">
|
||||
<form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
|
||||
<input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
|
||||
<input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
|
||||
<input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
|
||||
<div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
|
||||
</form>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<!-- en_US All Settings -->
|
||||
<noscript>
|
||||
<div class="tag-home">
|
||||
<div class="tag-home__wrapper">
|
||||
<div class="tag-home__item">
|
||||
The search engine that doesn't track you.
|
||||
<span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</noscript>
|
||||
<div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
|
||||
<div id="error_homepage"></div>
|
||||
|
||||
|
||||
|
||||
|
||||
</div> <!-- cw -->
|
||||
</div> <!-- content_homepage //-->
|
||||
</div> <!-- content_wrapper_homepage //-->
|
||||
<div id="footer_homepage" class="foot-home js-foot-home"></div>
|
||||
|
||||
<script type="text/javascript">
|
||||
{function seterr(str) {
|
||||
var error=document.getElementById('error_homepage');
|
||||
error.innerHTML=str;
|
||||
$(error).css('display','block');
|
||||
}
|
||||
var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' Please try again');};
|
||||
|
||||
if (kurl) {
|
||||
document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
|
||||
}
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
</div> <!-- site-wrapper -->
|
||||
</body>
|
||||
</html>
|
3
test/mocks/duckduckgo/test keyword_page1.html
Normal file
3
test/mocks/duckduckgo/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page2.html
Normal file
3
test/mocks/duckduckgo/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page3.html
Normal file
3
test/mocks/duckduckgo/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
140
test/modules/duckduckgo.js
Normal file
140
test/modules/duckduckgo.js
Normal file
@ -0,0 +1,140 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');
|
||||
|
||||
// Port the plain-HTTP fake search engine server listens on; the two ports below are derived from it.
const httpPort = 3012;
|
||||
// Port the HTTPS fake search engine server listens on.
const httpsPort = httpPort + 1;
|
||||
// Port the intercepting proxy listens on (the browser is launched with this proxy).
const proxyPort = httpPort + 2;
|
||||
|
||||
/**
 * Fake DuckDuckGo search engine served to the browser under test.
 *
 * - `GET /?q=<keyword>&page=<n>` answers with the mock file `<keyword>_page<n>.html`.
 * - `POST /html` always answers with page 1 of the posted keyword `q`.
 * - Anything else falls through to the static middleware serving the mock directory.
 *
 * NOTE: the keyword is used verbatim to build a file name. That is acceptable
 * only because this server is a local test fixture; do not reuse this code
 * with untrusted input.
 */
const mocksDirectory = path.join(__dirname, '../mocks/duckduckgo');

// Absolute path of the mock result page for a keyword and a page number.
const mockPagePath = (keyword, pageNumber) =>
    path.join(mocksDirectory, `${keyword}_page${pageNumber}.html`);

const fakeSearchEngine = express();
fakeSearchEngine.use(express.urlencoded({ extended: true }));
fakeSearchEngine.get('/', (req, res, next) => {
    if (!req.query.q) {
        // No search query: hand over to the next handler (the static middleware below).
        return next();
    }
    debug('q=%s page=%d', req.query.q, req.query.page);
    res.sendFile(mockPagePath(req.query.q, req.query.page));
});
fakeSearchEngine.post('/html', (req, res) => {
    debug('body=%o', req.body);
    res.sendFile(mockPagePath(req.body.q, 1));
});
// Resolve the static root from this file, like the sendFile() calls above, so the
// fixture does not depend on the directory mocha is started from.
fakeSearchEngine.use(express.static(mocksDirectory, { extensions: ['html'] }));
|
||||
|
||||
describe('Module DuckDuckGo', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
before(async function () {
    // Promise wrapper around a node-style `listen(arg, callback)` method.
    const listenAsync = (server, arg) =>
        Promise.promisify(server.listen, { context: server })(arg);

    // The same fake engine backs both the HTTP and the HTTPS listener.
    httpServer = http.createServer(fakeSearchEngine);
    httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

    proxy = Proxy();
    proxy.onRequest((ctx, done) => {
        const upstream = ctx.proxyToServerRequestOptions;
        const incoming = ctx.clientToProxyRequest;

        // Whatever host the browser asked for, send the request to the local fake engine.
        upstream.host = 'localhost';
        upstream.port = ctx.isSSL ? httpsPort : httpPort;
        upstream.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
        debug('proxy askedHost=%s method=%s url=%s toPort=%s',
            incoming.headers.host,
            incoming.method,
            incoming.url,
            upstream.port
        );
        return done();
    });

    await listenAsync(proxy, { port: proxyPort });
    await listenAsync(httpServer, httpPort);
    await listenAsync(httpsServer, httpsPort);
    debug('Fake http search engine servers started');
});
|
||||
|
||||
after(function () {
    // Stop the proxy first, then the HTTPS and HTTP fake engine servers.
    for (const listener of [proxy, httpsServer, httpServer]) {
        listener.close();
    }
});
|
||||
|
||||
let browser;
let page;
beforeEach(async function () {
    // Every test gets its own browser, routed through the local proxy.
    const launchOptions = {
        //dumpio: true,
        //headless: false,
        ignoreHTTPSErrors: true,
        args: [ `--proxy-server=http://localhost:${proxyPort}` ],
    };

    debug('Start a new browser');
    browser = await puppeteer.launch(launchOptions);

    debug('Open a fresh page');
    page = await browser.newPage();
});
|
||||
|
||||
afterEach(function () {
    // Hand the close promise to mocha so the hook completes once the browser is closed.
    return browser.close();
});
|
||||
|
||||
// Logger passed to the scraper under test: console output at the 'error' level.
const errorConsole = new transports.Console({ level: 'error' });
const testLogger = createLogger({ transports: [errorConsole] });
|
||||
|
||||
it('one keyword one page', async function () {
    // Scrape a single keyword with the default number of pages.
    const config = {
        search_engine_name: 'duckduckgo',
        throw_on_detection: true,
        keywords: ['test keyword'],
        logger: testLogger,
        scrape_from_file: '',
    };
    const duckduckgoScraper = new DuckduckgoScraper({ config });
    duckduckgoScraper.STANDARD_TIMEOUT = 1000;

    const { results, num_requests } = await duckduckgoScraper.run({ page });

    assert.strictEqual(num_requests, 1, 'Must do one request');
    const firstPage = results['test keyword']['1'];
    assert.strictEqual(firstPage.results.length, 10, 'Must have 10 organic results parsed');
});
|
||||
|
||||
it('one keyword 3 pages', function () {
    // Three pages are scraped in sequence, so this test sets a 4000 ms mocha timeout.
    this.timeout(4000);
    const duckduckgoScraper = new DuckduckgoScraper({
        config: {
            // Was 'google' (copy-paste); this suite exercises the DuckDuckGo module.
            search_engine_name: 'duckduckgo',
            throw_on_detection: true,
            keywords: ['test keyword'],
            logger: testLogger,
            scrape_from_file: '',
            num_pages: 3,
        }
    });
    duckduckgoScraper.STANDARD_TIMEOUT = 1000;
    return duckduckgoScraper.run({page}).then(({results, num_requests}) => {
        const pages = results['test keyword'];

        assert.strictEqual(num_requests, 3, 'Must do three requests');

        assert.strictEqual(pages['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
        assert.strictEqual(pages['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
        debug('results page 1 %O', pages['1'].results);

        debug('results page 2 %O', pages['2'].results);
        assert.strictEqual(pages['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
        assert.strictEqual(pages['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');

        assert.strictEqual(pages['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
        assert.strictEqual(pages['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
    });
});
|
||||
|
||||
});
|
Loading…
Reference in New Issue
Block a user