forked from extern/se-scraper
test: add user_agent tests, add html_output tests
parent cac6b87e92
commit 394b567db6
package-lock.json (generated, 6 lines changed)
@@ -2197,6 +2197,12 @@
       "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
       "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
     },
+    "ua-parser-js": {
+      "version": "0.7.21",
+      "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.21.tgz",
+      "integrity": "sha512-+O8/qh/Qj8CgC6eYBVBykMrNtp5Gebn4dlGD/kKXVkJNDwyrAwSIqwz8CDf+tsAIWVycKcku6gIXJ0qwx/ZXaQ==",
+      "dev": true
+    },
     "ultron": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz",
package.json

@@ -39,6 +39,7 @@
     "express": "^4.17.1",
     "http-mitm-proxy": "^0.8.2",
     "key-cert": "^1.0.1",
-    "mocha": "^6.1.4"
+    "mocha": "^6.1.4",
+    "ua-parser-js": "^0.7.21"
   }
 }
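For context, ua-parser-js is the new dev dependency the user_agent test below uses to check that generated User-Agent strings look plausible. A minimal sketch of the 0.7.x API as that test exercises it; called as a plain function it returns a parsed result object (the sample UA string here is illustrative):

const UAParser = require('ua-parser-js');

// Called without `new`, UAParser parses the string and returns the result directly.
const ua = UAParser('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36');
console.log(ua.browser.name); // 'Chrome'
console.log(ua.os.name);      // 'Windows'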
test/html_output.js (new file, 96 lines)
@@ -0,0 +1,96 @@
'use strict';

const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
// Fall back to static mock pages for any other path
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // Rewrite every proxied request so it hits the local mock servers
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('html_output', function(){

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the html_output option
         */
        it('html_output single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                keywords: ['test keyword'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                html_output: true,
                //clean_html_output: false,
                //clean_data_images: false,
            });
            await scraper.start();
            const { results } = await scraper.scrape(scrape_job);
            await scraper.quit();

            assert(results['test keyword']['1'].html.length > 1000, 'HTML of google page 1 should be provided');
        });

    });

});
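Aside: a minimal sketch of how the html_output option exercised by this test might be consumed in an ordinary scrape. Option and result names mirror the test above; this is an assumption-laden sketch, not documented API:

const se_scraper = require('se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        html_output: true, // attach the raw page HTML to each result
    });
    await scraper.start();
    const { results } = await scraper.scrape({
        search_engine: 'google',
        keywords: ['test keyword'],
    });
    // As asserted in the test, each page result now carries an html field:
    // results['test keyword']['1'].html
    await scraper.quit();
})();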
test/user_agent.js (new file, 144 lines)
@@ -0,0 +1,144 @@
'use strict';

const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

// Fake search engine that echoes back the User-Agent header it receives
const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-user_agent', (req, res) => {
    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
    res.send(req.headers['user-agent']);
});

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        // Rewrite every proxied request so it hits the local mock servers
        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('user_agent', function(){

        // Minimal Scraper subclass: visits the echo endpoint and returns
        // the User-Agent string the fake engine saw
        class MockScraperTestUserAgent extends Scraper {

            async load_start_page(){
                return true;
            }

            async search_keyword(){
                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
            }

            async parse_async(){
                const bodyHandle = await this.page.$('body');
                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
            }
        }

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the user_agent option
         */
        it('fixed user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['javascript is hard'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                user_agent: 'THIS IS A USERAGENT 42.0'
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');

            await scraper.quit();
        });

        /**
         * Test the random_user_agent option
         * TODO: the generated user_agent should be different for each keyword
         * TODO: this test will sometimes fail because the user_agent is not very random :-(
         */
        it('random_user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['news'],
            };

            const NUMBER_OF_EXEC = 10;

            // Run NUMBER_OF_EXEC independent scrapes and collect the user agent each one used
            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
                const scraper = new se_scraper.ScrapeManager({
                    throw_on_detection: true,
                    logger: testLogger,
                    random_user_agent: true,
                });
                await scraper.start();
                const { results: { news } } = await scraper.scrape(scrape_job);
                await scraper.quit();
                return news['1'];
            });

            // Every user agent must parse as a plausible browser/OS combination
            uaList.forEach((userAgent) => {
                const uaParsed = UAParser(userAgent);
                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
                assert(uaParsed.os.name, 'UserAgent should have an OS name detected');
            });

            // ...and no single user agent may dominate the sample
            assert(_.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.3), 'Each user agent should appear less than 30% of the time');
        });

    });

});
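The lodash chain in that final assertion is dense; here is a small worked example of what it computes (sample data invented for illustration):

const _ = require('lodash');

const uaList = ['A', 'B', 'A', 'C', 'A', 'B', 'D', 'E', 'F', 'G'];

// countBy()          -> { A: 3, B: 2, C: 1, D: 1, E: 1, F: 1, G: 1 }
// toPairs()          -> [['A', 3], ['B', 2], ['C', 1], ...]
// sortBy(e => e[1])  -> ascending by count, so last() is the most frequent pair
const mostFrequent = _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value();
console.log(mostFrequent); // ['A', 3]

// The test asserts mostFrequent[1] < NUMBER_OF_EXEC * 0.3: with 10 runs,
// no single user agent may appear 3 or more times.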