2019-08-13 17:35:06 +02:00
|
|
|
/**
|
|
|
|
|
|
|
|
Test server with:
|
|
|
|
|
|
|
|
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
|
|
|
|
-d '{
|
|
|
|
"browser_config": {
|
|
|
|
"random_user_agent": true
|
|
|
|
},
|
|
|
|
"scrape_config": {
|
|
|
|
"search_engine": "google",
|
|
|
|
"keywords": ["test"],
|
|
|
|
"num_pages": 1
|
|
|
|
}
|
|
|
|
}'
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
const se_scraper = require('../index.js');
|
|
|
|
'use strict';
|
|
|
|
const express = require('express');
|
|
|
|
|
|
|
|
// Constants
|
|
|
|
const PORT = process.env.PORT || 3000;
|
|
|
|
const HOST = process.env.HOST || '0.0.0.0';
|
|
|
|
|
|
|
|
// App
|
|
|
|
const app = express();
|
|
|
|
app.use(express.json());
|
|
|
|
|
|
|
|
let browser_config = {
|
|
|
|
random_user_agent: true,
|
|
|
|
headless : true,
|
|
|
|
debug_level: 1,
|
|
|
|
sleep_range: '',
|
|
|
|
puppeteer_cluster_config: {
|
|
|
|
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
|
|
|
monitor: false,
|
|
|
|
concurrency: 1, // one scraper per tab
|
|
|
|
maxConcurrency: 1, // scrape with 5 tabs
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
app.post('/', async (req, res) => {
|
|
|
|
if (!req.body.browser_config || !req.body.scrape_config) {
|
|
|
|
res.json({
|
|
|
|
'status': 400,
|
|
|
|
'msg': 'please specify browser_config and scrape_config'
|
|
|
|
});
|
|
|
|
} else {
|
|
|
|
// overwrite standard browser config
|
|
|
|
Object.assign(browser_config, req.body.browser_config);
|
|
|
|
|
|
|
|
var scraper = new se_scraper.ScrapeManager(browser_config);
|
|
|
|
await scraper.start();
|
|
|
|
var results = await scraper.scrape(req.body.scrape_config);
|
|
|
|
// console.dir(results, {depth: null, colors: true});
|
2019-08-16 20:10:40 +02:00
|
|
|
await scraper.quit();
|
2019-08-13 17:35:06 +02:00
|
|
|
|
|
|
|
res.send(results);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
app.listen(PORT, HOST);
|
|
|
|
|
|
|
|
console.log(`Running on http://${HOST}:${PORT}`);
|