// se-scraper/server/server.js

/**
 Test server with:

 curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
     -d '{
           "browser_config": {
               "random_user_agent": true
           },
           "scrape_config": {
               "search_engine": "google",
               "keywords": ["test"],
               "num_pages": 1
           }
         }'
 */
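// To start the server from the repository root: node server/server.js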
'use strict';

const se_scraper = require('../index.js');
const express = require('express');
// Constants
const PORT = process.env.PORT || 3000;
const HOST = process.env.HOST || '0.0.0.0';
// App
const app = express();
app.use(express.json());
let browser_config = {
    random_user_agent: true,
    headless: true,
    debug_level: 1,
    sleep_range: '',
    puppeteer_cluster_config: {
        timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
        monitor: false,
        concurrency: 1, // one scraper per tab
        maxConcurrency: 1, // scrape with a single tab
    }
};
app.post('/', async (req, res) => {
    if (!req.body.browser_config || !req.body.scrape_config) {
        res.status(400).json({
            'status': 400,
            'msg': 'please specify browser_config and scrape_config'
        });
    } else {
        // merge the request's browser config over the defaults without
        // mutating the shared defaults across requests
        let config = Object.assign({}, browser_config, req.body.browser_config);
        let scraper = new se_scraper.ScrapeManager(config);
        await scraper.start();
        try {
            let results = await scraper.scrape(req.body.scrape_config);
            // console.dir(results, {depth: null, colors: true});
            res.send(results);
        } catch (err) {
            res.status(500).json({'status': 500, 'msg': err.toString()});
        } finally {
            await scraper.quit();
        }
    }
});
app.listen(PORT, HOST);
console.log(`Running on http://${HOST}:${PORT}`);
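
/*
 Example client (a sketch added for illustration, not part of the upstream file):
 it POSTs the same payload as the curl example above using Node's built-in http
 module and prints the parsed JSON response.

 const http = require('http');

 const payload = JSON.stringify({
     browser_config: { random_user_agent: true },
     scrape_config: { search_engine: 'google', keywords: ['test'], num_pages: 1 }
 });

 const request = http.request({
     host: 'localhost', // the server binds to 0.0.0.0, so localhost reaches it
     port: 3000,
     method: 'POST',
     headers: {
         'Content-Type': 'application/json',
         'Content-Length': Buffer.byteLength(payload)
     }
 }, (response) => {
     let body = '';
     response.on('data', (chunk) => body += chunk);
     response.on('end', () => console.log(JSON.parse(body)));
 });

 request.write(payload);
 request.end();
*/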