se-scraper/examples/pluggable.js

73 lines
2.0 KiB
JavaScript
Raw Normal View History

2019-01-27 19:08:07 +01:00
module.exports = class Pluggable {
constructor(options = {}) {
const {
chromeFlags = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--user-agent=Chrome',
],
userAgent = 'Chrome',
headless = true,
} = options;
this.chromeFlags = chromeFlags;
this.userAgent = userAgent;
this.headless = headless;
this.chromeFlags.push(this.userAgent);
}
async close_browser() {
await this.browser.close();
}
// Callback invoked after metadata has been gathered
2019-01-27 19:08:07 +01:00
async handle_metadata(args) {
2019-01-27 20:08:09 +01:00
// store scraping metadata somewhere
}
// Callback invoked after all keywords have been scraped
2019-01-27 20:08:09 +01:00
async handle_results(args) {
// store the results somewhere
2019-01-27 19:08:07 +01:00
}
// Callback invoked before a keyword is scraped.
async before_keyword_scraped(args) {
console.log('before keyword scraped.');
}
// Callback invoked after a keyword has been scraped.
// TODO: implement this
async after_keyword_scraped(args) {
console.log('after keyword scraped.')
}
2019-01-27 19:08:07 +01:00
async start_browser(args={}) {
const puppeteer = require('puppeteer');
let launch_args = {
args: args.chromeFlags || this.chromeFlags,
2019-01-27 20:08:09 +01:00
headless: args.headless,
2019-01-27 19:08:07 +01:00
};
2019-01-27 20:08:09 +01:00
if (launch_args.headless === undefined) {
launch_args.headless = this.headless;
}
2019-01-27 19:08:07 +01:00
this.browser = await puppeteer.launch(launch_args);
console.log('Loaded custom function get_browser()');
2019-01-27 20:08:09 +01:00
console.log(launch_args);
2019-01-27 19:08:07 +01:00
return this.browser;
}
2019-03-02 22:32:26 +01:00
async do_work(page) {
// do some scraping work and return results and num_requests
}
2019-01-27 19:08:07 +01:00
};