Mirror of https://github.com/NikolaiT/se-scraper.git (synced 2025-06-20 09:38:06 +02:00)

Commit 8e695626b6: added pluggable functionality
Parent: e78d7145b5

README.md (43 changed lines)
@@ -2,7 +2,7 @@

 This node module supports scraping several search engines.

-Right now scraping for
+Right now scraping the search engines

 * Google
 * Google News
@@ -17,6 +17,13 @@ Right now scraping for

 is supported.

+Additionally **se-scraper** supports investment ticker search from the following sites:
+
+* Bloomberg
+* Reuters
+* cnbc
+* Marketwatch
+
 This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.

 ### Technical Notes
@@ -38,8 +45,9 @@ npm install se-scraper

 Use se-scraper by calling it with a script such as the one below.

-```javascript
+```js
 const se_scraper = require('se-scraper');
+const resolve = require('path').resolve;

 let config = {
     // the user agent to scrape with
@@ -47,27 +55,35 @@ let config = {
     // if random_user_agent is set to True, a random user agent is chosen
     random_user_agent: false,
     // get meta data of scraping in return object
-    write_meta_data: true,
+    write_meta_data: false,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
-    sleep_range: '[1,1]',
+    sleep_range: '',
     // which search engine to scrape
-    search_engine: 'yahoo_news',
+    search_engine: 'google',
     // whether debug information should be printed
-    debug: true,
+    // debug info is useful for developers when debugging
+    debug: false,
     // whether verbose program output should be printed
+    // this output is informational
     verbose: false,
     // an array of keywords to scrape
-    keywords: ['GOOGL', ],
+    keywords: ['scrapeulous.com', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
-    keyword_file: './keywords.txt',
+    keyword_file: '',
     // whether to start the browser in headless mode
-    headless: false,
+    headless: true,
     // path to output file, data will be stored in JSON
-    output_file: 'results.json',
+    output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
     // will speed up scraping a great deal
-    block_assets: true
+    block_assets: true,
+    // path to js module that extends functionality
+    // this module should export the functions:
+    // get_browser, handle_metadata, close_browser
+    // must be an absolute path to the module
+    //custom_func: resolve('examples/pluggable.js'),
+    custom_func: '',
 };

 se_scraper.scrape(config, (err, response) => {
@@ -100,6 +116,11 @@ Supported options for the `search_engine` config key:
 'duckduckgo_news'
 'google_dr'
 'yahoo_news'
+// ticker search
+'bloomberg'
+'reuters'
+'cnbc'
+'marketwatch'
 ```

 Output for the above script on my laptop:
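For orientation, the new `custom_func` key added above points se-scraper at a user-supplied module that exports `get_browser`, `handle_metadata` and `close_browser`. A minimal sketch of enabling it from a calling script might look like this (the module path `./my_pluggable.js` is a hypothetical example; per the README comment the path should be absolute, hence `resolve()`):

```js
const se_scraper = require('se-scraper');
const resolve = require('path').resolve;

let config = {
    search_engine: 'google',
    keywords: ['scrapeulous.com'],
    // hypothetical pluggable module of your own; the README notes the path must be absolute
    custom_func: resolve('./my_pluggable.js'),
};

se_scraper.scrape(config, (err, response) => {
    if (err) {
        return console.error(err);
    }
    console.dir(response.results, { depth: null, colors: true });
});
```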
TODO.txt (1 changed line)

@@ -12,6 +12,7 @@
 - Add functionality to block images and CSS from loading as described here:

 https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
+https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/

 TODO:
 - add proxy support
data.json (new file, 1 line)

@@ -0,0 +1 @@
+{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 14:51:54 GMT","num_results":"Ungefähr 169 Ergebnisse (0,23 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":7},{"link":"https://www.youtube.com/channel/UCJs1Xei5LRefg9GwFYdYhOw","title":"Scrapeulous Scrapeulous - YouTube","snippet":"How to use scrapeulous.com - Duration: 3 minutes, 42 seconds. 32 minutes ago; 4 views. Introduction for https://scrapeulous.com. Show more. This item has ...","visible_link":"https://www.youtube.com/.../UCJs1Xei5LRefg9GwFYdYhOw","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docs","snippet":"23.12.2018 - 1.1 Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open source tool in the future. Some people ...","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9}]}}
examples/pluggable.js (new file, 39 lines)

@@ -0,0 +1,39 @@
+module.exports = {
+    get_browser: get_browser,
+    handle_metadata: handle_metadata,
+    close_browser: close_browser
+};
+
+async function close_browser(browser) {
+    await browser.close();
+}
+
+async function handle_metadata() {
+    // silence
+}
+
+async function get_browser(launch_args) {
+    const puppeteer = require('puppeteer');
+
+    const ADDITIONAL_CHROME_FLAGS = [
+        '--no-sandbox',
+        '--disable-setuid-sandbox',
+        '--disable-dev-shm-usage',
+        '--disable-accelerated-2d-canvas',
+        '--disable-gpu',
+        '--window-size=1920x1080',
+        '--hide-scrollbars',
+        '--user-agent=Chrome',
+    ];
+
+    let custom_args = {
+        args: ADDITIONAL_CHROME_FLAGS,
+        headless: true,
+    };
+
+    browser = await puppeteer.launch(launch_args);
+
+    console.log('Loaded custom function get_browser()');
+
+    return browser;
+}
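One detail worth noting: in the committed example the `custom_args` object is built but never used, since `puppeteer.launch(launch_args)` launches with whatever arguments the scraper passes in. A sketch of a pluggable module that actually merges its own Chrome flags into the incoming `launch_args` (the merging behaviour is an assumption, not something this commit does) could look roughly like this:

```js
// my_pluggable.js: hypothetical variant that merges extra flags into launch_args
const puppeteer = require('puppeteer');

async function get_browser(launch_args = {}) {
    const extra_flags = ['--no-sandbox', '--window-size=1920,1080'];
    const merged = Object.assign({}, launch_args, {
        args: (launch_args.args || []).concat(extra_flags),
    });
    // launch Chrome with the combined arguments instead of ignoring them
    return await puppeteer.launch(merged);
}

async function handle_metadata(metadata) {
    // the committed example stays silent here; persist or log the metadata as needed
    console.log('scrape metadata:', metadata);
}

async function close_browser(browser) {
    await browser.close();
}

module.exports = { get_browser, handle_metadata, close_browser };
```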
index.js (6 changed lines)

@@ -29,7 +29,11 @@ exports.scrape = async function(config, callback) {
         output_file: '',
         // whether to prevent images, css, fonts from being loaded
         // will speed up scraping a great deal
-        block_assets: true
+        block_assets: true,
+        // path to js module that extends functionality
+        // this module should export the functions:
+        // get_browser, handle_metadata, close_browser
+        custom_func: 'examples/pluggable.js',
     };

     for (var key in config) {
package.json

@@ -1,7 +1,8 @@
 {
   "name": "se-scraper",
-  "version": "1.1.0",
+  "version": "1.1.1",
   "description": "A simple module which uses puppeteer to scrape several search engines.",
+  "homepage": "https://scrapeulous.com/",
   "main": "index.js",
   "scripts": {
     "test": "mocha"
run.js (21 changed lines)

@@ -1,4 +1,5 @@
 const se_scraper = require('./index.js');
+const resolve = require('path').resolve;

 let config = {
     // the user agent to scrape with
@@ -9,24 +10,32 @@ let config = {
     write_meta_data: false,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
-    sleep_range: '[1,1]',
+    sleep_range: '',
     // which search engine to scrape
-    search_engine: 'bing',
+    search_engine: 'google',
     // whether debug information should be printed
-    debug: true,
+    // debug info is useful for developers when debugging
+    debug: false,
     // whether verbose program output should be printed
+    // this output is informational
     verbose: false,
     // an array of keywords to scrape
-    keywords: ['MSFT', ],
+    keywords: ['scrapeulous.com', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // whether to start the browser in headless mode
-    headless: false,
+    headless: true,
     // path to output file, data will be stored in JSON
     output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
     // will speed up scraping a great deal
-    block_assets: true
+    block_assets: true,
+    // path to js module that extends functionality
+    // this module should export the functions:
+    // get_browser, handle_metadata, close_browser
+    // must be an absolute path to the module
+    //custom_func: resolve('examples/pluggable.js'),
+    custom_func: '',
 };

 se_scraper.scrape(config, (err, response) => {
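For reference, the committed data.json above and the test assertions further down show the shape of what `se_scraper.scrape()` hands back: `response.results` is keyed by keyword, and each entry carries `time`, `num_results`, `no_results`, `effective_query` and a `results` array of ranked links. A small sketch of walking that structure from the callback in run.js:

```js
// sketch: iterating the response produced by se_scraper.scrape() with the config above
se_scraper.scrape(config, (err, response) => {
    if (err) {
        return console.error(err);
    }
    for (let keyword in response.results) {
        const entry = response.results[keyword];
        console.log(`${keyword}: ${entry.num_results} (scraped at ${entry.time})`);
        for (let res of entry.results) {
            // each result carries link, title, snippet, visible_link, date and rank
            console.log(`  #${res.rank} ${res.title} -> ${res.link}`);
        }
    }
});
```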
functions.js (shared helper module)

@@ -4,6 +4,7 @@ module.exports = {
     sleep: sleep,
     random_sleep: random_sleep,
     set_input_value: set_input_value,
+
 };

 async function set_input_value(page, selector, value) {
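The body of `set_input_value` is not part of this diff. Judging from the inline `setTextInputValue` helper that the Google module below drops in favour of `sfunctions.set_input_value`, the shared helper presumably does something along these lines (a sketch, not the committed code):

```js
// sketch of the shared helper, modeled on the removed setTextInputValue shown below
async function set_input_value(page, selector, value) {
    await page.waitFor(selector);
    await page.evaluate((value, selector) => {
        return document.querySelector(selector).value = value;
    }, value, selector);
}
```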
(Google scraping module)

@@ -1,13 +1,6 @@
 const cheerio = require('cheerio');
 const sfunctions = require('./functions.js');

-/*
-Scrape for dateranges:
-
-https://www.google.com/search?lr=&hl=en&tbs=cdr:1,cd_min:1/1/2007,cd_max:1/1/2009&q=%22video+game%22+%22Catan%22&oq=%22video+game%22+%22Catan%22
-
-*/
-
 module.exports = {
     scrape_google_news_old_pup: scrape_google_news_old_pup,
     scrape_google_pup: scrape_google_pup,
@@ -19,13 +12,6 @@ module.exports = {
 const STANDARD_TIMEOUT = 8000;
 const SOLVE_CAPTCHA_TIME = 45000;

-const setTextInputValue = async (page, selector, value) => {
-    await page.waitFor(selector);
-    await page.evaluate((value, selector) => {
-        return document.querySelector(selector).value = value;
-    }, value, selector);
-};
-
 async function scrape_google_pup(page, event, context) {
     await page.goto('https://www.google.com/');

@@ -51,7 +37,7 @@ async function scrape_google_pup(page, event, context) {
     // await input.click({ clickCount: 3 });
     // await sfunctions.sleep(50);
     //await input.type(keyword);
-    await setTextInputValue(page, `input[name="q"]`, keyword);
+    await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
     await sfunctions.sleep(50);
     await input.focus();
     await page.keyboard.press("Enter");
@@ -130,7 +116,7 @@ async function scrape_google_pup_dr(page, event, context) {
     // await input.click({ clickCount: 3 });
     // await sfunctions.sleep(50);
     // await input.type(keyword);
-    await setTextInputValue(page, `input[name="q"]`, keyword);
+    await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
     await sfunctions.sleep(50);

     await input.focus();
@@ -252,7 +238,7 @@ async function scrape_google_news_old_pup(page, event, context) {
     // overwrites last text in input
     // await input.click({ clickCount: 3 });
     // await input.type(keyword);
-    await setTextInputValue(page, `input[name="q"]`, keyword);
+    await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
     await sfunctions.sleep(50);
     await input.focus();
     await page.keyboard.press("Enter");
@@ -367,7 +353,7 @@ async function scrape_google_image_pup(page, event, context) {
     // overwrites last text in input
     // await input.click({ clickCount: 3 });
     // await input.type(keyword);
-    await setTextInputValue(page, `input[name="q"]`, keyword);
+    await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
     await sfunctions.sleep(50);

     await input.focus();
(scraper handler module)

@@ -13,7 +13,6 @@ const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
 const tickersearch = require('./modules/ticker_search.js');

-
 function write_results(fname, data) {
     fs.writeFileSync(fname, data, (err) => {
         if (err) throw err;
@@ -21,14 +20,23 @@ function write_results(fname, data) {
     });
 }

-module.exports.handler = async function handler (event, context, callback) {
+module.exports.handler = async function handler (config, context, callback) {

+    custom_func = null;
+    if (config.custom_func && fs.existsSync(config.custom_func)) {
+        try {
+            custom_func = require(config.custom_func);
+        } catch (exception) {
+
+        }
+    }
+
     try {
         const startTime = Date.now();

-        event = parseEventData(event);
-        if (event.debug === true) {
-            console.log(event);
+        config = parseEventData(config);
+        if (config.debug === true) {
+            console.log(config);
         }

         const ADDITIONAL_CHROME_FLAGS = [
@@ -44,12 +52,12 @@ module.exports.handler = async function handler (event, context, callback) {

         let USER_AGENT = '';

-        if (event.random_user_agent) {
+        if (config.random_user_agent) {
             USER_AGENT = ua.random_user_agent();
         }

-        if (event.user_agent) {
-            USER_AGENT = event.user_agent;
+        if (config.user_agent) {
+            USER_AGENT = config.user_agent;
         }

         if (USER_AGENT) {
@@ -58,23 +66,29 @@ module.exports.handler = async function handler (event, context, callback) {
             )
         }

-        if (event.debug === true) {
-            console.log("Chrome Flags: ", ADDITIONAL_CHROME_FLAGS);
-        }
-
-        browser = await puppeteer.launch({
+        let launch_args = {
             args: ADDITIONAL_CHROME_FLAGS,
-            headless: event.headless !== false,
-        });
+            headless: config.headless !== false,
+        };

-        if (event.log_http_headers === true) {
+        if (config.debug === true) {
+            console.log("Chrome Args: ", launch_args);
+        }
+
+        if (custom_func) {
+            browser = await custom_func.get_browser(launch_args);
+        } else {
+            browser = await puppeteer.launch(launch_args);
+        }
+
+        if (config.log_http_headers === true) {
             headers = await meta.get_http_headers(browser);
             console.dir(headers);
         }

         const page = await browser.newPage();

-        if (event.block_assets === true) {
+        if (config.block_assets === true) {
             await page.setRequestInterception(true);

             page.on('request', (req) => {
@@ -104,45 +118,53 @@ module.exports.handler = async function handler (event, context, callback) {
             reuters: tickersearch.scrape_reuters_finance_pup,
             cnbc: tickersearch.scrape_cnbc_finance_pup,
             marketwatch: tickersearch.scrape_marketwatch_finance_pup,
-        }[event.search_engine](page, event, context);
+        }[config.search_engine](page, config, context);

         let metadata = {};

-        if (event.write_meta_data === true) {
+        if (config.write_meta_data === true) {
             metadata = await meta.get_metadata(browser);
         }

-        await browser.close();
+        if (custom_func) {
+            await custom_func.close_browser(browser);
+        } else {
+            await browser.close();
+        }

-        let num_keywords = event.keywords.length || 0;
+        let num_keywords = config.keywords.length || 0;
         let timeDelta = Date.now() - startTime;
         let ms_per_keyword = timeDelta/num_keywords;
-        console.log(`Scraper took ${timeDelta}ms to scrape ${num_keywords} keywords.`);
-        console.log(`On average ms/keyword: ${ms_per_keyword}ms/keyword`);

-        if (event.verbose === true) {
+        if (config.verbose === true) {
+            console.log(`Scraper took ${timeDelta}ms to scrape ${num_keywords} keywords.`);
+            console.log(`On average ms/keyword: ${ms_per_keyword}ms/keyword`);
             console.dir(results, {depth: null, colors: true});
         }

-        if (event.compress === true) {
+        if (config.compress === true) {
             results = JSON.stringify(results);
             // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
             results = zlib.deflateSync(results).toString('base64');
         }

-        if (event.write_meta_data === true) {
-            metadata.id = `${event.job_name} ${event.chunk_lines}`;
-            metadata.chunk_lines = event.chunk_lines;
+        if (config.write_meta_data === true) {
+            metadata.id = `${config.job_name} ${config.chunk_lines}`;
+            metadata.chunk_lines = config.chunk_lines;
             metadata.elapsed_time = timeDelta.toString();
             metadata.ms_per_keyword = ms_per_keyword.toString();

-            if (event.verbose === true) {
+            if (config.verbose === true) {
                 console.log(metadata);
             }
+
+            if (custom_func) {
+                await custom_func.handle_metadata(metadata);
+            }
         }

-        if (event.output_file) {
-            write_results(event.output_file, JSON.stringify(results));
+        if (config.output_file) {
+            write_results(config.output_file, JSON.stringify(results));
         }

         let response = {
@@ -161,7 +183,7 @@ module.exports.handler = async function handler (event, context, callback) {
     }
 };

-function parseEventData(event) {
+function parseEventData(config) {

     function _bool(e) {
         e = String(e);
@@ -172,54 +194,54 @@ function parseEventData(event) {
         }
     }

-    if (event.debug) {
-        event.debug = _bool(event.debug);
+    if (config.debug) {
+        config.debug = _bool(config.debug);
     }

-    if (event.verbose) {
-        event.verbose = _bool(event.verbose);
+    if (config.verbose) {
+        config.verbose = _bool(config.verbose);
     }

-    if (event.upload_to_s3) {
-        event.upload_to_s3 = _bool(event.upload_to_s3);
+    if (config.upload_to_s3) {
+        config.upload_to_s3 = _bool(config.upload_to_s3);
     }

-    if (event.write_meta_data) {
-        event.write_meta_data = _bool(event.write_meta_data);
+    if (config.write_meta_data) {
+        config.write_meta_data = _bool(config.write_meta_data);
     }

-    if (event.log_http_headers) {
-        event.log_http_headers = _bool(event.log_http_headers);
+    if (config.log_http_headers) {
+        config.log_http_headers = _bool(config.log_http_headers);
     }

-    if (event.compress) {
-        event.compress = _bool(event.compress);
+    if (config.compress) {
+        config.compress = _bool(config.compress);
     }

-    if (event.is_local) {
-        event.is_local = _bool(event.is_local);
+    if (config.is_local) {
+        config.is_local = _bool(config.is_local);
     }

-    if (event.max_results) {
-        event.max_results = parseInt(event.max_results);
+    if (config.max_results) {
+        config.max_results = parseInt(config.max_results);
     }

-    if (event.set_manual_settings) {
-        event.set_manual_settings = _bool(event.set_manual_settings);
+    if (config.set_manual_settings) {
+        config.set_manual_settings = _bool(config.set_manual_settings);
     }

-    if (event.block_assets) {
-        event.block_assets = _bool(event.block_assets);
+    if (config.block_assets) {
+        config.block_assets = _bool(config.block_assets);
     }

-    if (event.sleep_range) {
+    if (config.sleep_range) {
         // parse an array
-        event.sleep_range = eval(event.sleep_range);
+        config.sleep_range = eval(config.sleep_range);

-        if (event.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
+        if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') {
             throw "sleep_range is not a valid array of two integers.";
         }
     }

-    return event;
+    return config;
 }
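Taken together, the handler changes above hook the pluggable module into three points of the scrape lifecycle: browser creation, metadata hand-off and shutdown. A condensed, self-contained paraphrase of that flow (the function name `scrapeWithHooks` and the `runScrape` callback are illustrative, not part of the commit):

```js
const fs = require('fs');
const puppeteer = require('puppeteer');

// condensed paraphrase of the hook points the commit adds to the handler
async function scrapeWithHooks(config, launch_args, runScrape) {
    let custom_func = null;
    if (config.custom_func && fs.existsSync(config.custom_func)) {
        try { custom_func = require(config.custom_func); } catch (e) { /* swallowed, as in the commit */ }
    }

    // 1. browser creation goes through the hook when one is loaded
    const browser = custom_func
        ? await custom_func.get_browser(launch_args)
        : await puppeteer.launch(launch_args);

    // the actual per-engine scraping happens here
    const { results, metadata } = await runScrape(browser);

    // 2. shutdown is delegated the same way
    if (custom_func) {
        await custom_func.close_browser(browser);
    } else {
        await browser.close();
    }

    // 3. metadata is handed to the hook when write_meta_data is enabled
    if (config.write_meta_data === true && custom_func) {
        await custom_func.handle_metadata(metadata);
    }

    return results;
}
```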
(test file)

@@ -32,7 +32,6 @@ async function tests() {
         console.log(`Testing ${se}...`);
         event.search_engine = se;
         await handler.handler(event, undefined, test_case);
-        await sleep(3000);
     }
 }

@@ -88,26 +87,19 @@ function test_case(err, response) {
     if (err) {
         console.error(err);
     } else {

         assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
         assert.equal(response.statusCode, 200, 'status code must be 200');

-        results = response.results;
-
-        for (kw in results) {
+        for (key in response.results) {
+            kw = response.results[key];
             // at least 6 results
-            assert.isAtLeast(results[kw].results.length, 6, 'results must have at least 6 links');
-
-            assert.equal(results[kw].no_results, false, 'no results should be false');
-
-            assert.typeOf(results[kw].num_results, 'string', 'num_results must be a string');
-            assert.isAtLeast(results[kw].num_results.length, 5, 'num_results should be a string of at least 5 chars');
-
-            assert.typeOf(Date.parse(results[kw].time), 'number', 'time should be a valid date');
-
-            for (k = 0; k < results[kw].results.length; k++) {
-                res = results[kw].results[k];
+            assert.isAtLeast(kw.results.length, 6, 'results must have at least 6 links');
+            assert.equal(kw.no_results, false, 'no results should be false');
+            assert.typeOf(kw.num_results, 'string', 'num_results must be a string');
+            assert.isAtLeast(kw.num_results.length, 5, 'num_results should be a string of at least 5 chars');
+            assert.typeOf(Date.parse(kw.time), 'number', 'time should be a valid date');
+            for (let res of kw.results) {
                 assert.isOk(res.link, 'link must be ok');
                 assert.typeOf(res.link, 'string', 'link must be string');
                 assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');