From e78d7145b585d5bdd7211b2f3ccd2478ec1898c9 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Sun, 27 Jan 2019 01:27:52 +0100 Subject: [PATCH] faster scraping, added ticker search engines --- README.md | 3 + TODO.txt | 6 ++ index.js | 3 + package.json | 2 +- run.js | 7 +- src/modules/baidu.js | 3 +- src/modules/bing.js | 6 +- src/modules/duckduckgo.js | 3 +- src/modules/google.js | 21 +--- src/modules/infospace.js | 6 +- src/modules/ticker_search.js | 181 ++++++++++++++++++++++++++++++++++- src/modules/youtube.js | 3 +- src/node_scraper.js | 63 ++++++------ 13 files changed, 244 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 6fd2047..10fde80 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,9 @@ let config = { headless: false, // path to output file, data will be stored in JSON output_file: 'results.json', + // whether to prevent images, css, fonts from being loaded + // will speed up scraping a great deal + block_assets: true }; se_scraper.scrape(config, (err, response) => { diff --git a/TODO.txt b/TODO.txt index 5e0302b..55661b0 100644 --- a/TODO.txt +++ b/TODO.txt @@ -7,6 +7,12 @@ - fix issue #3: add functionality to add keyword file +27.1.2019 + + - Add functionality to block images and CSS from loading as described here: + + https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/ + TODO: - add proxy support - add captcha service solving support diff --git a/index.js b/index.js index 920c5e9..c881bc3 100644 --- a/index.js +++ b/index.js @@ -27,6 +27,9 @@ exports.scrape = async function(config, callback) { headless: true, // path to output file, data will be stored in JSON output_file: '', + // whether to prevent images, css, fonts from being loaded + // will speed up scraping a great deal + block_assets: true }; for (var key in config) { diff --git a/package.json b/package.json index 44342c7..45671ee 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.6", 
+ "version": "1.1.0", "description": "A simple module which uses puppeteer to scrape several search engines.", "main": "index.js", "scripts": { diff --git a/run.js b/run.js index 5a63796..140fca4 100644 --- a/run.js +++ b/run.js @@ -11,19 +11,22 @@ let config = { // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'google', + search_engine: 'bing', // whether debug information should be printed debug: true, // whether verbose program output should be printed verbose: false, // an array of keywords to scrape - keywords: ['scrapeulous.com', ], + keywords: ['MSFT', ], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: '', // whether to start the browser in headless mode headless: false, // path to output file, data will be stored in JSON output_file: 'data.json', + // whether to prevent images, css, fonts from being loaded + // will speed up scraping a great deal + block_assets: true }; se_scraper.scrape(config, (err, response) => { diff --git a/src/modules/baidu.js b/src/modules/baidu.js index 4d6a389..9c2fcfa 100644 --- a/src/modules/baidu.js +++ b/src/modules/baidu.js @@ -5,8 +5,7 @@ module.exports = { scrape_baidu_pup: scrape_baidu_pup, }; -async function scrape_baidu_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_baidu_pup(page, event, context) { await page.goto('https://www.baidu.com/'); try { diff --git a/src/modules/bing.js b/src/modules/bing.js index dc867c3..c250d71 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -6,8 +6,7 @@ module.exports = { scrape_bing_news_pup: scrape_bing_news_pup, }; -async function scrape_bing_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_bing_pup(page, event, context) { await page.goto('https://www.bing.com/'); try { @@ -91,8 +90,7 @@ function parse(html) { } } -async function 
scrape_bing_news_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_bing_news_pup(page, event, context) { await page.goto('https://www.bing.com/news/search?'); if (event.set_manual_settings === true) { diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index be067a7..1713b56 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -5,8 +5,7 @@ module.exports = { scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup, }; -async function scrape_duckduckgo_news_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_duckduckgo_news_pup(page, event, context) { await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news'); try { diff --git a/src/modules/google.js b/src/modules/google.js index d770368..99b97a7 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -26,9 +26,7 @@ const setTextInputValue = async (page, selector, value) => { }, value, selector); }; - -async function scrape_google_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_google_pup(page, event, context) { await page.goto('https://www.google.com/'); try { @@ -92,14 +90,12 @@ async function scrape_google_pup(browser, event, context) { let html = await page.content(); results[keyword] = parse_google_results(html); - } return results; } -async function scrape_google_pup_dr(browser, event, context) { - const page = await browser.newPage(); +async function scrape_google_pup_dr(page, event, context) { let keywords = event.keywords; first = keywords[0]; var year = first.slice(-5); @@ -235,10 +231,7 @@ async function scraping_detected(page) { return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; } - -async function scrape_google_news_old_pup(browser, event, context) { - const page = await browser.newPage(); - +async function scrape_google_news_old_pup(page, event, context) { let keywords = 
event.keywords; var results = {}; @@ -347,9 +340,7 @@ function parse_google_news_results_se_format(html) { } } -async function scrape_google_image_pup(browser, event, context) { - const page = await browser.newPage(); - +async function scrape_google_image_pup(page, event, context) { let keywords = event.keywords; var results = {}; @@ -477,9 +468,7 @@ function clean_image_url(url) { const all_results = new Set(); -async function scrape_google_news_pup(browser, event, context) { - const page = await browser.newPage(); - +async function scrape_google_news_pup(page, event, context) { let keywords = event.keywords; var results = {}; diff --git a/src/modules/infospace.js b/src/modules/infospace.js index b5bc21d..e1ddc2d 100644 --- a/src/modules/infospace.js +++ b/src/modules/infospace.js @@ -6,8 +6,7 @@ module.exports = { scrape_webcrawler_news_pup: scrape_webcrawler_news_pup, }; -async function scrape_infospace_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_infospace_pup(page, event, context) { await page.goto('http://infospace.com/index.html'); try { @@ -89,8 +88,7 @@ function parse(html) { } } -async function scrape_webcrawler_news_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_webcrawler_news_pup(page, event, context) { await page.goto('https://www.webcrawler.com/?qc=news'); try { diff --git a/src/modules/ticker_search.js b/src/modules/ticker_search.js index 17f74d7..dae6d57 100644 --- a/src/modules/ticker_search.js +++ b/src/modules/ticker_search.js @@ -3,11 +3,16 @@ const sfunctions = require('./functions.js'); module.exports = { scrape_yahoo_finance_pup: scrape_yahoo_finance_pup, + scrape_bloomberg_finance_pup: scrape_bloomberg_finance_pup, + scrape_reuters_finance_pup: scrape_reuters_finance_pup, + scrape_cnbc_finance_pup: scrape_cnbc_finance_pup, + scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup, }; -async function scrape_yahoo_finance_pup(browser, event, 
context) { +// https://www.google.com/search?q=MSFT&tbm=fin + +async function scrape_yahoo_finance_pup(page, event, context) { var results = {}; - const page = await browser.newPage(); await page.goto('https://finance.yahoo.com/'); for (var i = 0; i < 3; i++) { @@ -34,7 +39,6 @@ async function scrape_yahoo_finance_pup(browser, event, context) { console.error(`Problem with scraping ${keyword}: ${e}`); } } - return results; } @@ -55,4 +59,175 @@ function parse(html) { time: (new Date()).toUTCString(), results: results, } +} + +async function scrape_marketwatch_finance_pup(page, event, context) { + var results = {}; + for (let keyword of event.keywords) { + try { + await page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`); + await page.waitForSelector('.intraday__data', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: `debug/${keyword}.png`}); + } + + await sfunctions.sleep(500); + + let newsData = await page.evaluate(() => { + let results = []; + // get the news article elements + let items = document.querySelectorAll('.article__content'); + // extract link, title, date and author from each news article + items.forEach((newsitem) => { + let data = {}; + try { + data.link = newsitem.querySelector('.article__headline a').getAttribute('href'); + data.title = newsitem.querySelector('.article__headline a').innerText; + data.date = newsitem.querySelector('.article__timestamp').innerText; + data.author = newsitem.querySelector('.article__author').innerText; + } + catch (exception) { + console.error('Error parsing marketwatch data: ', exception); + } + results.push(data); + }); + return results; + }); + + results[keyword] = { + time: (new Date()).toUTCString(), + results: newsData, + } + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + return results; +} + + +async function scrape_bloomberg_finance_pup(page, event, context) { + /* + Bloomberg blocks scrapers after a single request. 
+ */ + var results = {}; + for (let keyword of event.keywords) { + try { + await page.goto(`https://www.bloomberg.com/quote/${keyword}:US`); + await page.waitForSelector('.pseudoMainContent', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: `debug/${keyword}.png`}); + } + + await sfunctions.sleep(1000); + + let news_items = await page.$x('//*[starts-with(@class,"newsItem")]'); + for (let item of news_items) { + let url = item.$$('a').then((link) => { + link.getProperty('href').then((anchor) => { + return anchor; + }) + }); + } + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + return results; +} + +async function scrape_reuters_finance_pup(page, event, context) { + var results = {}; + for (let keyword of event.keywords) { + try { + await page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`); + await page.waitForSelector('#sectionHeader', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: `debug/${keyword}.png`}); + } + + await sfunctions.sleep(500); + + let newsData = await page.evaluate(() => { + let results = []; + // get the news feature elements + let items = document.querySelectorAll('div.feature'); + // extract link, title, text and date from each news feature + items.forEach((newsitem) => { + let data = {}; + try { + data.link = newsitem.querySelector('h2 a').getAttribute('href'); + data.link = 'https://www.reuters.com' + data.link; + data.title = newsitem.querySelector('h2 a').innerText; + data.text = newsitem.querySelector('p').innerText; + data.date = newsitem.querySelector('.timestamp').innerText; + } + catch (exception) { + console.error('Error parsing reuters data: ', exception); + } + results.push(data); + }); + return results; + }); + + results[keyword] = { + time: (new Date()).toUTCString(), + results: newsData, + } + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + return results; +} + 
+async function scrape_cnbc_finance_pup(page, event, context) { + var results = {}; + for (let keyword of event.keywords) { + try { + await page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`); + await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: `debug/${keyword}.png`}); + } + + await sfunctions.sleep(500); + + let newsData = await page.evaluate(() => { + let results = []; + // get the news headline elements + let items = document.querySelectorAll('div.headline'); + // extract link, title and date from each news headline + items.forEach((newsitem) => { + let data = {}; + try { + data.link = newsitem.querySelector('a').getAttribute('href'); + data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText; + data.date = newsitem.querySelector('span.note').innerText; + } + catch (exception) { + console.error('Error parsing cnbc data: ', exception); + } + results.push(data); + }); + return results; + }); + + results[keyword] = { + time: (new Date()).toUTCString(), + results: newsData, + } + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + return results; } \ No newline at end of file diff --git a/src/modules/youtube.js b/src/modules/youtube.js index 2900cde..8426c50 100644 --- a/src/modules/youtube.js +++ b/src/modules/youtube.js @@ -7,8 +7,7 @@ module.exports = { const all_videos = new Set(); -async function scrape_youtube_pup(browser, event, context) { - const page = await browser.newPage(); +async function scrape_youtube_pup(page, event, context) { await page.goto('https://www.youtube.com'); try { diff --git a/src/node_scraper.js b/src/node_scraper.js index 5fc4534..d7b4b22 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -72,35 +72,40 @@ module.exports.handler = async function handler (event, context, callback) { console.dir(headers); } - // TODO: this is ugly but I don't want to use to much objects and classes right 
now. - if (event.search_engine == 'google') { - results = await google.scrape_google_pup(browser, event, context); - } else if (event.search_engine == 'google_news_old') { - results = await google.scrape_google_news_old_pup(browser, event, context); - } else if (event.search_engine == 'google_news') { - results = await google.scrape_google_news_pup(browser, event, context); - } else if (event.search_engine == 'google_image') { - results = await google.scrape_google_image_pup(browser, event, context); - } else if (event.search_engine == 'bing') { - results = await bing.scrape_bing_pup(browser, event, context); - } else if (event.search_engine == 'bing_news') { - results = await bing.scrape_bing_news_pup(browser, event, context); - } else if (event.search_engine == 'infospace') { - results = await infospace.scrape_infospace_pup(browser, event, context); - } else if (event.search_engine == 'webcrawler') { - results = await infospace.scrape_webcrawler_news_pup(browser, event, context); - } else if (event.search_engine == 'baidu') { - results = await baidu.scrape_baidu_pup(browser, event, context); - } else if (event.search_engine == 'youtube') { - results = await youtube.scrape_youtube_pup(browser, event, context); - } else if (event.search_engine == 'duckduckgo_news') { - results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context); - } else if (event.search_engine == 'google_dr') { - results = await google.scrape_google_pup_dr(browser, event, context); - } else if (event.search_engine == 'yahoo_news') { - results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context); + const page = await browser.newPage(); + + if (event.block_assets === true) { + await page.setRequestInterception(true); + + page.on('request', (req) => { + if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') { + req.abort(); + } else { + req.continue(); + } + }); } + results = await { + google: 
google.scrape_google_pup, + google_news_old: google.scrape_google_news_old_pup, + google_news: google.scrape_google_news_pup, + google_image: google.scrape_google_image_pup, + bing: bing.scrape_bing_pup, + bing_news: bing.scrape_bing_news_pup, + infospace: infospace.scrape_infospace_pup, + webcrawler: infospace.scrape_webcrawler_news_pup, + baidu: baidu.scrape_baidu_pup, + youtube: youtube.scrape_youtube_pup, + duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup, + google_dr: google.scrape_google_pup_dr, + yahoo_news: tickersearch.scrape_yahoo_finance_pup, + bloomberg: tickersearch.scrape_bloomberg_finance_pup, + reuters: tickersearch.scrape_reuters_finance_pup, + cnbc: tickersearch.scrape_cnbc_finance_pup, + marketwatch: tickersearch.scrape_marketwatch_finance_pup, + }[event.search_engine](page, event, context); + let metadata = {}; if (event.write_meta_data === true) { @@ -203,6 +208,10 @@ function parseEventData(event) { event.set_manual_settings = _bool(event.set_manual_settings); } + if (event.block_assets) { + event.block_assets = _bool(event.block_assets); + } + if (event.sleep_range) { // parse an array event.sleep_range = eval(event.sleep_range);