From bab902e80ab4d897b6464a1b31d0911b82c13722 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Thu, 24 Jan 2019 15:50:03 +0100 Subject: [PATCH] supporting yahoo ticker search for news --- README.md | 9 ++++-- TODO.txt | 5 ++++ index.js | 7 +++-- keywords.txt | 5 ++-- package-lock.json | 2 +- package.json | 2 +- run.js | 8 +++-- se-scraper.iml | 9 ++++++ src/modules/functions.js | 8 +++++ src/modules/ticker_search.js | 58 ++++++++++++++++++++++++++++++++++++ src/node_scraper.js | 7 +++-- 11 files changed, 106 insertions(+), 14 deletions(-) create mode 100644 se-scraper.iml create mode 100644 src/modules/ticker_search.js diff --git a/README.md b/README.md index 470fd2b..062877f 100644 --- a/README.md +++ b/README.md @@ -52,13 +52,17 @@ let config = { // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'google', + search_engine: 'yahoo_news', // whether debug information should be printed debug: 'true', // whether verbose program output should be printed verbose: 'false', // an array of keywords to scrape - keywords: ['incolumitas.com scraping', 'best scraping framework'], + keywords: ['GOOGL', ], + // alternatively you can specify a keyword_file. 
this overwrites the keywords array + keyword_file: './keywords.txt', + // whether to start the browser in headless mode + headless: false, }; se_scraper.scrape(config, (err, response) => { @@ -90,6 +94,7 @@ Supported options for the `search_engine` config key: 'youtube' 'duckduckgo_news' 'google_dr' +'yahoo_news' ``` Output for the above script on my laptop: diff --git a/TODO.txt b/TODO.txt index 010cd46..5e0302b 100644 --- a/TODO.txt +++ b/TODO.txt @@ -2,6 +2,11 @@ - fix interface to scrape() [DONE] - add to Github + +24.1.2019 + + - fix issue #3: add functionality to add keyword file + TODO: - add proxy support - add captcha service solving support diff --git a/index.js b/index.js index e66d5de..bb3b587 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ const handler = require('./src/node_scraper.js'); var fs = require('fs'); +var os = require("os"); exports.scrape = function(config, callback) { // options for scraping @@ -21,14 +22,14 @@ exports.scrape = function(config, callback) { compress: 'false', // compress debug: 'false', verbose: 'false', - keywords: [], + keywords: ['test'], }; for (var key in config) { event[key] = config[key]; } - if (fs.existsSync( event.keyword_file )) { + if (fs.existsSync(event.keyword_file)) { event.keywords = read_keywords_from_file(event.keyword_file); } @@ -47,7 +48,7 @@ exports.scrape = function(config, callback) { }; function read_keywords_from_file(fname) { - let kws = fs.readFileSync(fname).toString().split("\n"); + let kws = fs.readFileSync(fname).toString().split(os.EOL); // clean keywords kws = kws.filter((kw) => { return kw.trim().length > 0; diff --git a/keywords.txt b/keywords.txt index e7c5d5f..6dc8051 100644 --- a/keywords.txt +++ b/keywords.txt @@ -1,3 +1,2 @@ -google scraper nikolait -mount everest -incolumitas.com \ No newline at end of file +GOOGL +AAPL \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 1de3cf4..043e898 100644 --- a/package-lock.json +++ b/package-lock.json 
@@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.0", + "version": "1.0.5", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/package.json b/package.json index 10bb40e..221d869 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.4", + "version": "1.0.5", "description": "A simple module which uses puppeteer to scrape several search engines.", "main": "index.js", "scripts": { diff --git a/run.js b/run.js index 2df9341..786ef5c 100644 --- a/run.js +++ b/run.js @@ -11,13 +11,17 @@ let config = { // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'google', + search_engine: 'yahoo_news', // whether debug information should be printed debug: 'true', // whether verbose program output should be printed verbose: 'false', // an array of keywords to scrape - keywords: ['incolumitas.com scraping', 'best scraping framework'], + keywords: ['GOOGL', ], + // alternatively you can specify a keyword_file. 
this overwrites the keywords array + keyword_file: './keywords.txt', + // whether to start the browser in headless mode + headless: false, }; se_scraper.scrape(config, (err, response) => { diff --git a/se-scraper.iml b/se-scraper.iml new file mode 100644 index 0000000..8021953 --- /dev/null +++ b/se-scraper.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/src/modules/functions.js b/src/modules/functions.js index 70415bf..35ef8db 100644 --- a/src/modules/functions.js +++ b/src/modules/functions.js @@ -3,8 +3,16 @@ module.exports = { effective_query: effective_query, sleep: sleep, random_sleep: random_sleep, + set_input_value: set_input_value, }; +async function set_input_value(page, selector, value) { + await page.waitFor(selector); + await page.evaluate((value, selector) => { + return document.querySelector(selector).value = value; + }, value, selector); +} + function no_results(needles, html) { return !needles.map((needle) => { return html.indexOf(needle)}) .every((res) => { return res == -1}); diff --git a/src/modules/ticker_search.js b/src/modules/ticker_search.js new file mode 100644 index 0000000..17f74d7 --- /dev/null +++ b/src/modules/ticker_search.js @@ -0,0 +1,58 @@ +const cheerio = require('cheerio'); +const sfunctions = require('./functions.js'); + +module.exports = { + scrape_yahoo_finance_pup: scrape_yahoo_finance_pup, +}; + +async function scrape_yahoo_finance_pup(browser, event, context) { + var results = {}; + const page = await browser.newPage(); + await page.goto('https://finance.yahoo.com/'); + + for (var i = 0; i < 3; i++) { + consent = await page.waitForSelector('[type="submit"]'); + await consent.click(); + } + + for (let keyword of event.keywords) { + try { + await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`); + + await page.waitForSelector('#quote-header-info', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: 
`debug/${keyword}.png`}); + } + + await sfunctions.sleep(1000); + + let html = await page.content(); + results[keyword] = parse(html); + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + + return results; +} + +function parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); + + const results = []; + $('.js-stream-content .Cf').each((i, link) => { + results.push({ + link: $(link).find('h3 a').attr('href'), + title: $(link).find('h3').text(), + snippet: $(link).find('p').text(), + }) + }); + + return { + time: (new Date()).toUTCString(), + results: results, + } +} \ No newline at end of file diff --git a/src/node_scraper.js b/src/node_scraper.js index 31b1b4a..ddcc84a 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js'); const ua = require('./modules/user_agents.js'); const meta = require('./modules/metadata.js'); const duckduckgo = require('./modules/duckduckgo.js'); +const tickersearch = require('./modules/ticker_search.js'); module.exports.handler = async function handler (event, context, callback) { @@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) { browser = await puppeteer.launch({ args: ADDITIONAL_CHROME_FLAGS, - headless: true, + headless: event.headless !== false, }); if (event.log_http_headers === true) { @@ -87,7 +88,9 @@ module.exports.handler = async function handler (event, context, callback) { results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context); } else if (event.search_engine == 'google_dr') { results = await google.scrape_google_pup_dr(browser, event, context); - } + } else if (event.search_engine == 'yahoo_news') { + results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context); + } let metadata = {};