supporting yahoo ticker search for news

Nikolai Tschacher 2019-01-24 15:50:03 +01:00
parent 9cfa502851
commit bab902e80a
11 changed files with 106 additions and 14 deletions

README.md

@@ -52,13 +52,17 @@ let config = {
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
// which search engine to scrape
search_engine: 'google',
search_engine: 'yahoo_news',
// whether debug information should be printed
debug: 'true',
// whether verbose program output should be printed
verbose: 'false',
// an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'],
keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
};
se_scraper.scrape(config, (err, response) => {
@@ -90,6 +94,7 @@ Supported options for the `search_engine` config key:
'youtube'
'duckduckgo_news'
'google_dr'
'yahoo_news'
```
Output for the above script on my laptop:


@@ -2,6 +2,11 @@
- fix interface to scrape() [DONE]
- add to Github
24.1.2018
- fix issue #3: add functionality to add keyword file
TODO:
- add proxy support
- add captcha service solving support

index.js

@@ -1,5 +1,6 @@
const handler = require('./src/node_scraper.js');
var fs = require('fs');
var os = require("os");
exports.scrape = function(config, callback) {
// options for scraping
@@ -21,14 +22,14 @@ exports.scrape = function(config, callback) {
compress: 'false', // compress
debug: 'false',
verbose: 'false',
keywords: [],
keywords: ['test'],
};
for (var key in config) {
event[key] = config[key];
}
if (fs.existsSync( event.keyword_file )) {
if (fs.existsSync(event.keyword_file)) {
event.keywords = read_keywords_from_file(event.keyword_file);
}
@@ -47,7 +48,7 @@ exports.scrape = function(config, callback) {
};
function read_keywords_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split("\n");
let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
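
Splitting the keyword file on `os.EOL` only matches the line endings of the operating system the scraper runs on, so a `keywords.txt` saved with Unix newlines would come back as one long keyword on Windows (and vice versa). A minimal sketch of a more tolerant reader, as a suggestion only, not part of this commit:

```js
const fs = require('fs');

// hypothetical alternative: accept both \n and \r\n line endings
function read_keywords_from_file(fname) {
    return fs.readFileSync(fname).toString()
        .split(/\r?\n/)                  // tolerate Unix and Windows newlines
        .map((kw) => kw.trim())
        .filter((kw) => kw.length > 0);  // drop empty lines
}
```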

keywords.txt

@@ -1,3 +1,2 @@
google scraper nikolait
mount everest
incolumitas.com
GOOGL
AAPL

package-lock.json (generated)

@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.0.0",
"version": "1.0.5",
"lockfileVersion": 1,
"requires": true,
"dependencies": {

package.json

@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.0.4",
"version": "1.0.5",
"description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js",
"scripts": {

run.js

@@ -11,13 +11,17 @@ let config = {
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
// which search engine to scrape
search_engine: 'google',
search_engine: 'yahoo_news',
// whether debug information should be printed
debug: 'true',
// whether verbose program output should be printed
verbose: 'false',
// an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'],
keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
};
se_scraper.scrape(config, (err, response) => {

se-scraper.iml (new file)

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

src/modules/functions.js

@@ -3,8 +3,16 @@ module.exports = {
effective_query: effective_query,
sleep: sleep,
random_sleep: random_sleep,
set_input_value: set_input_value,
};
async function set_input_value(page, selector, value) {
await page.waitFor(selector);
await page.evaluate((value, selector) => {
return document.querySelector(selector).value = value;
}, value, selector);
}
function no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1});
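
The new `set_input_value()` helper writes a value straight into an input element via `page.evaluate()` instead of typing it key by key. The commit does not show a call site, so the following is only a sketch of how it could be combined with the ticker search; the `#yfin-usr-qry` selector for Yahoo Finance's search box is an assumption, not something this commit uses:

```js
const sfunctions = require('./src/modules/functions.js');

// hypothetical usage: fill Yahoo Finance's search box with a ticker symbol
// ('#yfin-usr-qry' is an assumed selector) and submit it
async function search_ticker(page, ticker) {
    await sfunctions.set_input_value(page, '#yfin-usr-qry', ticker);
    await page.keyboard.press('Enter');
}
```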

src/modules/ticker_search.js (new file)

@@ -0,0 +1,58 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
};
async function scrape_yahoo_finance_pup(browser, event, context) {
var results = {};
const page = await browser.newPage();
await page.goto('https://finance.yahoo.com/');
for (var i = 0; i < 3; i++) {
// click through Yahoo's cookie consent dialog, which needs several submit clicks
let consent = await page.waitForSelector('[type="submit"]');
await consent.click();
}
for (let keyword of event.keywords) {
try {
await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
await page.waitForSelector('#quote-header-info', { timeout: 8000 });
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(1000);
let html = await page.content();
results[keyword] = parse(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
}
}
return results;
}
function parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
const results = [];
$('.js-stream-content .Cf').each((i, link) => {
results.push({
link: $(link).find('h3 a').attr('href'),
title: $(link).find('h3').text(),
snippet: $(link).find('p').text(),
})
});
return {
time: (new Date()).toUTCString(),
results: results,
}
}
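
`scrape_yahoo_finance_pup()` visits `https://finance.yahoo.com/quote/<ticker>/news` for every keyword and parses the news stream with cheerio, returning an object keyed by ticker. A rough standalone sketch of driving the module directly with Puppeteer (normally `node_scraper.js` does this; the result shape follows from `parse()` above):

```js
const puppeteer = require('puppeteer');
const tickersearch = require('./src/modules/ticker_search.js');

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const event = { keywords: ['GOOGL', 'AAPL'], debug: false, is_local: false };
    const results = await tickersearch.scrape_yahoo_finance_pup(browser, event, {});
    // results looks roughly like:
    // { GOOGL: { time: '...', results: [{ link, title, snippet }, ...] }, AAPL: { ... } }
    console.log(JSON.stringify(results, null, 2));
    await browser.close();
})();
```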

src/node_scraper.js

@@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js');
const ua = require('./modules/user_agents.js');
const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js');
module.exports.handler = async function handler (event, context, callback) {
@@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) {
browser = await puppeteer.launch({
args: ADDITIONAL_CHROME_FLAGS,
headless: true,
headless: event.headless !== false,
});
if (event.log_http_headers === true) {
@@ -87,7 +88,9 @@ module.exports.handler = async function handler (event, context, callback) {
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
} else if (event.search_engine == 'google_dr') {
results = await google.scrape_google_pup_dr(browser, event, context);
}
} else if (event.search_engine == 'yahoo_news') {
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
}
let metadata = {};
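
Putting the pieces together, the dispatcher above is reached through the public `scrape()` API from `index.js`, so switching to the new engine is only a matter of the config key. A minimal sketch based on the README example:

```js
const se_scraper = require('se-scraper');

se_scraper.scrape({
    search_engine: 'yahoo_news',
    keywords: ['AAPL'],
    headless: true,
}, (err, response) => {
    if (err) { console.error(err); return; }
    console.dir(response, { depth: null });
});
```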