From bab902e80ab4d897b6464a1b31d0911b82c13722 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Thu, 24 Jan 2019 15:50:03 +0100 Subject: [PATCH] supporting yahoo ticker search for news --- README.md | 9 ++++-- TODO.txt | 5 ++++ index.js | 7 +++-- keywords.txt | 5 ++-- package-lock.json | 2 +- package.json | 2 +- run.js | 8 +++-- se-scraper.iml | 9 ++++++ src/modules/functions.js | 8 +++++ src/modules/ticker_search.js | 58 ++++++++++++++++++++++++++++++++++++ src/node_scraper.js | 7 +++-- 11 files changed, 106 insertions(+), 14 deletions(-) create mode 100644 se-scraper.iml create mode 100644 src/modules/ticker_search.js diff --git a/README.md b/README.md index 470fd2b..062877f 100644 --- a/README.md +++ b/README.md @@ -52,13 +52,17 @@ let config = { // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'google', + search_engine: 'yahoo_news', // whether debug information should be printed debug: 'true', // whether verbose program output should be printed verbose: 'false', // an array of keywords to scrape - keywords: ['incolumitas.com scraping', 'best scraping framework'], + keywords: ['GOOGL', ], + // alternatively you can specify a keyword_file. 
this overwrites the keywords array + keyword_file: './keywords.txt', + // whether to start the browser in headless mode + headless: false, }; se_scraper.scrape(config, (err, response) => { @@ -90,6 +94,7 @@ Supported options for the `search_engine` config key: 'youtube' 'duckduckgo_news' 'google_dr' +'yahoo_news' ``` Output for the above script on my laptop: diff --git a/TODO.txt b/TODO.txt index 010cd46..5e0302b 100644 --- a/TODO.txt +++ b/TODO.txt @@ -2,6 +2,11 @@ - fix interface to scrape() [DONE] - add to Github + +24.1.2019 + + - fix issue #3: add functionality to add keyword file + TODO: - add proxy support - add captcha service solving support diff --git a/index.js b/index.js index e66d5de..bb3b587 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ const handler = require('./src/node_scraper.js'); var fs = require('fs'); +var os = require("os"); exports.scrape = function(config, callback) { // options for scraping @@ -21,14 +22,14 @@ exports.scrape = function(config, callback) { compress: 'false', // compress debug: 'false', verbose: 'false', - keywords: [], + keywords: ['test'], }; for (var key in config) { event[key] = config[key]; } - if (fs.existsSync( event.keyword_file )) { + if (fs.existsSync(event.keyword_file)) { event.keywords = read_keywords_from_file(event.keyword_file); } @@ -47,7 +48,7 @@ exports.scrape = function(config, callback) { }; function read_keywords_from_file(fname) { - let kws = fs.readFileSync(fname).toString().split("\n"); + let kws = fs.readFileSync(fname).toString().split(os.EOL); // clean keywords kws = kws.filter((kw) => { return kw.trim().length > 0; diff --git a/keywords.txt b/keywords.txt index e7c5d5f..6dc8051 100644 --- a/keywords.txt +++ b/keywords.txt @@ -1,3 +1,2 @@ -google scraper nikolait -mount everest -incolumitas.com \ No newline at end of file +GOOGL +AAPL \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 1de3cf4..043e898 100644 --- a/package-lock.json +++ b/package-lock.json 
@@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.0", + "version": "1.0.5", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/package.json b/package.json index 10bb40e..221d869 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.0.4", + "version": "1.0.5", "description": "A simple module which uses puppeteer to scrape several search engines.", "main": "index.js", "scripts": { diff --git a/run.js b/run.js index 2df9341..786ef5c 100644 --- a/run.js +++ b/run.js @@ -11,13 +11,17 @@ let config = { // is drawn before every request. empty string for no sleeping. sleep_range: '[1,1]', // which search engine to scrape - search_engine: 'google', + search_engine: 'yahoo_news', // whether debug information should be printed debug: 'true', // whether verbose program output should be printed verbose: 'false', // an array of keywords to scrape - keywords: ['incolumitas.com scraping', 'best scraping framework'], + keywords: ['GOOGL', ], + // alternatively you can specify a keyword_file. 
this overwrites the keywords array + keyword_file: './keywords.txt', + // whether to start the browser in headless mode + headless: false, }; se_scraper.scrape(config, (err, response) => { diff --git a/se-scraper.iml b/se-scraper.iml new file mode 100644 index 0000000..8021953 --- /dev/null +++ b/se-scraper.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/src/modules/functions.js b/src/modules/functions.js index 70415bf..35ef8db 100644 --- a/src/modules/functions.js +++ b/src/modules/functions.js @@ -3,8 +3,16 @@ module.exports = { effective_query: effective_query, sleep: sleep, random_sleep: random_sleep, + set_input_value: set_input_value, }; +async function set_input_value(page, selector, value) { + await page.waitFor(selector); + await page.evaluate((value, selector) => { + return document.querySelector(selector).value = value; + }, value, selector); +} + function no_results(needles, html) { return !needles.map((needle) => { return html.indexOf(needle)}) .every((res) => { return res == -1}); diff --git a/src/modules/ticker_search.js b/src/modules/ticker_search.js new file mode 100644 index 0000000..17f74d7 --- /dev/null +++ b/src/modules/ticker_search.js @@ -0,0 +1,58 @@ +const cheerio = require('cheerio'); +const sfunctions = require('./functions.js'); + +module.exports = { + scrape_yahoo_finance_pup: scrape_yahoo_finance_pup, +}; + +async function scrape_yahoo_finance_pup(browser, event, context) { + var results = {}; + const page = await browser.newPage(); + await page.goto('https://finance.yahoo.com/'); + + for (var i = 0; i < 3; i++) { + consent = await page.waitForSelector('[type="submit"]'); + await consent.click(); + } + + for (let keyword of event.keywords) { + try { + await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`); + + await page.waitForSelector('#quote-header-info', { timeout: 8000 }); + + if (event.debug === true && event.is_local === true) { + await page.screenshot({path: 
`debug/${keyword}.png`}); + } + + await sfunctions.sleep(1000); + + let html = await page.content(); + results[keyword] = parse(html); + + } catch (e) { + console.error(`Problem with scraping ${keyword}: ${e}`); + } + } + + return results; +} + +function parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); + + const results = []; + $('.js-stream-content .Cf').each((i, link) => { + results.push({ + link: $(link).find('h3 a').attr('href'), + title: $(link).find('h3').text(), + snippet: $(link).find('p').text(), + }) + }); + + return { + time: (new Date()).toUTCString(), + results: results, + } +} \ No newline at end of file diff --git a/src/node_scraper.js b/src/node_scraper.js index 31b1b4a..ddcc84a 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js'); const ua = require('./modules/user_agents.js'); const meta = require('./modules/metadata.js'); const duckduckgo = require('./modules/duckduckgo.js'); +const tickersearch = require('./modules/ticker_search.js'); module.exports.handler = async function handler (event, context, callback) { @@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) { browser = await puppeteer.launch({ args: ADDITIONAL_CHROME_FLAGS, - headless: true, + headless: event.headless !== false, }); if (event.log_http_headers === true) { @@ -87,7 +88,9 @@ module.exports.handler = async function handler (event, context, callback) { results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context); } else if (event.search_engine == 'google_dr') { results = await google.scrape_google_pup_dr(browser, event, context); - } + } else if (event.search_engine == 'yahoo_news') { + results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context); + } let metadata = {};