supporting yahoo ticker search for news

This commit is contained in:
Nikolai Tschacher 2019-01-24 15:50:03 +01:00
parent 9cfa502851
commit bab902e80a
11 changed files with 106 additions and 14 deletions

View File

@ -52,13 +52,17 @@ let config = {
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: 'true',
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: 'false',
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {
@ -90,6 +94,7 @@ Supported options for the `search_engine` config key:
'youtube' 'youtube'
'duckduckgo_news' 'duckduckgo_news'
'google_dr' 'google_dr'
'yahoo_news'
``` ```
Output for the above script on my laptop: Output for the above script on my laptop:

View File

@ -2,6 +2,11 @@
- fix interface to scrape() [DONE] - fix interface to scrape() [DONE]
- add to Github - add to Github
24.1.2019
- fix issue #3: add functionality to add keyword file
TODO: TODO:
- add proxy support - add proxy support
- add captcha service solving support - add captcha service solving support

View File

@ -1,5 +1,6 @@
const handler = require('./src/node_scraper.js'); const handler = require('./src/node_scraper.js');
var fs = require('fs'); var fs = require('fs');
var os = require("os");
exports.scrape = function(config, callback) { exports.scrape = function(config, callback) {
// options for scraping // options for scraping
@ -21,14 +22,14 @@ exports.scrape = function(config, callback) {
compress: 'false', // compress compress: 'false', // compress
debug: 'false', debug: 'false',
verbose: 'false', verbose: 'false',
keywords: [], keywords: ['test'],
}; };
for (var key in config) { for (var key in config) {
event[key] = config[key]; event[key] = config[key];
} }
if (fs.existsSync( event.keyword_file )) { if (fs.existsSync(event.keyword_file)) {
event.keywords = read_keywords_from_file(event.keyword_file); event.keywords = read_keywords_from_file(event.keyword_file);
} }
@ -47,7 +48,7 @@ exports.scrape = function(config, callback) {
}; };
function read_keywords_from_file(fname) { function read_keywords_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split("\n"); let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords // clean keywords
kws = kws.filter((kw) => { kws = kws.filter((kw) => {
return kw.trim().length > 0; return kw.trim().length > 0;

View File

@ -1,3 +1,2 @@
google scraper nikolait GOOGL
mount everest AAPL
incolumitas.com

2
package-lock.json generated
View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.0", "version": "1.0.5",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.4", "version": "1.0.5",
"description": "A simple module which uses puppeteer to scrape several search engines.", "description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {

8
run.js
View File

@ -11,13 +11,17 @@ let config = {
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: 'true',
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: 'false',
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

9
se-scraper.iml Normal file
View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -3,8 +3,16 @@ module.exports = {
effective_query: effective_query, effective_query: effective_query,
sleep: sleep, sleep: sleep,
random_sleep: random_sleep, random_sleep: random_sleep,
set_input_value: set_input_value,
}; };
// Fill an input field on the page: wait for `selector` to exist, then
// assign `value` to it inside the page context via page.evaluate.
async function set_input_value(page, selector, value) {
    await page.waitFor(selector);
    const assign = (value, selector) => {
        return document.querySelector(selector).value = value;
    };
    await page.evaluate(assign, value, selector);
}
function no_results(needles, html) { function no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)}) return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1}); .every((res) => { return res == -1});

View File

@ -0,0 +1,58 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
};
/**
 * Scrape Yahoo Finance news for each ticker symbol in event.keywords.
 *
 * Opens a fresh page, clicks through the consent dialog, then for each
 * keyword navigates to the quote's news tab, waits for the quote header,
 * and parses the article list out of the page HTML.
 *
 * @param {object} browser - puppeteer Browser instance
 * @param {object} event - scrape config; reads keywords, debug, is_local
 * @param {object} context - unused; kept for handler interface parity
 * @returns {Promise<object>} map of keyword -> { time, results } from parse()
 */
async function scrape_yahoo_finance_pup(browser, event, context) {
    const results = {};
    const page = await browser.newPage();
    await page.goto('https://finance.yahoo.com/');

    // The consent flow presents up to three submit buttons in sequence.
    // If one never appears, waitForSelector would throw after its timeout;
    // log and continue instead of rejecting the whole scrape run.
    // (Fix: `consent` was previously an undeclared implicit global.)
    for (let i = 0; i < 3; i++) {
        try {
            const consent = await page.waitForSelector('[type="submit"]');
            await consent.click();
        } catch (e) {
            console.error(`Consent button ${i + 1} not found: ${e}`);
            break;
        }
    }

    for (const keyword of event.keywords) {
        try {
            await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
            await page.waitForSelector('#quote-header-info', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // give dynamically loaded news items a moment to render
            await sfunctions.sleep(1000);

            const html = await page.content();
            results[keyword] = parse(html);
        } catch (e) {
            // one failing ticker must not abort the remaining keywords
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
// Extract news articles from a Yahoo Finance quote news page.
// Returns { time, results } where results is a list of
// { link, title, snippet } objects, one per news stream entry.
function parse(html) {
    const $ = cheerio.load(html);

    const articles = [];
    $('.js-stream-content .Cf').each((i, item) => {
        const el = $(item);
        articles.push({
            link: el.find('h3 a').attr('href'),
            title: el.find('h3').text(),
            snippet: el.find('p').text(),
        });
    });

    return {
        time: (new Date()).toUTCString(),
        results: articles,
    };
}

View File

@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js');
const ua = require('./modules/user_agents.js'); const ua = require('./modules/user_agents.js');
const meta = require('./modules/metadata.js'); const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js'); const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js');
module.exports.handler = async function handler (event, context, callback) { module.exports.handler = async function handler (event, context, callback) {
@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) {
browser = await puppeteer.launch({ browser = await puppeteer.launch({
args: ADDITIONAL_CHROME_FLAGS, args: ADDITIONAL_CHROME_FLAGS,
headless: true, headless: event.headless !== false,
}); });
if (event.log_http_headers === true) { if (event.log_http_headers === true) {
@ -87,6 +88,8 @@ module.exports.handler = async function handler (event, context, callback) {
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context); results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
} else if (event.search_engine == 'google_dr') { } else if (event.search_engine == 'google_dr') {
results = await google.scrape_google_pup_dr(browser, event, context); results = await google.scrape_google_pup_dr(browser, event, context);
} else if (event.search_engine == 'yahoo_news') {
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
} }
let metadata = {}; let metadata = {};