supporting yahoo ticker search for news

This commit is contained in:
Nikolai Tschacher 2019-01-24 15:50:03 +01:00
parent 9cfa502851
commit bab902e80a
11 changed files with 106 additions and 14 deletions

View File

@ -52,13 +52,17 @@ let config = {
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: 'true',
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: 'false',
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {
@ -90,6 +94,7 @@ Supported options for the `search_engine` config key:
'youtube' 'youtube'
'duckduckgo_news' 'duckduckgo_news'
'google_dr' 'google_dr'
'yahoo_news'
``` ```
Output for the above script on my laptop: Output for the above script on my laptop:

View File

@ -2,6 +2,11 @@
- fix interface to scrape() [DONE] - fix interface to scrape() [DONE]
- add to Github - add to Github
24.1.2019
- fix issue #3: add functionality to add keyword file
TODO: TODO:
- add proxy support - add proxy support
- add captcha service solving support - add captcha service solving support

View File

@ -1,5 +1,6 @@
const handler = require('./src/node_scraper.js'); const handler = require('./src/node_scraper.js');
var fs = require('fs'); var fs = require('fs');
var os = require("os");
exports.scrape = function(config, callback) { exports.scrape = function(config, callback) {
// options for scraping // options for scraping
@ -21,14 +22,14 @@ exports.scrape = function(config, callback) {
compress: 'false', // compress compress: 'false', // compress
debug: 'false', debug: 'false',
verbose: 'false', verbose: 'false',
keywords: [], keywords: ['test'],
}; };
for (var key in config) { for (var key in config) {
event[key] = config[key]; event[key] = config[key];
} }
if (fs.existsSync( event.keyword_file )) { if (fs.existsSync(event.keyword_file)) {
event.keywords = read_keywords_from_file(event.keyword_file); event.keywords = read_keywords_from_file(event.keyword_file);
} }
@ -47,7 +48,7 @@ exports.scrape = function(config, callback) {
}; };
function read_keywords_from_file(fname) { function read_keywords_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split("\n"); let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords // clean keywords
kws = kws.filter((kw) => { kws = kws.filter((kw) => {
return kw.trim().length > 0; return kw.trim().length > 0;

View File

@ -1,3 +1,2 @@
google scraper nikolait GOOGL
mount everest AAPL
incolumitas.com

2
package-lock.json generated
View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.0", "version": "1.0.5",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.4", "version": "1.0.5",
"description": "A simple module which uses puppeteer to scrape several search engines.", "description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {

8
run.js
View File

@ -11,13 +11,17 @@ let config = {
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: 'true',
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: 'false',
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['incolumitas.com scraping', 'best scraping framework'], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt',
// whether to start the browser in headless mode
headless: false,
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

9
se-scraper.iml Normal file
View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -3,8 +3,16 @@ module.exports = {
effective_query: effective_query, effective_query: effective_query,
sleep: sleep, sleep: sleep,
random_sleep: random_sleep, random_sleep: random_sleep,
set_input_value: set_input_value,
}; };
// Fill an input field on the page: wait for `selector` to exist, then
// assign `value` to it inside the page context via page.evaluate.
async function set_input_value(page, selector, value) {
    await page.waitFor(selector);
    const assign = (value, selector) => {
        return document.querySelector(selector).value = value;
    };
    await page.evaluate(assign, value, selector);
}
function no_results(needles, html) { function no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)}) return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1}); .every((res) => { return res == -1});

View File

@ -0,0 +1,58 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
};
/**
 * Scrape Yahoo Finance news for each ticker symbol in event.keywords.
 *
 * Opens a fresh page, clicks through the consent dialog, then for each
 * keyword navigates to the quote's news tab, waits for the quote header,
 * and parses the article list out of the page HTML.
 *
 * @param {object} browser - puppeteer Browser instance
 * @param {object} event - scrape config; reads keywords, debug, is_local
 * @param {object} context - unused; kept for handler interface parity
 * @returns {Promise<object>} map of keyword -> { time, results } from parse()
 */
async function scrape_yahoo_finance_pup(browser, event, context) {
    const results = {};
    const page = await browser.newPage();
    await page.goto('https://finance.yahoo.com/');

    // The consent flow presents up to three submit buttons in sequence.
    // If one never appears, waitForSelector would throw after its timeout;
    // log and continue instead of rejecting the whole scrape run.
    // (Fix: `consent` was previously an undeclared implicit global.)
    for (let i = 0; i < 3; i++) {
        try {
            const consent = await page.waitForSelector('[type="submit"]');
            await consent.click();
        } catch (e) {
            console.error(`Consent button ${i + 1} not found: ${e}`);
            break;
        }
    }

    for (const keyword of event.keywords) {
        try {
            await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
            await page.waitForSelector('#quote-header-info', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // give dynamically loaded news items a moment to render
            await sfunctions.sleep(1000);

            const html = await page.content();
            results[keyword] = parse(html);
        } catch (e) {
            // one failing ticker must not abort the remaining keywords
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
// Extract news articles from a Yahoo Finance quote news page.
// Returns { time, results } where results is a list of
// { link, title, snippet } objects, one per news stream entry.
function parse(html) {
    const $ = cheerio.load(html);

    const articles = [];
    $('.js-stream-content .Cf').each((i, item) => {
        const el = $(item);
        articles.push({
            link: el.find('h3 a').attr('href'),
            title: el.find('h3').text(),
            snippet: el.find('p').text(),
        });
    });

    return {
        time: (new Date()).toUTCString(),
        results: articles,
    };
}

View File

@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js');
const ua = require('./modules/user_agents.js'); const ua = require('./modules/user_agents.js');
const meta = require('./modules/metadata.js'); const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js'); const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js');
module.exports.handler = async function handler (event, context, callback) { module.exports.handler = async function handler (event, context, callback) {
@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) {
browser = await puppeteer.launch({ browser = await puppeteer.launch({
args: ADDITIONAL_CHROME_FLAGS, args: ADDITIONAL_CHROME_FLAGS,
headless: true, headless: event.headless !== false,
}); });
if (event.log_http_headers === true) { if (event.log_http_headers === true) {
@ -87,6 +88,8 @@ module.exports.handler = async function handler (event, context, callback) {
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context); results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
} else if (event.search_engine == 'google_dr') { } else if (event.search_engine == 'google_dr') {
results = await google.scrape_google_pup_dr(browser, event, context); results = await google.scrape_google_pup_dr(browser, event, context);
} else if (event.search_engine == 'yahoo_news') {
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
} }
let metadata = {}; let metadata = {};