forked from extern/se-scraper

ticker search OOP now and added tests

parent d35a602994
commit d5b147296e

README.md (10 changes)
@@ -19,7 +19,6 @@ is supported.

Additionally **se-scraper** supports investment ticker search from the following sites:

* Bloomberg
* Reuters
* cnbc
* Marketwatch
@@ -28,7 +27,13 @@ This module uses puppeteer. It was created by the Developer of https://github.co

### Quickstart

Install with

**Note**: If you don't want puppeteer to download a complete chromium browser, add this variable to your environments:

```bash
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
```

Then install with

```bash
npm install se-scraper
```
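
A minimal usage sketch for the ticker search this commit adds (not part of the diff; the config keys and the `scrape(config, callback)` call are taken from run.js and the new tests below, everything else is left at its defaults):

```javascript
// Sketch: scrape Marketwatch news for two ticker symbols.
const se_scraper = require('se-scraper');

let config = {
    // one of the new ticker engines: 'reuters', 'cnbc', 'marketwatch', 'yahoo_news'
    search_engine: 'marketwatch',
    // ticker symbols are passed as ordinary keywords
    keywords: ['MSFT', 'AAPL'],
    num_pages: 1,
};

se_scraper.scrape(config, (err, response) => {
    if (err) {
        console.error(err);
    } else {
        // response.results[keyword][page_number] = { time, results: [...] }
        console.dir(response.results, { depth: null });
    }
});
```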
@@ -187,7 +192,6 @@ Supported options for the `search_engine` config key:

'youtube'
'duckduckgo_news'
'yahoo_news'
'bloomberg'
'reuters'
'cnbc'
'marketwatch'
bing.json (new file)

@@ -0,0 +1 @@
{"scrapeulous.com":{"1":{"time":"Thu, 31 Jan 2019 14:40:37 GMT","no_results":false,"effective_query":"scrupulous","num_results":"1.370.000 Ergebnisse","results":[{"link":"https://www.dict.cc/englisch-deutsch/scrupulous.html","title":"dict.cc Wörterbuch :: scrupulous :: Englisch-Deutsch ...","snippet":"Dieses Deutsch-Englisch-Wörterbuch basiert auf der Idee der freien Weitergabe von Wissen. Mehr Informationen! Enthält Übersetzungen von der TU Chemnitz sowie aus Mr Honey's Business Dictionary (Englisch/Deutsch).","visible_link":"www.dict.cc › … › Übersetzungen mit gleichem Wortanfang › SCR","rank":1},{"link":"https://scrapeulous.com/about/","title":"About scrapeulous.com","snippet":"About scrapeulous.com. Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from search engines often occurs in marketing research or in scientific projects.","visible_link":"https://scrapeulous.com/about","rank":2},{"link":"https://www.dictionary.com/browse/scrupulous","title":"Scrupulous | Define Scrupulous at Dictionary.com","snippet":"Scrupulous definition, having scruples, or moral or ethical standards; having or showing a strict regard for what one considers right; principled: scrupulous about defending human rights. See more.","visible_link":"https://www.dictionary.com/browse/scrupulous","rank":3},{"link":"https://www.dict.cc/?s=scrupulous","title":"scrupulous | Übersetzung Englisch-Deutsch","snippet":"Kennst du Übersetzungen, die noch nicht in diesem Wörterbuch enthalten sind? Hier kannst du sie vorschlagen! Bitte immer nur genau eine Deutsch-Englisch-Übersetzung eintragen (Formatierung siehe Guidelines), möglichst mit einem guten Beleg im Kommentarfeld.","visible_link":"https://www.dict.cc/?s=scrupulous","rank":4},{"link":"https://dict.leo.org/englisch-deutsch/scrupulous","title":"scrupulous - Deutsch Wörterbuch - leo.org: Startseite","snippet":"Lernen Sie die Übersetzung für 'scrupulous' in LEOs Englisch ⇔ Deutsch Wörterbuch. Mit Flexionstabellen der verschiedenen Fälle und Zeiten Aussprache und relevante Diskussionen Kostenloser Vokabeltrainer","visible_link":"https://dict.leo.org/englisch-deutsch/scrupulous","rank":5},{"link":"https://www.merriam-webster.com/dictionary/scrupulous","title":"Scrupulous | Definition of Scrupulous by Merriam …","snippet":"Choose the Right Synonym for scrupulous. upright, honest, just, conscientious, scrupulous, honorable mean having or showing a strict regard for what is morally right.","visible_link":"https://www.merriam-webster.com/dictionary/scrupulous","rank":6},{"link":"https://dictionary.cambridge.org/de/worterbuch/englisch/scrupulous","title":"SCRUPULOUS | Bedeutung im Cambridge Englisch Wörterbuch","snippet":"These examples are from the Cambridge English Corpus and from sources on the web. 
Any opinions in the examples do not represent the opinion of the Cambridge Dictionary editors or of Cambridge University Press or its licensors.","visible_link":"https://dictionary.cambridge.org/de/worterbuch/englisch/scrupulous","rank":7},{"link":"https://en.oxforddictionaries.com/definition/scrupulous","title":"scrupulous | Definition of scrupulous in English by …","snippet":"Definition of scrupulous - (of a person or process) careful, thorough, and extremely attentive to details","visible_link":"https://en.oxforddictionaries.com/definition/scrupulous","rank":8},{"link":"https://www.dictionary.com/browse/scrupulously","title":"Scrupulously | Define Scrupulously at …","snippet":"Scrupulously definition, having scruples, or moral or ethical standards; having or showing a strict regard for what one considers right; principled: scrupulous about defending human rights. See more.","visible_link":"https://www.dictionary.com/browse/scrupulously","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"How to use scrapeulous.com - YouTube","snippet":"16.12.2018 · This video is unavailable. Watch Queue Queue. Watch Queue Queue","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","rank":10}]}}}

google.json (new file)

@@ -0,0 +1 @@
{"scrapeulous.com":{"1":{"time":"Thu, 31 Jan 2019 14:40:33 GMT","num_results":"Ungefähr 163 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen","snippet":"What We Do. Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":3},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeuloushttps://scrapeulous.com/contact/Im CacheDiese Seite übersetzen","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":4},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":7},{"link":"https://www.youtube.com/channel/UCJs1Xei5LRefg9GwFYdYhOw","title":"Scrapeulous Scrapeulous - YouTubehttps://www.youtube.com/.../UCJs1Xei5LRefg9GwFYdYhOwIm CacheDiese Seite übersetzen","snippet":"How to use scrapeulous.com - Duration: 3 minutes, 42 seconds. 32 minutes ago; 4 views. Introduction for https://scrapeulous.com. Show more. This item has ...","visible_link":"https://www.youtube.com/.../UCJs1Xei5LRefg9GwFYdYhOw","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - ReadTheDocshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - 1.1 Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open source tool in the future. 
Some people ...","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table of Contents 1.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}}

run.js (4 changes)

@@ -12,7 +12,7 @@ let config = {
     // is drawn before every request. empty string for no sleeping.
     sleep_range: '[1,2]',
     // which search engine to scrape
-    search_engine: 'google',
+    search_engine: 'marketwatch',
     // whether debug information should be printed
     // debug info is useful for developers when debugging
     debug: false,
@@ -20,7 +20,7 @@ let config = {
     // this output is informational
     verbose: true,
     // an array of keywords to scrape
-    keywords: ['apple tree'],
+    keywords: ['MSFT', 'AAPL'],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // the number of pages to scrape for each keyword
@@ -115,7 +115,8 @@ module.exports = class Scraper {
         }

         let html = await this.page.content();
-        this.results[keyword][page_num] = this.parse(html);
+        let parsed = this.parse(html);
+        this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);

         page_num += 1;

@@ -191,6 +192,10 @@ module.exports = class Scraper {

     }

+    async parse_async(html) {
+
+    }
+
     /**
      *
      * @returns true if startpage was loaded correctly.
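
The fallback above is the core of the refactor: a scraper subclass implements either a synchronous `parse(html)` that works on the page source with cheerio, or an asynchronous `parse_async(html)` that queries the live DOM whenever `parse()` returns nothing. A sketch of both contracts, modeled on the classes added below (`.news-item` is a placeholder selector, not from this commit):

```javascript
const cheerio = require('cheerio');

// Variant 1: synchronous parsing of the raw HTML string,
// as YahooFinanceScraper does below.
class CheerioScraper /* extends Scraper */ {
    parse(html) {
        const $ = cheerio.load(html);
        // ...collect result objects with cheerio selectors...
        return { time: (new Date()).toUTCString(), results: [] };
    }
}

// Variant 2: parse() is not overridden, so the base class falls back to
// parse_async(), which evaluates selectors inside the live page, as the
// Marketwatch/Reuters/Cnbc scrapers below do.
class EvaluateScraper /* extends Scraper */ {
    async parse_async(html) {
        let results = await this.page.evaluate(() => {
            // '.news-item' is a placeholder selector
            return Array.from(document.querySelectorAll('.news-item'))
                .map((el) => ({ title: el.innerText }));
        });
        return { time: (new Date()).toUTCString(), results: results };
    }
}
```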
@@ -1,285 +1,215 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');

class YahooFinanceScraper extends Scraper {

    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const results = [];
        $('.js-stream-content .Cf').each((i, link) => {
            results.push({
                link: $(link).find('h3 a').attr('href'),
                title: $(link).find('h3').text(),
                snippet: $(link).find('p').text(),
            })
        });

        return {
            time: (new Date()).toUTCString(),
            results: results,
        }
    }

    async load_start_page() {
        try {
            await this.page.goto('https://finance.yahoo.com/');
            for (var i = 0; i < 3; i++) {
                let consent = await this.page.waitForSelector('[type="submit"]');
                await consent.click();
            }
        } catch (e) {
            return false;
        }
        return true;
    }

    async search_keyword(keyword) {
        await this.page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
        await this.page.waitForSelector('#quote-header-info', { timeout: 8000 });
        await this.sleep(1000);
    }

    async next_page() {
        return false;
    }

    async wait_for_results() {
        await this.page.waitForSelector('#b_content', { timeout: 5000 });
        await this.sleep(500);
    }

    async detected() {
    }
}

class MarketwatchFinanceScraper extends Scraper {

    async parse_async(html) {
        let res = await this.page.evaluate(() => {
            let results = [];
            // get the news article elements
            let items = document.querySelectorAll('.article__content');
            // extract the data from each news item
            items.forEach((newsitem) => {
                let data = {};
                try {
                    data.link = newsitem.querySelector('.article__headline a').getAttribute('href');
                    data.title = newsitem.querySelector('.article__headline a').innerText;
                    data.date = newsitem.querySelector('.article__timestamp').innerText;
                    data.author = newsitem.querySelector('.article__author').innerText;
                }
                catch (exception) {
                    console.error('Error parsing marketwatch data: ', exception);
                }
                results.push(data);
            });
            return results;
        });

        return {
            time: (new Date()).toUTCString(),
            results: res,
        }
    }

    async load_start_page() {
        return true;
    }

    async search_keyword(keyword) {
        await this.page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
    }

    async next_page() {
        return false;
    }

    async wait_for_results() {
        await this.page.waitForSelector('.intraday__data', { timeout: 8000 });
        await this.sleep(500);
    }

    async detected() {
    }
}

class ReutersFinanceScraper extends Scraper {

    async parse_async(html) {
        let newsData = await this.page.evaluate(() => {
            let results = [];
            // get the news article elements
            let items = document.querySelectorAll('div.feature');
            // extract the data from each news item
            items.forEach((newsitem) => {
                let data = {};
                try {
                    data.link = newsitem.querySelector('h2 a').getAttribute('href');
                    data.link = 'https://www.reuters.com' + data.link;
                    data.title = newsitem.querySelector('h2 a').innerText;
                    data.snippet = newsitem.querySelector('p').innerText;
                    data.date = newsitem.querySelector('.timestamp').innerText;
                }
                catch (exception) {
                    console.error('Error parsing reuters data: ', exception);
                }
                results.push(data);
            });
            return results;
        });

        return {
            time: (new Date()).toUTCString(),
            results: newsData,
        }
    }

    async load_start_page() {
        return true;
    }

    async search_keyword(keyword) {
        await this.page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
    }

    async next_page() {
        return false;
    }

    async wait_for_results() {
        await this.page.waitForSelector('#sectionHeader', { timeout: 8000 });
        await this.sleep(500);
    }

    async detected() {
    }
}

class CnbcFinanceScraper extends Scraper {

    async parse_async(html) {
        let newsData = await this.page.evaluate(() => {
            let results = [];
            // get the headline elements
            let items = document.querySelectorAll('div.headline');
            // extract the data from each news item
            items.forEach((newsitem) => {
                let data = {};
                try {
                    data.link = newsitem.querySelector('a').getAttribute('href');
                    data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText;
                    data.date = newsitem.querySelector('span.note').innerText;
                }
                catch (exception) {
                    console.error('Error parsing cnbc data: ', exception);
                }
                results.push(data);
            });
            return results;
        });

        return {
            time: (new Date()).toUTCString(),
            results: newsData,
        }
    }

    async load_start_page() {
        return true;
    }

    async search_keyword(keyword) {
        await this.page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
    }

    async next_page() {
        return false;
    }

    async wait_for_results() {
        await this.page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });
        await this.sleep(500);
    }

    async detected() {
    }
}

module.exports = {
    scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
    scrape_bloomberg_finance_pup: scrape_bloomberg_finance_pup,
    scrape_reuters_finance_pup: scrape_reuters_finance_pup,
    scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
    scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
    not_implemented: undefined,
};

function sleep(ms) {
    return new Promise(resolve => {
        setTimeout(resolve, ms)
    })
}

async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
    var results = {};
    await page.goto('https://finance.yahoo.com/');

    for (var i = 0; i < 3; i++) {
        consent = await page.waitForSelector('[type="submit"]');
        await consent.click();
    }

    for (let keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);

            await page.waitForSelector('#quote-header-info', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sleep(1000);

            let html = await page.content();
            results[keyword] = parse(html);

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}

function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    const results = [];
    $('.js-stream-content .Cf').each((i, link) => {
        results.push({
            link: $(link).find('h3 a').attr('href'),
            title: $(link).find('h3').text(),
            snippet: $(link).find('p').text(),
        })
    });

    return {
        time: (new Date()).toUTCString(),
        results: results,
    }
}

async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
    var results = {};
    for (let keyword of event.keywords) {
        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }
        try {
            await page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
            await page.waitForSelector('.intraday__data', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sleep(500);

            let newsData = await page.evaluate(() => {
                let results = [];
                // get the hotel elements
                let items = document.querySelectorAll('.article__content');
                // get the hotel data
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        data.link = newsitem.querySelector('.article__headline a').getAttribute('href');
                        data.title = newsitem.querySelector('.article__headline a').innerText;
                        data.date = newsitem.querySelector('.article__timestamp').innerText;
                        data.author = newsitem.querySelector('.article__author').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing marketwatch data: ', exception);
                    }
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}

async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
    /*
        Bloomberg blocks after one request. what a shit hole.
    */
    var results = {};
    for (let keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            await page.goto(`https://www.bloomberg.com/quote/${keyword}:US`);
            await page.waitForSelector('.pseudoMainContent', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sleep(1000);

            let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
            for (let item of news_items) {
                let url = item.$$('a').then((link) => {
                    link.getProperty('href').then((anchor) => {
                        return anchor;
                    })
                });
            }

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}

async function scrape_reuters_finance_pup(page, event, context, pluggable) {
    var results = {};
    for (let keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            await page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
            await page.waitForSelector('#sectionHeader', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sleep(500);

            let newsData = await page.evaluate(() => {
                let results = [];
                // get the hotel elements
                let items = document.querySelectorAll('div.feature');
                // get the hotel data
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        data.link = newsitem.querySelector('h2 a').getAttribute('href');
                        data.link = 'https://www.reuters.com' + data.link;
                        data.title = newsitem.querySelector('h2 a').innerText;
                        data.text = newsitem.querySelector('p').innerText;
                        data.date = newsitem.querySelector('.timestamp').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing reuters data: ', exception);
                    }
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}

async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
    var results = {};
    for (let keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            await page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
            await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sleep(500);

            let newsData = await page.evaluate(() => {
                let results = [];
                // get the hotel elements
                let items = document.querySelectorAll('div.headline');
                // get the hotel data
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        data.link = newsitem.querySelector('a').getAttribute('href');
                        data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText;
                        data.date = newsitem.querySelector('span.note').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing cnbc data: ', exception);
                    }
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }

        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}

    YahooFinanceScraper: YahooFinanceScraper,
    ReutersFinanceScraper: ReutersFinanceScraper,
    CnbcFinanceScraper: CnbcFinanceScraper,
    MarketwatchFinanceScraper: MarketwatchFinanceScraper,
};
@@ -144,13 +144,10 @@ module.exports.handler = async function handler (event, context, callback) {
        webcrawler: infospace.WebcrawlerNewsScraper,
        baidu: baidu.BaiduScraper,
        youtube: youtube.YoutubeScraper,
-
-       yahoo_news: tickersearch.not_implemented,
-       bloomberg: tickersearch.not_implemented,
-       reuters: tickersearch.not_implemented,
-       cnbc: tickersearch.not_implemented,
-       marketwatch: tickersearch.not_implemented,
-
+       yahoo_news: tickersearch.YahooFinanceScraper,
+       reuters: tickersearch.ReutersFinanceScraper,
+       cnbc: tickersearch.CnbcFinanceScraper,
+       marketwatch: tickersearch.MarketwatchFinanceScraper,
    }[config.search_engine];

    if (Scraper === undefined) {
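
Adding a further ticker engine thus comes down to exporting another Scraper subclass and registering it in this lookup table. A hypothetical entry (the `nasdaq` key and `NasdaqFinanceScraper` are not part of this commit):

```javascript
const Scraper = {
    // ...entries from the diff above, plus a hypothetical new engine:
    marketwatch: tickersearch.MarketwatchFinanceScraper,
    nasdaq: tickersearch.NasdaqFinanceScraper, // hypothetical
}[config.search_engine];
```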
@@ -56,7 +56,7 @@ function normal_search_test_case(err, response) {

    for (let res of obj.results) {

-       assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'rank'], 'not all keys are in the SERP object');
+       assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

        assert.isOk(res.link, 'link must be ok');
        assert.typeOf(res.link, 'string', 'link must be string');
test/test_ticker_search.js (new file, 227 lines)

@@ -0,0 +1,227 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const quote_search_keywords = ['MSFT', 'AAPL'];

async function reuters_search_test() {
    let config = {
        search_engine: 'reuters',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('reuters_search_test()');
    await se_scraper.scrape(config, reuters_search_test_case);
}

// we test with a callback function to our handler
function reuters_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        let total_rank = 1;

        for (let query in response.results) {

            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

async function cnbc_search_test() {
    let config = {
        search_engine: 'cnbc',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('cnbc_search_test()');
    await se_scraper.scrape(config, cnbc_search_test_case);
}

// we test with a callback function to our handler
function cnbc_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        let total_rank = 1;

        for (let query in response.results) {

            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

const marketwatch_search_keywords = ['MSFT'];

async function marketwatch_search_test() {
    let config = {
        search_engine: 'marketwatch',
        compress: false,
        debug: false,
        verbose: false,
        keywords: marketwatch_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('marketwatch_search_test()');
    await se_scraper.scrape(config, marketwatch_search_test_case);
}

// we test with a callback function to our handler
function marketwatch_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        let total_rank = 1;

        for (let query in response.results) {

            assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.author, 'author must be ok');
                    assert.typeOf(res.author, 'string', 'author must be string');
                    assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}


(async () => {
    await reuters_search_test();
    await cnbc_search_test();
    await marketwatch_search_test();
})();
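
Since the file ends in a self-invoking async function, the three ticker tests can also be run directly with node (assuming dependencies are installed and the command is run from the repository root):

```bash
node test/test_ticker_search.js
```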