commit bab902e80a
parent 9cfa502851

supporting yahoo ticker search for news
README.md
@@ -52,13 +52,17 @@ let config = {
     // is drawn before every request. empty string for no sleeping.
     sleep_range: '[1,1]',
     // which search engine to scrape
-    search_engine: 'google',
+    search_engine: 'yahoo_news',
     // whether debug information should be printed
     debug: 'true',
     // whether verbose program output should be printed
     verbose: 'false',
     // an array of keywords to scrape
-    keywords: ['incolumitas.com scraping', 'best scraping framework'],
+    keywords: ['GOOGL', ],
+    // alternatively you can specify a keyword_file. this overwrites the keywords array
+    keyword_file: './keywords.txt',
+    // whether to start the browser in headless mode
+    headless: false,
 };
 
 se_scraper.scrape(config, (err, response) => {
@@ -90,6 +94,7 @@ Supported options for the `search_engine` config key:
 'youtube'
 'duckduckgo_news'
 'google_dr'
+'yahoo_news'
 ```
 
 Output for the above script on my laptop:
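For reference, a complete run script against the new engine might look like this — a minimal sketch, assuming se-scraper is installed from npm and that the callback receives the per-keyword results object built by the new module:

```js
// minimal sketch, assuming se-scraper is installed via npm
const se_scraper = require('se-scraper');

let config = {
    search_engine: 'yahoo_news',   // the engine added by this commit
    keywords: ['GOOGL', 'AAPL'],   // ticker symbols, not search phrases
    sleep_range: '[1,1]',
    debug: 'false',
    verbose: 'false',
    headless: false,               // new option: show the browser window
};

se_scraper.scrape(config, (err, response) => {
    if (err) {
        return console.error(err);
    }
    // one entry per ticker, each with a timestamp and a list of news items
    console.dir(response, { depth: null });
});
```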
TODO.txt (5 lines changed)
@@ -2,6 +2,11 @@
 - fix interface to scrape() [DONE]
 - add to Github
 
+
+24.1.2018
+
+- fix issue #3: add functionality to add keyword file
+
 TODO:
 - add proxy support
 - add captcha service solving support
index.js (7 lines changed)
@@ -1,5 +1,6 @@
 const handler = require('./src/node_scraper.js');
 var fs = require('fs');
+var os = require("os");
 
 exports.scrape = function(config, callback) {
     // options for scraping
@@ -21,14 +22,14 @@ exports.scrape = function(config, callback) {
         compress: 'false', // compress
         debug: 'false',
         verbose: 'false',
-        keywords: [],
+        keywords: ['test'],
     };
 
     for (var key in config) {
         event[key] = config[key];
     }
 
-    if (fs.existsSync( event.keyword_file )) {
+    if (fs.existsSync(event.keyword_file)) {
         event.keywords = read_keywords_from_file(event.keyword_file);
     }
 
@@ -47,7 +48,7 @@ exports.scrape = function(config, callback) {
 };
 
 function read_keywords_from_file(fname) {
-    let kws = fs.readFileSync(fname).toString().split("\n");
+    let kws = fs.readFileSync(fname).toString().split(os.EOL);
     // clean keywords
     kws = kws.filter((kw) => {
         return kw.trim().length > 0;
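One side effect of splitting on os.EOL is that a keyword file written with Unix line endings will no longer split correctly on Windows, and vice versa. A more tolerant variant — an alternative sketch, not what this commit does — would accept either ending:

```js
const fs = require('fs');

// alternative sketch (not part of this commit): split on "\n" or "\r\n"
// so keyword files created on another OS still parse correctly
function read_keywords_from_file(fname) {
    return fs.readFileSync(fname).toString()
        .split(/\r?\n/)
        .filter((kw) => kw.trim().length > 0);
}
```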
keywords.txt
@@ -1,3 +1,2 @@
-google scraper nikolait
-mount everest
-incolumitas.com
+GOOGL
+AAPL
package-lock.json (generated, 2 lines changed)
@@ -1,6 +1,6 @@
 {
     "name": "se-scraper",
-    "version": "1.0.0",
+    "version": "1.0.5",
     "lockfileVersion": 1,
     "requires": true,
     "dependencies": {
package.json
@@ -1,6 +1,6 @@
 {
     "name": "se-scraper",
-    "version": "1.0.4",
+    "version": "1.0.5",
     "description": "A simple module which uses puppeteer to scrape several search engines.",
     "main": "index.js",
     "scripts": {
run.js (8 lines changed)
@@ -11,13 +11,17 @@ let config = {
     // is drawn before every request. empty string for no sleeping.
     sleep_range: '[1,1]',
     // which search engine to scrape
-    search_engine: 'google',
+    search_engine: 'yahoo_news',
     // whether debug information should be printed
     debug: 'true',
     // whether verbose program output should be printed
     verbose: 'false',
     // an array of keywords to scrape
-    keywords: ['incolumitas.com scraping', 'best scraping framework'],
+    keywords: ['GOOGL', ],
+    // alternatively you can specify a keyword_file. this overwrites the keywords array
+    keyword_file: './keywords.txt',
+    // whether to start the browser in headless mode
+    headless: false,
 };
 
 se_scraper.scrape(config, (err, response) => {
se-scraper.iml (new file, 9 lines)
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
src/modules/functions.js
@@ -3,8 +3,16 @@ module.exports = {
     effective_query: effective_query,
     sleep: sleep,
     random_sleep: random_sleep,
+    set_input_value: set_input_value,
 };
 
+async function set_input_value(page, selector, value) {
+    await page.waitFor(selector);
+    await page.evaluate((value, selector) => {
+        return document.querySelector(selector).value = value;
+    }, value, selector);
+}
+
 function no_results(needles, html) {
     return !needles.map((needle) => { return html.indexOf(needle)})
         .every((res) => { return res == -1});
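The new helper waits for the selector and then assigns the value inside the page context. A hedged usage sketch — the selector here is hypothetical, not taken from this commit:

```js
const sfunctions = require('./src/modules/functions.js');

// hypothetical example: type a ticker into a search box and submit it;
// '#search-input' is an assumed selector, used for illustration only
async function search_ticker(page, ticker) {
    await sfunctions.set_input_value(page, '#search-input', ticker);
    await page.keyboard.press('Enter');
}
```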
src/modules/ticker_search.js (new file, 58 lines)
@@ -0,0 +1,58 @@
+const cheerio = require('cheerio');
+const sfunctions = require('./functions.js');
+
+module.exports = {
+    scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
+};
+
+async function scrape_yahoo_finance_pup(browser, event, context) {
+    var results = {};
+    const page = await browser.newPage();
+    await page.goto('https://finance.yahoo.com/');
+
+    for (var i = 0; i < 3; i++) {
+        let consent = await page.waitForSelector('[type="submit"]');
+        await consent.click();
+    }
+
+    for (let keyword of event.keywords) {
+        try {
+            await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
+
+            await page.waitForSelector('#quote-header-info', { timeout: 8000 });
+
+            if (event.debug === true && event.is_local === true) {
+                await page.screenshot({path: `debug/${keyword}.png`});
+            }
+
+            await sfunctions.sleep(1000);
+
+            let html = await page.content();
+            results[keyword] = parse(html);
+
+        } catch (e) {
+            console.error(`Problem with scraping ${keyword}: ${e}`);
+        }
+    }
+
+    return results;
+}
+
+function parse(html) {
+    // load the page source into cheerio
+    const $ = cheerio.load(html);
+
+    const results = [];
+    $('.js-stream-content .Cf').each((i, link) => {
+        results.push({
+            link: $(link).find('h3 a').attr('href'),
+            title: $(link).find('h3').text(),
+            snippet: $(link).find('p').text(),
+        })
+    });
+
+    return {
+        time: (new Date()).toUTCString(),
+        results: results,
+    }
+}
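For orientation, the object that scrape_yahoo_finance_pup() resolves to maps each ticker to the output of parse(). A sketch of the shape — all field values below are invented for illustration:

```js
// illustrative result shape; the URL, title, snippet and timestamp are made up
const example_response = {
    'GOOGL': {
        time: 'Thu, 24 Jan 2019 10:00:00 GMT',
        results: [
            {
                link: 'https://finance.yahoo.com/news/example-article.html',
                title: 'Example headline about Alphabet',
                snippet: 'Example snippet text from the news stream ...',
            },
        ],
    },
};
```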
src/node_scraper.js
@@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js');
 const ua = require('./modules/user_agents.js');
 const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
+const tickersearch = require('./modules/ticker_search.js');
 
 module.exports.handler = async function handler (event, context, callback) {
 
@@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) {
 
     browser = await puppeteer.launch({
         args: ADDITIONAL_CHROME_FLAGS,
-        headless: true,
+        headless: event.headless !== false,
     });
 
     if (event.log_http_headers === true) {
@@ -87,6 +88,8 @@ module.exports.handler = async function handler (event, context, callback) {
         results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
     } else if (event.search_engine == 'google_dr') {
         results = await google.scrape_google_pup_dr(browser, event, context);
+    } else if (event.search_engine == 'yahoo_news') {
+        results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
     }
 
     let metadata = {};
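Note the semantics of the new launch flag: `event.headless !== false` keeps headless mode as the default and only shows a browser window when the caller explicitly opts out. A small sketch of the behavior:

```js
// behavior of the new launch flag (sketch)
function resolve_headless(event) {
    // headless stays on by default; only an explicit `headless: false` opts out
    return event.headless !== false;
}

console.log(resolve_headless({}));                  // true  (default unchanged)
console.log(resolve_headless({ headless: true }));  // true
console.log(resolve_headless({ headless: false })); // false (visible browser)
```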