mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-21 23:23:07 +01:00
supporting yahoo ticker search for news
This commit is contained in:
parent
9cfa502851
commit
bab902e80a
@ -52,13 +52,17 @@ let config = {
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
search_engine: 'yahoo_news',
|
||||
// whether debug information should be printed
|
||||
debug: 'true',
|
||||
// whether verbose program output should be printed
|
||||
verbose: 'false',
|
||||
// an array of keywords to scrape
|
||||
keywords: ['incolumitas.com scraping', 'best scraping framework'],
|
||||
keywords: ['GOOGL', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: './keywords.txt',
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
@ -90,6 +94,7 @@ Supported options for the `search_engine` config key:
|
||||
'youtube'
|
||||
'duckduckgo_news'
|
||||
'google_dr'
|
||||
'yahoo_news'
|
||||
```
|
||||
|
||||
Output for the above script on my laptop:
|
||||
|
5
TODO.txt
5
TODO.txt
@ -2,6 +2,11 @@
|
||||
- fix interface to scrape() [DONE]
|
||||
- add to Github
|
||||
|
||||
|
||||
24.1.2018
|
||||
|
||||
- fix issue #3: add functionality to add keyword file
|
||||
|
||||
TODO:
|
||||
- add proxy support
|
||||
- add captcha service solving support
|
||||
|
7
index.js
7
index.js
@ -1,5 +1,6 @@
|
||||
const handler = require('./src/node_scraper.js');
|
||||
var fs = require('fs');
|
||||
var os = require("os");
|
||||
|
||||
exports.scrape = function(config, callback) {
|
||||
// options for scraping
|
||||
@ -21,14 +22,14 @@ exports.scrape = function(config, callback) {
|
||||
compress: 'false', // compress
|
||||
debug: 'false',
|
||||
verbose: 'false',
|
||||
keywords: [],
|
||||
keywords: ['test'],
|
||||
};
|
||||
|
||||
for (var key in config) {
|
||||
event[key] = config[key];
|
||||
}
|
||||
|
||||
if (fs.existsSync( event.keyword_file )) {
|
||||
if (fs.existsSync(event.keyword_file)) {
|
||||
event.keywords = read_keywords_from_file(event.keyword_file);
|
||||
}
|
||||
|
||||
@ -47,7 +48,7 @@ exports.scrape = function(config, callback) {
|
||||
};
|
||||
|
||||
function read_keywords_from_file(fname) {
|
||||
let kws = fs.readFileSync(fname).toString().split("\n");
|
||||
let kws = fs.readFileSync(fname).toString().split(os.EOL);
|
||||
// clean keywords
|
||||
kws = kws.filter((kw) => {
|
||||
return kw.trim().length > 0;
|
||||
|
@ -1,3 +1,2 @@
|
||||
google scraper nikolait
|
||||
mount everest
|
||||
incolumitas.com
|
||||
GOOGL
|
||||
AAPL
|
2
package-lock.json
generated
2
package-lock.json
generated
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.0.0",
|
||||
"version": "1.0.5",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.0.4",
|
||||
"version": "1.0.5",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
|
8
run.js
8
run.js
@ -11,13 +11,17 @@ let config = {
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
search_engine: 'yahoo_news',
|
||||
// whether debug information should be printed
|
||||
debug: 'true',
|
||||
// whether verbose program output should be printed
|
||||
verbose: 'false',
|
||||
// an array of keywords to scrape
|
||||
keywords: ['incolumitas.com scraping', 'best scraping framework'],
|
||||
keywords: ['GOOGL', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: './keywords.txt',
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
|
9
se-scraper.iml
Normal file
9
se-scraper.iml
Normal file
@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
@ -3,8 +3,16 @@ module.exports = {
|
||||
effective_query: effective_query,
|
||||
sleep: sleep,
|
||||
random_sleep: random_sleep,
|
||||
set_input_value: set_input_value,
|
||||
};
|
||||
|
||||
/**
 * Assign `value` to the `value` property of the first element matching
 * `selector`, after waiting for that element to appear in the page.
 *
 * @param {object} page - page object exposing `waitForSelector` and `evaluate`
 *     (a puppeteer Page in this project).
 * @param {string} selector - CSS selector of the element to fill.
 * @param {string} value - value to assign to the element.
 * @returns {Promise<void>} resolves once the assignment ran in the page.
 */
async function set_input_value(page, selector, value) {
    // Use waitForSelector: the overloaded page.waitFor() is deprecated and
    // was removed from later puppeteer releases.
    await page.waitForSelector(selector);

    // The callback runs inside the page; both arguments are passed explicitly
    // because it cannot close over variables from this scope.
    await page.evaluate((val, sel) => {
        document.querySelector(sel).value = val;
    }, value, selector);
}
|
||||
|
||||
function no_results(needles, html) {
|
||||
return !needles.map((needle) => { return html.indexOf(needle)})
|
||||
.every((res) => { return res == -1});
|
||||
|
58
src/modules/ticker_search.js
Normal file
58
src/modules/ticker_search.js
Normal file
@ -0,0 +1,58 @@
|
||||
const cheerio = require('cheerio');
|
||||
const sfunctions = require('./functions.js');
|
||||
|
||||
module.exports = {
|
||||
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
|
||||
};
|
||||
|
||||
/**
 * Scrape the Yahoo Finance news page of every ticker in `event.keywords`.
 *
 * Opens one page, loads the Yahoo Finance start page, clicks a submit
 * element three times, and then visits the news page of each keyword in turn.
 *
 * @param {object} browser - object exposing `newPage()` (a puppeteer Browser
 *     in this project).
 * @param {object} event - scrape configuration; reads `keywords` (iterable of
 *     ticker strings), `debug` and `is_local`.
 * @param {object} context - unused here; kept for signature parity with the
 *     other scraper modules.
 * @returns {Promise<object>} map of keyword to the value returned by
 *     `parse()`. Keywords whose scrape failed are logged and omitted.
 *     Failures in the initial navigation or the submit-click loop are not
 *     caught and reject the returned promise.
 */
async function scrape_yahoo_finance_pup(browser, event, context) {
    const results = {};
    const page = await browser.newPage();
    await page.goto('https://finance.yahoo.com/');

    // Presumably clicks through the multi-step consent dialog — TODO confirm.
    // `consent` is declared locally; the undeclared assignment used before
    // leaked a global and throws in strict mode.
    for (let i = 0; i < 3; i++) {
        const consent = await page.waitForSelector('[type="submit"]');
        await consent.click();
    }

    for (const keyword of event.keywords) {
        try {
            // Encode the ticker so symbols such as '^GSPC' yield a valid URL.
            const ticker = encodeURIComponent(keyword);
            await page.goto(`https://finance.yahoo.com/quote/${ticker}/news?p=${ticker}`);

            await page.waitForSelector('#quote-header-info', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sfunctions.sleep(1000);

            const html = await page.content();
            results[keyword] = parse(html);

        } catch (e) {
            // One failing keyword must not abort the remaining ones.
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
||||
|
||||
/**
 * Extract news entries from the HTML of a Yahoo Finance news page.
 *
 * @param {string} html - page source to parse.
 * @returns {{time: string, results: Array<{link: (string|undefined), title: string, snippet: string}>}}
 *     `time` is the UTC timestamp of the parse; `results` holds one entry per
 *     element matching `.js-stream-content .Cf`, in document order.
 */
function parse(html) {
    const $ = cheerio.load(html);

    // Build one record per matched stream item.
    const entries = $('.js-stream-content .Cf').toArray().map((node) => {
        const item = $(node);
        return {
            link: item.find('h3 a').attr('href'),
            title: item.find('h3').text(),
            snippet: item.find('p').text(),
        };
    });

    const time = (new Date()).toUTCString();
    return { time: time, results: entries };
}
|
@ -10,6 +10,7 @@ const youtube = require('./modules/youtube.js');
|
||||
const ua = require('./modules/user_agents.js');
|
||||
const meta = require('./modules/metadata.js');
|
||||
const duckduckgo = require('./modules/duckduckgo.js');
|
||||
const tickersearch = require('./modules/ticker_search.js');
|
||||
|
||||
module.exports.handler = async function handler (event, context, callback) {
|
||||
|
||||
@ -54,7 +55,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
args: ADDITIONAL_CHROME_FLAGS,
|
||||
headless: true,
|
||||
headless: event.headless !== false,
|
||||
});
|
||||
|
||||
if (event.log_http_headers === true) {
|
||||
@ -87,7 +88,9 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'google_dr') {
|
||||
results = await google.scrape_google_pup_dr(browser, event, context);
|
||||
}
|
||||
} else if (event.search_engine == 'yahoo_news') {
|
||||
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
|
||||
}
|
||||
|
||||
let metadata = {};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user