forked from extern/se-scraper
faster scraping, added ticker search engines
This commit is contained in:
parent
b0e588f916
commit
e78d7145b5
@ -65,6 +65,9 @@ let config = {
|
||||
headless: false,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'results.json',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
|
6
TODO.txt
6
TODO.txt
@ -7,6 +7,12 @@
|
||||
|
||||
- fix issue #3: add functionality to add keyword file
|
||||
|
||||
27.1.2019
|
||||
|
||||
- Add functionality to block images and CSS from loading as described here:
|
||||
|
||||
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
|
||||
|
||||
TODO:
|
||||
- add proxy support
|
||||
- add captcha service solving support
|
||||
|
3
index.js
3
index.js
@ -27,6 +27,9 @@ exports.scrape = async function(config, callback) {
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true
|
||||
};
|
||||
|
||||
for (var key in config) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.0.6",
|
||||
"version": "1.1.0",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
|
7
run.js
7
run.js
@ -11,19 +11,22 @@ let config = {
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
search_engine: 'bing',
|
||||
// whether debug information should be printed
|
||||
debug: true,
|
||||
// whether verbose program output should be printed
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['scrapeulous.com', ],
|
||||
keywords: ['MSFT', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'data.json',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
|
@ -5,8 +5,7 @@ module.exports = {
|
||||
scrape_baidu_pup: scrape_baidu_pup,
|
||||
};
|
||||
|
||||
async function scrape_baidu_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_baidu_pup(page, event, context) {
|
||||
await page.goto('https://www.baidu.com/');
|
||||
|
||||
try {
|
||||
|
@ -6,8 +6,7 @@ module.exports = {
|
||||
scrape_bing_news_pup: scrape_bing_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_bing_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_bing_pup(page, event, context) {
|
||||
await page.goto('https://www.bing.com/');
|
||||
|
||||
try {
|
||||
@ -91,8 +90,7 @@ function parse(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_bing_news_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_bing_news_pup(page, event, context) {
|
||||
await page.goto('https://www.bing.com/news/search?');
|
||||
|
||||
if (event.set_manual_settings === true) {
|
||||
|
@ -5,8 +5,7 @@ module.exports = {
|
||||
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_duckduckgo_news_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_duckduckgo_news_pup(page, event, context) {
|
||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||
|
||||
try {
|
||||
|
@ -26,9 +26,7 @@ const setTextInputValue = async (page, selector, value) => {
|
||||
}, value, selector);
|
||||
};
|
||||
|
||||
|
||||
async function scrape_google_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_google_pup(page, event, context) {
|
||||
await page.goto('https://www.google.com/');
|
||||
|
||||
try {
|
||||
@ -92,14 +90,12 @@ async function scrape_google_pup(browser, event, context) {
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse_google_results(html);
|
||||
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
async function scrape_google_pup_dr(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_google_pup_dr(page, event, context) {
|
||||
let keywords = event.keywords;
|
||||
first = keywords[0];
|
||||
var year = first.slice(-5);
|
||||
@ -235,10 +231,7 @@ async function scraping_detected(page) {
|
||||
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||
}
|
||||
|
||||
|
||||
async function scrape_google_news_old_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
|
||||
async function scrape_google_news_old_pup(page, event, context) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
@ -347,9 +340,7 @@ function parse_google_news_results_se_format(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_google_image_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
|
||||
async function scrape_google_image_pup(page, event, context) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
@ -477,9 +468,7 @@ function clean_image_url(url) {
|
||||
|
||||
const all_results = new Set();
|
||||
|
||||
async function scrape_google_news_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
|
||||
async function scrape_google_news_pup(page, event, context) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
|
@ -6,8 +6,7 @@ module.exports = {
|
||||
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_infospace_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_infospace_pup(page, event, context) {
|
||||
await page.goto('http://infospace.com/index.html');
|
||||
|
||||
try {
|
||||
@ -89,8 +88,7 @@ function parse(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_webcrawler_news_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_webcrawler_news_pup(page, event, context) {
|
||||
await page.goto('https://www.webcrawler.com/?qc=news');
|
||||
|
||||
try {
|
||||
|
@ -3,11 +3,16 @@ const sfunctions = require('./functions.js');
|
||||
|
||||
module.exports = {
|
||||
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
|
||||
scrape_bloomberg_finance_pup: scrape_bloomberg_finance_pup,
|
||||
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
|
||||
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
|
||||
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
|
||||
};
|
||||
|
||||
async function scrape_yahoo_finance_pup(browser, event, context) {
|
||||
// https://www.google.com/search?q=MSFT&tbm=fin
|
||||
|
||||
async function scrape_yahoo_finance_pup(page, event, context) {
|
||||
var results = {};
|
||||
const page = await browser.newPage();
|
||||
await page.goto('https://finance.yahoo.com/');
|
||||
|
||||
for (var i = 0; i < 3; i++) {
|
||||
@ -34,7 +39,6 @@ async function scrape_yahoo_finance_pup(browser, event, context) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
@ -55,4 +59,175 @@ function parse(html) {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: results,
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Scrapes the news teasers listed on MarketWatch stock pages.
 *
 * For every keyword (ticker symbol) the stock page is opened and each
 * `.article__content` element is turned into an object carrying whichever of
 * `link`, `title`, `date` and `author` could be found.
 *
 * @param {Object} page - puppeteer page used for navigation.
 * @param {Object} event - scrape configuration; reads `keywords`, `debug` and `is_local`.
 * @param {Object} context - unused, kept for signature parity with the other scrapers.
 * @returns {Promise<Object>} map of keyword -> `{ time, results }`. Keywords whose
 *     page could not be scraped are logged and left out of the map.
 */
async function scrape_marketwatch_finance_pup(page, event, context) {
    const results = {};

    for (const keyword of event.keywords) {
        try {
            // Encode the ticker so characters such as '/' or '?' cannot alter the URL structure.
            await page.goto(`https://www.marketwatch.com/investing/stock/${encodeURIComponent(keyword)}`);
            await page.waitForSelector('.intraday__data', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sfunctions.sleep(500);

            const newsData = await page.evaluate(() => {
                // This callback is evaluated in the page, so it must not
                // reference anything from the surrounding Node.js scope.
                const collected = [];

                document.querySelectorAll('.article__content').forEach((newsitem) => {
                    const data = {};
                    const headline = newsitem.querySelector('.article__headline a');
                    const timestamp = newsitem.querySelector('.article__timestamp');
                    const author = newsitem.querySelector('.article__author');

                    // Every field is optional: a missing element must not
                    // prevent the remaining fields from being extracted.
                    if (headline) {
                        data.link = headline.getAttribute('href');
                        data.title = headline.innerText;
                    }
                    if (timestamp) {
                        data.date = timestamp.innerText;
                    }
                    if (author) {
                        data.author = author.innerText;
                    }

                    // Skip containers that yielded nothing instead of emitting `{}`.
                    if (Object.keys(data).length > 0) {
                        collected.push(data);
                    }
                });

                return collected;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            };
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
||||
|
||||
|
||||
/**
 * Scrapes the news links shown on Bloomberg quote pages.
 *
 * NOTE: according to the original author, Bloomberg blocks the client after a
 * single request, so scraping more than one keyword per session is unlikely
 * to succeed.
 *
 * @param {Object} page - puppeteer page used for navigation.
 * @param {Object} event - scrape configuration; reads `keywords`, `debug` and `is_local`.
 * @param {Object} context - unused, kept for signature parity with the other scrapers.
 * @returns {Promise<Object>} map of keyword -> `{ time, results }` where `results`
 *     is an array of `{ link }` objects. Keywords whose page could not be
 *     scraped are logged and left out of the map.
 */
async function scrape_bloomberg_finance_pup(page, event, context) {
    const results = {};

    for (const keyword of event.keywords) {
        try {
            // Encode the ticker so characters such as '/' or '?' cannot alter the URL structure.
            await page.goto(`https://www.bloomberg.com/quote/${encodeURIComponent(keyword)}:US`);
            await page.waitForSelector('.pseudoMainContent', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sfunctions.sleep(1000);

            const news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
            const links = [];

            for (const item of news_items) {
                // `$` yields the first matching anchor (or null). Each step is
                // awaited so failures land in the surrounding catch instead of
                // becoming unhandled rejections.
                const anchor = await item.$('a');
                if (!anchor) {
                    continue;
                }
                const hrefHandle = await anchor.getProperty('href');
                const href = await hrefHandle.jsonValue();
                if (href) {
                    links.push({ link: href });
                }
            }

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: links,
            };
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
||||
|
||||
/**
 * Scrapes the news features listed on Reuters stock overview pages.
 *
 * For every keyword (ticker symbol) the overview page is opened and each
 * `div.feature` element is turned into an object carrying whichever of
 * `link`, `title`, `text` and `date` could be found.
 *
 * @param {Object} page - puppeteer page used for navigation.
 * @param {Object} event - scrape configuration; reads `keywords`, `debug` and `is_local`.
 * @param {Object} context - unused, kept for signature parity with the other scrapers.
 * @returns {Promise<Object>} map of keyword -> `{ time, results }`. Keywords whose
 *     page could not be scraped are logged and left out of the map.
 */
async function scrape_reuters_finance_pup(page, event, context) {
    const results = {};

    for (const keyword of event.keywords) {
        try {
            // Encode the ticker so characters such as '/' or '?' cannot alter the URL structure.
            await page.goto(`https://www.reuters.com/finance/stocks/overview/${encodeURIComponent(keyword)}`);
            await page.waitForSelector('#sectionHeader', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sfunctions.sleep(500);

            const newsData = await page.evaluate(() => {
                // This callback is evaluated in the page, so it must not
                // reference anything from the surrounding Node.js scope.
                const collected = [];

                document.querySelectorAll('div.feature').forEach((newsitem) => {
                    const data = {};
                    const headline = newsitem.querySelector('h2 a');
                    const teaser = newsitem.querySelector('p');
                    const timestamp = newsitem.querySelector('.timestamp');

                    // Every field is optional: a missing element must not
                    // prevent the remaining fields from being extracted.
                    if (headline) {
                        const href = headline.getAttribute('href');
                        if (href !== null) {
                            // Resolve against the site root: relative paths get
                            // the host prepended, absolute URLs stay intact.
                            try {
                                data.link = new URL(href, 'https://www.reuters.com').href;
                            } catch (e) {
                                data.link = href;
                            }
                        }
                        data.title = headline.innerText;
                    }
                    if (teaser) {
                        data.text = teaser.innerText;
                    }
                    if (timestamp) {
                        data.date = timestamp.innerText;
                    }

                    // Skip containers that yielded nothing instead of emitting `{}`.
                    if (Object.keys(data).length > 0) {
                        collected.push(data);
                    }
                });

                return collected;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            };
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
||||
|
||||
/**
 * Scrapes the headlines listed on the news tab of CNBC quote pages.
 *
 * For every keyword (ticker symbol) the quote page is opened and each
 * `div.headline` element is turned into an object carrying whichever of
 * `link`, `title` and `date` could be found.
 *
 * @param {Object} page - puppeteer page used for navigation.
 * @param {Object} event - scrape configuration; reads `keywords`, `debug` and `is_local`.
 * @param {Object} context - unused, kept for signature parity with the other scrapers.
 * @returns {Promise<Object>} map of keyword -> `{ time, results }`. Keywords whose
 *     page could not be scraped are logged and left out of the map.
 */
async function scrape_cnbc_finance_pup(page, event, context) {
    const results = {};

    for (const keyword of event.keywords) {
        try {
            // Encode the ticker so '&', '#' or '=' cannot corrupt the query string.
            await page.goto(`https://www.cnbc.com/quotes/?symbol=${encodeURIComponent(keyword)}&tab=news`);
            await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            await sfunctions.sleep(500);

            const newsData = await page.evaluate(() => {
                // This callback is evaluated in the page, so it must not
                // reference anything from the surrounding Node.js scope.
                const collected = [];

                document.querySelectorAll('div.headline').forEach((newsitem) => {
                    const data = {};
                    const anchor = newsitem.querySelector('a');
                    const headline = newsitem.querySelector('[ng-bind="asset.headline"]');
                    const note = newsitem.querySelector('span.note');

                    // Every field is optional: a missing element must not
                    // prevent the remaining fields from being extracted.
                    if (anchor) {
                        data.link = anchor.getAttribute('href');
                    }
                    if (headline) {
                        data.title = headline.innerText;
                    }
                    if (note) {
                        data.date = note.innerText;
                    }

                    // Skip containers that yielded nothing instead of emitting `{}`.
                    if (Object.keys(data).length > 0) {
                        collected.push(data);
                    }
                });

                return collected;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            };
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
@ -7,8 +7,7 @@ module.exports = {
|
||||
|
||||
const all_videos = new Set();
|
||||
|
||||
async function scrape_youtube_pup(browser, event, context) {
|
||||
const page = await browser.newPage();
|
||||
async function scrape_youtube_pup(page, event, context) {
|
||||
await page.goto('https://www.youtube.com');
|
||||
|
||||
try {
|
||||
|
@ -72,35 +72,40 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
console.dir(headers);
|
||||
}
|
||||
|
||||
// TODO: this is ugly, but I don't want to use too many objects and classes right now.
|
||||
if (event.search_engine == 'google') {
|
||||
results = await google.scrape_google_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'google_news_old') {
|
||||
results = await google.scrape_google_news_old_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'google_news') {
|
||||
results = await google.scrape_google_news_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'google_image') {
|
||||
results = await google.scrape_google_image_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'bing') {
|
||||
results = await bing.scrape_bing_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'bing_news') {
|
||||
results = await bing.scrape_bing_news_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'infospace') {
|
||||
results = await infospace.scrape_infospace_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'webcrawler') {
|
||||
results = await infospace.scrape_webcrawler_news_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'baidu') {
|
||||
results = await baidu.scrape_baidu_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'youtube') {
|
||||
results = await youtube.scrape_youtube_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'duckduckgo_news') {
|
||||
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
|
||||
} else if (event.search_engine == 'google_dr') {
|
||||
results = await google.scrape_google_pup_dr(browser, event, context);
|
||||
} else if (event.search_engine == 'yahoo_news') {
|
||||
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
|
||||
const page = await browser.newPage();
|
||||
|
||||
if (event.block_assets === true) {
|
||||
await page.setRequestInterception(true);
|
||||
|
||||
page.on('request', (req) => {
|
||||
if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
results = await {
|
||||
google: google.scrape_google_pup,
|
||||
google_news_old: google.scrape_google_news_old_pup,
|
||||
google_news: google.scrape_google_news_pup,
|
||||
google_image: google.scrape_google_image_pup,
|
||||
bing: bing.scrape_bing_pup,
|
||||
bing_news: bing.scrape_bing_news_pup,
|
||||
infospace: infospace.scrape_infospace_pup,
|
||||
webcrawler: infospace.scrape_webcrawler_news_pup,
|
||||
baidu: baidu.scrape_baidu_pup,
|
||||
youtube: youtube.scrape_youtube_pup,
|
||||
duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup,
|
||||
google_dr: google.scrape_google_pup_dr,
|
||||
yahoo_news: tickersearch.scrape_yahoo_finance_pup,
|
||||
bloomberg: tickersearch.scrape_bloomberg_finance_pup,
|
||||
reuters: tickersearch.scrape_reuters_finance_pup,
|
||||
cnbc: tickersearch.scrape_cnbc_finance_pup,
|
||||
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
||||
}[event.search_engine](page, event, context);
|
||||
|
||||
let metadata = {};
|
||||
|
||||
if (event.write_meta_data === true) {
|
||||
@ -203,6 +208,10 @@ function parseEventData(event) {
|
||||
event.set_manual_settings = _bool(event.set_manual_settings);
|
||||
}
|
||||
|
||||
if (event.block_assets) {
|
||||
event.block_assets = _bool(event.block_assets);
|
||||
}
|
||||
|
||||
if (event.sleep_range) {
|
||||
// parse an array
|
||||
event.sleep_range = eval(event.sleep_range);
|
||||
|
Loading…
Reference in New Issue
Block a user