faster scraping, added ticker search engines

This commit is contained in:
Nikolai Tschacher 2019-01-27 01:27:52 +01:00
parent b0e588f916
commit e78d7145b5
13 changed files with 244 additions and 63 deletions

View File

@ -65,6 +65,9 @@ let config = {
headless: false,
// path to output file, data will be stored in JSON
output_file: 'results.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true
};
se_scraper.scrape(config, (err, response) => {

View File

@ -7,6 +7,12 @@
- fix issue #3: add functionality to add keyword file
27.1.2019
- Add functionality to block images and CSS from loading as described here:
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
TODO:
- add proxy support
- add captcha service solving support

View File

@ -27,6 +27,9 @@ exports.scrape = async function(config, callback) {
headless: true,
// path to output file, data will be stored in JSON
output_file: '',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true
};
for (var key in config) {

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.0.6",
"version": "1.1.0",
"description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js",
"scripts": {

7
run.js
View File

@ -11,19 +11,22 @@ let config = {
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
// which search engine to scrape
search_engine: 'google',
search_engine: 'bing',
// whether debug information should be printed
debug: true,
// whether verbose program output should be printed
verbose: false,
// an array of keywords to scrape
keywords: ['scrapeulous.com', ],
keywords: ['MSFT', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// whether to start the browser in headless mode
headless: false,
// path to output file, data will be stored in JSON
output_file: 'data.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true
};
se_scraper.scrape(config, (err, response) => {

View File

@ -5,8 +5,7 @@ module.exports = {
scrape_baidu_pup: scrape_baidu_pup,
};
async function scrape_baidu_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_baidu_pup(page, event, context) {
await page.goto('https://www.baidu.com/');
try {

View File

@ -6,8 +6,7 @@ module.exports = {
scrape_bing_news_pup: scrape_bing_news_pup,
};
async function scrape_bing_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_bing_pup(page, event, context) {
await page.goto('https://www.bing.com/');
try {
@ -91,8 +90,7 @@ function parse(html) {
}
}
async function scrape_bing_news_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_bing_news_pup(page, event, context) {
await page.goto('https://www.bing.com/news/search?');
if (event.set_manual_settings === true) {

View File

@ -5,8 +5,7 @@ module.exports = {
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
};
async function scrape_duckduckgo_news_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_duckduckgo_news_pup(page, event, context) {
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
try {

View File

@ -26,9 +26,7 @@ const setTextInputValue = async (page, selector, value) => {
}, value, selector);
};
async function scrape_google_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_google_pup(page, event, context) {
await page.goto('https://www.google.com/');
try {
@ -92,14 +90,12 @@ async function scrape_google_pup(browser, event, context) {
let html = await page.content();
results[keyword] = parse_google_results(html);
}
return results;
}
async function scrape_google_pup_dr(browser, event, context) {
const page = await browser.newPage();
async function scrape_google_pup_dr(page, event, context) {
let keywords = event.keywords;
first = keywords[0];
var year = first.slice(-5);
@ -235,10 +231,7 @@ async function scraping_detected(page) {
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
async function scrape_google_news_old_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_google_news_old_pup(page, event, context) {
let keywords = event.keywords;
var results = {};
@ -347,9 +340,7 @@ function parse_google_news_results_se_format(html) {
}
}
async function scrape_google_image_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_google_image_pup(page, event, context) {
let keywords = event.keywords;
var results = {};
@ -477,9 +468,7 @@ function clean_image_url(url) {
const all_results = new Set();
async function scrape_google_news_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_google_news_pup(page, event, context) {
let keywords = event.keywords;
var results = {};

View File

@ -6,8 +6,7 @@ module.exports = {
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
};
async function scrape_infospace_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_infospace_pup(page, event, context) {
await page.goto('http://infospace.com/index.html');
try {
@ -89,8 +88,7 @@ function parse(html) {
}
}
async function scrape_webcrawler_news_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_webcrawler_news_pup(page, event, context) {
await page.goto('https://www.webcrawler.com/?qc=news');
try {

View File

@ -3,11 +3,16 @@ const sfunctions = require('./functions.js');
module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
scrape_bloomberg_finance_pup: scrape_bloomberg_finance_pup,
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
};
async function scrape_yahoo_finance_pup(browser, event, context) {
// https://www.google.com/search?q=MSFT&tbm=fin
async function scrape_yahoo_finance_pup(page, event, context) {
var results = {};
const page = await browser.newPage();
await page.goto('https://finance.yahoo.com/');
for (var i = 0; i < 3; i++) {
@ -34,7 +39,6 @@ async function scrape_yahoo_finance_pup(browser, event, context) {
console.error(`Problem with scraping ${keyword}: ${e}`);
}
}
return results;
}
@ -55,4 +59,175 @@ function parse(html) {
time: (new Date()).toUTCString(),
results: results,
}
}
/**
 * Scrapes the news headlines listed on the MarketWatch stock page of every keyword.
 *
 * @param {object} page - Puppeteer page used for navigation and in-page evaluation.
 * @param {object} event - Scrape settings; this function reads `keywords`, `debug` and `is_local`.
 * @param {object} context - Not used by this function.
 * @returns {Promise<object>} Object mapping each keyword to `{ time, results }`, where `time`
 *     is a UTC date string and `results` is an array of `{ link, title, date, author }` items.
 *     A keyword whose page fails to load is logged and left out of the object.
 */
async function scrape_marketwatch_finance_pup(page, event, context) {
    var results = {};

    for (let keyword of event.keywords) {
        try {
            // Encode the keyword so characters such as `?`, `#` or `/` cannot change the URL structure.
            await page.goto(`https://www.marketwatch.com/investing/stock/${encodeURIComponent(keyword)}`);
            await page.waitForSelector('.intraday__data', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // Brief pause before reading the DOM (unit is defined by sfunctions.sleep, presumably ms).
            await sfunctions.sleep(500);

            // The callback is executed inside the page, so it can only use DOM APIs and
            // its console.error output goes to the page console, not to Node's stderr.
            let newsData = await page.evaluate(() => {
                let results = [];
                // every news article teaser on the page
                let items = document.querySelectorAll('.article__content');
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        const headline = newsitem.querySelector('.article__headline a');
                        data.link = headline.getAttribute('href');
                        data.title = headline.innerText;
                        data.date = newsitem.querySelector('.article__timestamp').innerText;
                        data.author = newsitem.querySelector('.article__author').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing marketwatch data: ', exception);
                    }
                    // Best effort: keep whatever fields were extracted before a failure.
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Collects the news links shown on the Bloomberg quote page of every keyword.
 *
 * NOTE: according to the original author's remark, Bloomberg blocks the client after
 * a single request, so later keywords are likely to fail and be left out of the result.
 *
 * @param {object} page - Puppeteer page used for navigation and element queries.
 * @param {object} event - Scrape settings; this function reads `keywords`, `debug` and `is_local`.
 * @param {object} context - Not used by this function.
 * @returns {Promise<object>} Object mapping each keyword to `{ time, results }`, where `time`
 *     is a UTC date string and `results` is an array of `{ link }` items (one per news item
 *     that contains an anchor). A keyword whose page fails to load is logged and omitted.
 */
async function scrape_bloomberg_finance_pup(page, event, context) {
    var results = {};

    for (let keyword of event.keywords) {
        try {
            // Encode the keyword so characters such as `?`, `#` or `/` cannot change the URL structure.
            await page.goto(`https://www.bloomberg.com/quote/${encodeURIComponent(keyword)}:US`);
            await page.waitForSelector('.pseudoMainContent', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // Brief pause before reading the DOM (unit is defined by sfunctions.sleep, presumably ms).
            await sfunctions.sleep(1000);

            let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
            let newsData = [];
            for (let item of news_items) {
                // `$$` resolves to an ARRAY of element handles; every step is awaited so a
                // failure lands in the surrounding catch instead of an unhandled rejection.
                const anchors = await item.$$('a');
                if (anchors.length === 0) {
                    continue;
                }
                const href = await anchors[0].getProperty('href');
                newsData.push({ link: await href.jsonValue() });
            }

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Scrapes the featured news entries on the Reuters stock overview page of every keyword.
 *
 * @param {object} page - Puppeteer page used for navigation and in-page evaluation.
 * @param {object} event - Scrape settings; this function reads `keywords`, `debug` and `is_local`.
 * @param {object} context - Not used by this function.
 * @returns {Promise<object>} Object mapping each keyword to `{ time, results }`, where `time`
 *     is a UTC date string and `results` is an array of `{ link, title, text, date }` items.
 *     A keyword whose page fails to load is logged and left out of the object.
 */
async function scrape_reuters_finance_pup(page, event, context) {
    var results = {};

    for (let keyword of event.keywords) {
        try {
            // Encode the keyword so characters such as `?`, `#` or `/` cannot change the URL structure.
            await page.goto(`https://www.reuters.com/finance/stocks/overview/${encodeURIComponent(keyword)}`);
            await page.waitForSelector('#sectionHeader', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // Brief pause before reading the DOM (unit is defined by sfunctions.sleep, presumably ms).
            await sfunctions.sleep(500);

            // The callback is executed inside the page, so it can only use DOM APIs and
            // its console.error output goes to the page console, not to Node's stderr.
            let newsData = await page.evaluate(() => {
                let results = [];
                // every featured news entry on the page
                let items = document.querySelectorAll('div.feature');
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        const headline = newsitem.querySelector('h2 a');
                        // Resolve against the site origin instead of concatenating, so an
                        // href that is already absolute is kept intact.
                        data.link = new URL(headline.getAttribute('href'), 'https://www.reuters.com').href;
                        data.title = headline.innerText;
                        data.text = newsitem.querySelector('p').innerText;
                        data.date = newsitem.querySelector('.timestamp').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing reuters data: ', exception);
                    }
                    // Best effort: keep whatever fields were extracted before a failure.
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Scrapes the news headlines on the CNBC quote page (news tab) of every keyword.
 *
 * @param {object} page - Puppeteer page used for navigation and in-page evaluation.
 * @param {object} event - Scrape settings; this function reads `keywords`, `debug` and `is_local`.
 * @param {object} context - Not used by this function.
 * @returns {Promise<object>} Object mapping each keyword to `{ time, results }`, where `time`
 *     is a UTC date string and `results` is an array of `{ link, title, date }` items.
 *     A keyword whose page fails to load is logged and left out of the object.
 */
async function scrape_cnbc_finance_pup(page, event, context) {
    var results = {};

    for (let keyword of event.keywords) {
        try {
            // Encode the keyword: inside a query string a raw `&` or `#` would inject or
            // truncate parameters.
            await page.goto(`https://www.cnbc.com/quotes/?symbol=${encodeURIComponent(keyword)}&tab=news`);
            await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            // Brief pause before reading the DOM (unit is defined by sfunctions.sleep, presumably ms).
            await sfunctions.sleep(500);

            // The callback is executed inside the page, so it can only use DOM APIs and
            // its console.error output goes to the page console, not to Node's stderr.
            let newsData = await page.evaluate(() => {
                let results = [];
                // every headline entry on the page
                let items = document.querySelectorAll('div.headline');
                items.forEach((newsitem) => {
                    let data = {};
                    try {
                        data.link = newsitem.querySelector('a').getAttribute('href');
                        data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText;
                        data.date = newsitem.querySelector('span.note').innerText;
                    }
                    catch (exception) {
                        console.error('Error parsing cnbc data: ', exception);
                    }
                    // Best effort: keep whatever fields were extracted before a failure.
                    results.push(data);
                });
                return results;
            });

            results[keyword] = {
                time: (new Date()).toUTCString(),
                results: newsData,
            }
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}

View File

@ -7,8 +7,7 @@ module.exports = {
const all_videos = new Set();
async function scrape_youtube_pup(browser, event, context) {
const page = await browser.newPage();
async function scrape_youtube_pup(page, event, context) {
await page.goto('https://www.youtube.com');
try {

View File

@ -72,35 +72,40 @@ module.exports.handler = async function handler (event, context, callback) {
console.dir(headers);
}
// TODO: this is ugly, but I don't want to use too many objects and classes right now.
if (event.search_engine == 'google') {
results = await google.scrape_google_pup(browser, event, context);
} else if (event.search_engine == 'google_news_old') {
results = await google.scrape_google_news_old_pup(browser, event, context);
} else if (event.search_engine == 'google_news') {
results = await google.scrape_google_news_pup(browser, event, context);
} else if (event.search_engine == 'google_image') {
results = await google.scrape_google_image_pup(browser, event, context);
} else if (event.search_engine == 'bing') {
results = await bing.scrape_bing_pup(browser, event, context);
} else if (event.search_engine == 'bing_news') {
results = await bing.scrape_bing_news_pup(browser, event, context);
} else if (event.search_engine == 'infospace') {
results = await infospace.scrape_infospace_pup(browser, event, context);
} else if (event.search_engine == 'webcrawler') {
results = await infospace.scrape_webcrawler_news_pup(browser, event, context);
} else if (event.search_engine == 'baidu') {
results = await baidu.scrape_baidu_pup(browser, event, context);
} else if (event.search_engine == 'youtube') {
results = await youtube.scrape_youtube_pup(browser, event, context);
} else if (event.search_engine == 'duckduckgo_news') {
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
} else if (event.search_engine == 'google_dr') {
results = await google.scrape_google_pup_dr(browser, event, context);
} else if (event.search_engine == 'yahoo_news') {
results = await tickersearch.scrape_yahoo_finance_pup(browser, event, context);
const page = await browser.newPage();
if (event.block_assets === true) {
await page.setRequestInterception(true);
page.on('request', (req) => {
if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') {
req.abort();
} else {
req.continue();
}
});
}
results = await {
google: google.scrape_google_pup,
google_news_old: google.scrape_google_news_old_pup,
google_news: google.scrape_google_news_pup,
google_image: google.scrape_google_image_pup,
bing: bing.scrape_bing_pup,
bing_news: bing.scrape_bing_news_pup,
infospace: infospace.scrape_infospace_pup,
webcrawler: infospace.scrape_webcrawler_news_pup,
baidu: baidu.scrape_baidu_pup,
youtube: youtube.scrape_youtube_pup,
duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup,
google_dr: google.scrape_google_pup_dr,
yahoo_news: tickersearch.scrape_yahoo_finance_pup,
bloomberg: tickersearch.scrape_bloomberg_finance_pup,
reuters: tickersearch.scrape_reuters_finance_pup,
cnbc: tickersearch.scrape_cnbc_finance_pup,
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[event.search_engine](page, event, context);
let metadata = {};
if (event.write_meta_data === true) {
@ -203,6 +208,10 @@ function parseEventData(event) {
event.set_manual_settings = _bool(event.set_manual_settings);
}
if (event.block_assets) {
event.block_assets = _bool(event.block_assets);
}
if (event.sleep_range) {
// parse an array
event.sleep_range = eval(event.sleep_range);