before_keyword_scraped() hook supported

This commit is contained in:
Nikolai Tschacher 2019-01-29 13:29:24 +01:00
parent c5e3e84e1d
commit 89441070cd
15 changed files with 309 additions and 117 deletions

View File

@ -15,6 +15,12 @@
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
TODO:
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
- add proxy support
- add captcha service solving support
- check if news instances run the same browser and if we can have one proxy per tab workers
- check if news instances run the same browser and if we can have one proxy per tab workers
TODO:
- think whether it makes sense to introduce a generic scraping class?
- is scraping abstractable or is every scraper too unique?
- don't make the same mistakes as with GoogleScraper

File diff suppressed because one or more lines are too long

View File

@ -26,14 +26,27 @@ module.exports = class Pluggable {
await this.browser.close();
}
// Callback invoked after metadata has been gathered.
// `args` is whatever the caller hands over; its shape is not visible
// from here -- TODO confirm against the call site.
// The body is empty: fill it in to persist the metadata.
async handle_metadata(args) {
// store scraping metadata somewhere
}
// Callback invoked after all keywords have been scraped.
// `args` is whatever the caller hands over; its shape is not visible
// from here -- TODO confirm against the call site.
// The body is empty: fill it in to persist the results.
async handle_results(args) {
// store the results somewhere
}
// Callback invoked before a keyword is scraped.
// The scrapers await this hook once per keyword and pass a single object
// with the properties `keyword`, `page`, `event` and `context`.
// This implementation only logs a fixed message and ignores `args`.
async before_keyword_scraped(args) {
console.log('before keyword scraped.');
}
// Callback invoked after a keyword has been scraped.
// TODO: implement this
async after_keyword_scraped(args) {
console.log('after keyword scraped.')
}
async start_browser(args={}) {
const puppeteer = require('puppeteer');

View File

@ -22,20 +22,22 @@ exports.scrape = async function(config, callback) {
compress: false, // compress
debug: false,
verbose: false,
keywords: ['test'],
keywords: ['scrapeulous.com'],
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
output_file: '',
// whether to prevent images, css, fonts from being loaded
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
// path to js module that extends functionality
// this module should export the functions:
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
};
// overwrite default config
for (var key in config) {
event[key] = config[key];
}

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.1.5",
"version": "1.1.7",
"description": "A simple module which uses puppeteer to scrape several search engines.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

4
run.js
View File

@ -10,7 +10,7 @@ let config = {
write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
sleep_range: '[1,1]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
@ -20,7 +20,7 @@ let config = {
// this output is informational
verbose: false,
// an array of keywords to scrape
keywords: ['incolumitas.com news', ],
keywords: ['trump', 'chief'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// whether to start the browser in headless mode

View File

@ -5,7 +5,7 @@ module.exports = {
scrape_baidu_pup: scrape_baidu_pup,
};
async function scrape_baidu_pup(page, event, context) {
async function scrape_baidu_pup(page, event, context, pluggable) {
await page.goto('https://www.baidu.com/');
try {
@ -21,6 +21,15 @@ async function scrape_baidu_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="wd"]');
// overwrites last text in input

View File

@ -6,7 +6,7 @@ module.exports = {
scrape_bing_news_pup: scrape_bing_news_pup,
};
async function scrape_bing_pup(page, event, context) {
async function scrape_bing_pup(page, event, context, pluggable) {
await page.goto('https://www.bing.com/');
try {
@ -22,6 +22,15 @@ async function scrape_bing_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
@ -90,7 +99,7 @@ function parse(html) {
}
}
async function scrape_bing_news_pup(page, event, context) {
async function scrape_bing_news_pup(page, event, context, pluggable) {
await page.goto('https://www.bing.com/news/search?');
if (event.set_manual_settings === true) {
@ -109,12 +118,17 @@ async function scrape_bing_news_pup(page, event, context) {
for (var i = 0; i < keywords.length; i++) {
if (sfunctions.should_turn_down(context)) {
break;
}
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input

View File

@ -5,7 +5,7 @@ module.exports = {
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
};
async function scrape_duckduckgo_news_pup(page, event, context) {
async function scrape_duckduckgo_news_pup(page, event, context, pluggable) {
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
try {
@ -21,6 +21,15 @@ async function scrape_duckduckgo_news_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input

View File

@ -12,7 +12,7 @@ module.exports = {
const STANDARD_TIMEOUT = 8000;
const SOLVE_CAPTCHA_TIME = 45000;
async function scrape_google_pup(page, event, context) {
async function scrape_google_pup(page, event, context, pluggable) {
await page.goto('https://www.google.com/');
try {
@ -28,6 +28,15 @@ async function scrape_google_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
@ -81,7 +90,7 @@ async function scrape_google_pup(page, event, context) {
return results;
}
async function scrape_google_pup_dr(page, event, context) {
async function scrape_google_pup_dr(page, event, context, pluggable) {
let keywords = event.keywords;
first = keywords[0];
var year = first.slice(-5);
@ -106,6 +115,15 @@ async function scrape_google_pup_dr(page, event, context) {
// strip the year at the end plus whitespace
keyword = keywords[i].slice(0,-5);
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
@ -217,7 +235,7 @@ async function scraping_detected(page) {
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
async function scrape_google_news_old_pup(page, event, context) {
async function scrape_google_news_old_pup(page, event, context, pluggable) {
let keywords = event.keywords;
var results = {};
@ -225,6 +243,15 @@ async function scrape_google_news_old_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
@ -326,7 +353,7 @@ function parse_google_news_results_se_format(html) {
}
}
async function scrape_google_image_pup(page, event, context) {
async function scrape_google_image_pup(page, event, context, pluggable) {
let keywords = event.keywords;
var results = {};
@ -344,6 +371,15 @@ async function scrape_google_image_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
@ -452,9 +488,22 @@ function clean_image_url(url) {
}
}
/**
 * Unwraps a redirect-style link of the form `url?q=<target>&...`.
 *
 * Examples:
 *   /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF
 *     -> https://www.zeit.de/thema/donald-trump
 *   /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA
 *     -> returned unchanged (no `url?q=` segment)
 *
 * @param {string} url - link as found in the scraped markup.
 * @returns {string} the percent-decoded value of the `q` parameter when the
 *   link contains a `url?q=` segment; otherwise `url` itself. `url` is also
 *   returned as-is when the captured value is not validly percent-encoded.
 */
function clean_google_url(url) {
    // The target ends at the next `&` or, when `q` is the last parameter,
    // at the end of the string. No `g` flag: a single match is all we need.
    const match = /url\?q=(.*?)(?:&|$)/.exec(url);
    if (match === null) {
        return url;
    }
    try {
        return decodeURIComponent(match[1]);
    } catch (err) {
        // Malformed percent escapes raise URIError; keep the raw link
        // instead of aborting the parsing of the whole result page.
        if (err instanceof URIError) {
            return url;
        }
        throw err;
    }
}
const all_results = new Set();
async function scrape_google_news_pup(page, event, context) {
async function scrape_google_news_pup(page, event, context, pluggable) {
let keywords = event.keywords;
var results = {};
@ -472,6 +521,15 @@ async function scrape_google_news_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}

View File

@ -6,7 +6,7 @@ module.exports = {
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
};
async function scrape_infospace_pup(page, event, context) {
async function scrape_infospace_pup(page, event, context, pluggable) {
await page.goto('http://infospace.com/index.html');
try {
@ -22,6 +22,15 @@ async function scrape_infospace_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[id="q"]');
// overwrites last text in input
@ -88,7 +97,7 @@ function parse(html) {
}
}
async function scrape_webcrawler_news_pup(page, event, context) {
async function scrape_webcrawler_news_pup(page, event, context, pluggable) {
await page.goto('https://www.webcrawler.com/?qc=news');
try {
@ -104,6 +113,15 @@ async function scrape_webcrawler_news_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input

View File

@ -11,7 +11,7 @@ module.exports = {
// https://www.google.com/search?q=MSFT&tbm=fin
async function scrape_yahoo_finance_pup(page, event, context) {
async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
var results = {};
await page.goto('https://finance.yahoo.com/');
@ -21,6 +21,16 @@ async function scrape_yahoo_finance_pup(page, event, context) {
}
for (let keyword of event.keywords) {
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
@ -61,9 +71,17 @@ function parse(html) {
}
}
async function scrape_marketwatch_finance_pup(page, event, context) {
async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
var results = {};
for (let keyword of event.keywords) {
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
await page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
await page.waitForSelector('.intraday__data', { timeout: 8000 });
@ -108,12 +126,22 @@ async function scrape_marketwatch_finance_pup(page, event, context) {
}
async function scrape_bloomberg_finance_pup(page, event, context) {
async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
/*
Bloomberg blocks after one request. what a shit hole.
*/
var results = {};
for (let keyword of event.keywords) {
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
await page.goto(`https://www.bloomberg.com/quote/${keyword}:US`);
await page.waitForSelector('.pseudoMainContent', { timeout: 8000 });
@ -140,9 +168,19 @@ async function scrape_bloomberg_finance_pup(page, event, context) {
return results;
}
async function scrape_reuters_finance_pup(page, event, context) {
async function scrape_reuters_finance_pup(page, event, context, pluggable) {
var results = {};
for (let keyword of event.keywords) {
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
await page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
await page.waitForSelector('#sectionHeader', { timeout: 8000 });
@ -187,9 +225,19 @@ async function scrape_reuters_finance_pup(page, event, context) {
return results;
}
async function scrape_cnbc_finance_pup(page, event, context) {
async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
var results = {};
for (let keyword of event.keywords) {
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
await page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });

View File

@ -6,80 +6,77 @@ function random_user_agent() {
return user_agents[Math.floor(Math.random()*user_agents.length)];
}
// updated: 29 Jan 2019
const user_agents = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (iPad; CPU OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (iPad; CPU OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 YaBrowser/18.9.0.3467 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; CrOS x86_64 10895.56.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.95 Safari/537.36'
['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
]
];

View File

@ -7,7 +7,7 @@ module.exports = {
const all_videos = new Set();
async function scrape_youtube_pup(page, event, context) {
async function scrape_youtube_pup(page, event, context, pluggable) {
await page.goto('https://www.youtube.com');
try {
@ -30,6 +30,15 @@ async function scrape_youtube_pup(page, event, context) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[id="search"]');
// overwrites last text in input

View File

@ -23,18 +23,21 @@ function write_results(fname, data) {
module.exports.handler = async function handler (event, context, callback) {
config = event;
pluggable = null;
if (config.custom_func && fs.existsSync(config.custom_func)) {
try {
Pluggable = require(config.custom_func);
pluggable = new Pluggable({config:config});
} catch (exception) {
console.error(exception);
if (config.custom_func) {
if (fs.existsSync(config.custom_func)) {
try {
Pluggable = require(config.custom_func);
pluggable = new Pluggable({config: config});
} catch (exception) {
console.error(exception);
}
} else {
console.error(`File "${config.custom_func}" does not exist...`);
}
}
try {
const startTime = Date.now();
config = parseEventData(config);
if (config.debug === true) {
console.log(config);
@ -53,14 +56,14 @@ module.exports.handler = async function handler (event, context, callback) {
let USER_AGENT = '';
if (config.random_user_agent) {
USER_AGENT = ua.random_user_agent();
}
if (config.user_agent) {
USER_AGENT = config.user_agent;
}
if (config.random_user_agent === true) {
USER_AGENT = ua.random_user_agent();
}
if (USER_AGENT) {
ADDITIONAL_CHROME_FLAGS.push(
`--user-agent="${USER_AGENT}"`
@ -90,11 +93,13 @@ module.exports.handler = async function handler (event, context, callback) {
const page = await browser.newPage();
// block some assets to speed up scraping
if (config.block_assets === true) {
await page.setRequestInterception(true);
page.on('request', (req) => {
if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') {
let type = req.resourceType();
const block = ['stylesheet', 'font', 'image', 'media'];
if (block.includes(type)) {
req.abort();
} else {
req.continue();
@ -120,7 +125,7 @@ module.exports.handler = async function handler (event, context, callback) {
reuters: tickersearch.scrape_reuters_finance_pup,
cnbc: tickersearch.scrape_cnbc_finance_pup,
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine](page, config, context);
}[config.search_engine](page, config, context, pluggable);
let metadata = {};
@ -223,6 +228,10 @@ function parseEventData(config) {
config.log_http_headers = _bool(config.log_http_headers);
}
if (config.random_user_agent) {
config.random_user_agent = _bool(config.random_user_agent);
}
if (config.compress) {
config.compress = _bool(config.compress);
}