mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-02-16 16:50:45 +01:00
before_keyword_scraped() hook supported
This commit is contained in:
parent
c5e3e84e1d
commit
89441070cd
8
TODO.txt
8
TODO.txt
@ -15,6 +15,12 @@
|
||||
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
|
||||
|
||||
TODO:
|
||||
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
|
||||
- add proxy support
|
||||
- add captcha service solving support
|
||||
- check if news instances run the same browser and if we can have one proxy per tab workers
|
||||
- check if news instances run the same browser and if we can have one proxy per tab workers
|
||||
|
||||
TODO:
|
||||
- think whether it makes sense to introduce a generic scraping class?
|
||||
- is scraping abstractable or is every scraper too unique?
|
||||
- don't make the same mistakes as with GoogleScraper
|
@ -26,14 +26,27 @@ module.exports = class Pluggable {
|
||||
await this.browser.close();
|
||||
}
|
||||
|
||||
// Callback invoked after metadata has been gathered
|
||||
// Hook for scraping metadata. This base implementation is an empty
// placeholder: it performs no work and resolves to undefined.
// NOTE(review): the shape of `args` is not visible in this view; confirm
// against the caller before relying on specific fields.
async handle_metadata(args) {
|
||||
// store scraping metadata somewhere
|
||||
}
|
||||
|
||||
// Callback invoked after all keywords have been scraped
|
||||
// Hook for the collected results. This base implementation is an empty
// placeholder: it performs no work and resolves to undefined.
// NOTE(review): the shape of `args` is not visible in this view; confirm
// against the caller before relying on specific fields.
async handle_results(args) {
|
||||
// store the results somewhere
|
||||
}
|
||||
|
||||
// Callback invoked before a keyword is scraped.
|
||||
// Per-keyword hook. The scraper modules shown in this commit await it inside
// their keyword loop, passing an object of the form
// { keyword, page, event, context }.
// This base implementation only logs a fixed message and ignores `args`.
async before_keyword_scraped(args) {
|
||||
console.log('before keyword scraped.');
|
||||
}
|
||||
|
||||
// Callback invoked after a keyword has been scraped.
|
||||
// TODO: implement this
|
||||
// Counterpart hook to before_keyword_scraped. This base implementation only
// logs a fixed message and ignores `args`.
// NOTE(review): no call site for this hook is visible in this view; confirm
// whether any scraper module invokes it yet.
async after_keyword_scraped(args) {
|
||||
console.log('after keyword scraped.')
|
||||
}
|
||||
|
||||
async start_browser(args={}) {
|
||||
const puppeteer = require('puppeteer');
|
||||
|
||||
|
6
index.js
6
index.js
@ -22,20 +22,22 @@ exports.scrape = async function(config, callback) {
|
||||
compress: false, // compress
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: ['test'],
|
||||
keywords: ['scrapeulous.com'],
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// whether to prevent images, css, fonts and media from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
// path to js module that extends functionality
|
||||
// this module should export the functions:
|
||||
// get_browser, handle_metadata, close_browser
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
};
|
||||
|
||||
// overwrite default config
|
||||
for (var key in config) {
|
||||
event[key] = config[key];
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.5",
|
||||
"version": "1.1.7",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
4
run.js
4
run.js
@ -10,7 +10,7 @@ let config = {
|
||||
write_meta_data: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '',
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
// whether debug information should be printed
|
||||
@ -20,7 +20,7 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['incolumitas.com news', ],
|
||||
keywords: ['trump', 'chief'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// whether to start the browser in headless mode
|
||||
|
@ -5,7 +5,7 @@ module.exports = {
|
||||
scrape_baidu_pup: scrape_baidu_pup,
|
||||
};
|
||||
|
||||
async function scrape_baidu_pup(page, event, context) {
|
||||
async function scrape_baidu_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.baidu.com/');
|
||||
|
||||
try {
|
||||
@ -21,6 +21,15 @@ async function scrape_baidu_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="wd"]');
|
||||
// overwrites last text in input
|
||||
|
@ -6,7 +6,7 @@ module.exports = {
|
||||
scrape_bing_news_pup: scrape_bing_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_bing_pup(page, event, context) {
|
||||
async function scrape_bing_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.bing.com/');
|
||||
|
||||
try {
|
||||
@ -22,6 +22,15 @@ async function scrape_bing_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
@ -90,7 +99,7 @@ function parse(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_bing_news_pup(page, event, context) {
|
||||
async function scrape_bing_news_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.bing.com/news/search?');
|
||||
|
||||
if (event.set_manual_settings === true) {
|
||||
@ -109,12 +118,17 @@ async function scrape_bing_news_pup(page, event, context) {
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
if (sfunctions.should_turn_down(context)) {
|
||||
break;
|
||||
}
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
|
@ -5,7 +5,7 @@ module.exports = {
|
||||
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_duckduckgo_news_pup(page, event, context) {
|
||||
async function scrape_duckduckgo_news_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||
|
||||
try {
|
||||
@ -21,6 +21,15 @@ async function scrape_duckduckgo_news_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
|
@ -12,7 +12,7 @@ module.exports = {
|
||||
const STANDARD_TIMEOUT = 8000;
|
||||
const SOLVE_CAPTCHA_TIME = 45000;
|
||||
|
||||
async function scrape_google_pup(page, event, context) {
|
||||
async function scrape_google_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.google.com/');
|
||||
|
||||
try {
|
||||
@ -28,6 +28,15 @@ async function scrape_google_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
@ -81,7 +90,7 @@ async function scrape_google_pup(page, event, context) {
|
||||
return results;
|
||||
}
|
||||
|
||||
async function scrape_google_pup_dr(page, event, context) {
|
||||
async function scrape_google_pup_dr(page, event, context, pluggable) {
|
||||
let keywords = event.keywords;
|
||||
first = keywords[0];
|
||||
var year = first.slice(-5);
|
||||
@ -106,6 +115,15 @@ async function scrape_google_pup_dr(page, event, context) {
|
||||
// strip the year at the end plus whitespace
|
||||
keyword = keywords[i].slice(0,-5);
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
@ -217,7 +235,7 @@ async function scraping_detected(page) {
|
||||
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||
}
|
||||
|
||||
async function scrape_google_news_old_pup(page, event, context) {
|
||||
async function scrape_google_news_old_pup(page, event, context, pluggable) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
@ -225,6 +243,15 @@ async function scrape_google_news_old_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
@ -326,7 +353,7 @@ function parse_google_news_results_se_format(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_google_image_pup(page, event, context) {
|
||||
async function scrape_google_image_pup(page, event, context, pluggable) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
@ -344,6 +371,15 @@ async function scrape_google_image_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
@ -452,9 +488,22 @@ function clean_image_url(url) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extracts the real target from a Google redirect link.
 *
 * Examples:
 *   /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF
 *     -> https://www.zeit.de/thema/donald-trump
 *   /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA
 *     -> returned unchanged (no `url?q=` redirect part)
 *
 * @param {string} url - Link taken from a result page.
 * @returns {string} The decoded value of the `q` parameter when the link
 *     contains `url?q=`; otherwise the input unchanged.
 */
function clean_google_url(url) {
    // Capture everything up to the next `&` or the end of the string, so a
    // redirect link without further parameters is unwrapped as well.
    const match = /url\?q=([^&]*)/.exec(url);

    if (match === null) {
        return url;
    }

    const target = match[1];
    try {
        return decodeURIComponent(target);
    } catch (err) {
        // decodeURIComponent throws URIError on malformed escapes such as a
        // lone `%`; fall back to the raw target instead of aborting the scrape.
        return target;
    }
}
|
||||
|
||||
const all_results = new Set();
|
||||
|
||||
async function scrape_google_news_pup(page, event, context) {
|
||||
async function scrape_google_news_pup(page, event, context, pluggable) {
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
@ -472,6 +521,15 @@ async function scrape_google_news_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
|
@ -6,7 +6,7 @@ module.exports = {
|
||||
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_infospace_pup(page, event, context) {
|
||||
async function scrape_infospace_pup(page, event, context, pluggable) {
|
||||
await page.goto('http://infospace.com/index.html');
|
||||
|
||||
try {
|
||||
@ -22,6 +22,15 @@ async function scrape_infospace_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[id="q"]');
|
||||
// overwrites last text in input
|
||||
@ -88,7 +97,7 @@ function parse(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_webcrawler_news_pup(page, event, context) {
|
||||
async function scrape_webcrawler_news_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.webcrawler.com/?qc=news');
|
||||
|
||||
try {
|
||||
@ -104,6 +113,15 @@ async function scrape_webcrawler_news_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
|
@ -11,7 +11,7 @@ module.exports = {
|
||||
|
||||
// https://www.google.com/search?q=MSFT&tbm=fin
|
||||
|
||||
async function scrape_yahoo_finance_pup(page, event, context) {
|
||||
async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
|
||||
var results = {};
|
||||
await page.goto('https://finance.yahoo.com/');
|
||||
|
||||
@ -21,6 +21,16 @@ async function scrape_yahoo_finance_pup(page, event, context) {
|
||||
}
|
||||
|
||||
for (let keyword of event.keywords) {
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
|
||||
|
||||
@ -61,9 +71,17 @@ function parse(html) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_marketwatch_finance_pup(page, event, context) {
|
||||
async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
|
||||
var results = {};
|
||||
for (let keyword of event.keywords) {
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
try {
|
||||
await page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
|
||||
await page.waitForSelector('.intraday__data', { timeout: 8000 });
|
||||
@ -108,12 +126,22 @@ async function scrape_marketwatch_finance_pup(page, event, context) {
|
||||
}
|
||||
|
||||
|
||||
async function scrape_bloomberg_finance_pup(page, event, context) {
|
||||
async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
|
||||
/*
|
||||
Bloomberg blocks after one request. what a shit hole.
|
||||
*/
|
||||
var results = {};
|
||||
for (let keyword of event.keywords) {
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
await page.goto(`https://www.bloomberg.com/quote/${keyword}:US`);
|
||||
await page.waitForSelector('.pseudoMainContent', { timeout: 8000 });
|
||||
@ -140,9 +168,19 @@ async function scrape_bloomberg_finance_pup(page, event, context) {
|
||||
return results;
|
||||
}
|
||||
|
||||
async function scrape_reuters_finance_pup(page, event, context) {
|
||||
async function scrape_reuters_finance_pup(page, event, context, pluggable) {
|
||||
var results = {};
|
||||
for (let keyword of event.keywords) {
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
await page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
|
||||
await page.waitForSelector('#sectionHeader', { timeout: 8000 });
|
||||
@ -187,9 +225,19 @@ async function scrape_reuters_finance_pup(page, event, context) {
|
||||
return results;
|
||||
}
|
||||
|
||||
async function scrape_cnbc_finance_pup(page, event, context) {
|
||||
async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
|
||||
var results = {};
|
||||
for (let keyword of event.keywords) {
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
await page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
|
||||
await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });
|
||||
|
@ -6,80 +6,77 @@ function random_user_agent() {
|
||||
return user_agents[Math.floor(Math.random()*user_agents.length)];
|
||||
}
|
||||
|
||||
// updated: 29 Jan 2019
|
||||
// Pool of desktop/tablet browser user-agent strings to choose from.
// This must stay a FLAT array of strings: wrapping the entries in a second
// pair of brackets yields a one-element array whose only member is the whole
// list, so picking a random index would return an array instead of a string.
const user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
    'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
    'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
];
|
@ -7,7 +7,7 @@ module.exports = {
|
||||
|
||||
const all_videos = new Set();
|
||||
|
||||
async function scrape_youtube_pup(page, event, context) {
|
||||
async function scrape_youtube_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.youtube.com');
|
||||
|
||||
try {
|
||||
@ -30,6 +30,15 @@ async function scrape_youtube_pup(page, event, context) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[id="search"]');
|
||||
// overwrites last text in input
|
||||
|
@ -23,18 +23,21 @@ function write_results(fname, data) {
|
||||
module.exports.handler = async function handler (event, context, callback) {
|
||||
config = event;
|
||||
pluggable = null;
|
||||
if (config.custom_func && fs.existsSync(config.custom_func)) {
|
||||
try {
|
||||
Pluggable = require(config.custom_func);
|
||||
pluggable = new Pluggable({config:config});
|
||||
} catch (exception) {
|
||||
console.error(exception);
|
||||
if (config.custom_func) {
|
||||
if (fs.existsSync(config.custom_func)) {
|
||||
try {
|
||||
Pluggable = require(config.custom_func);
|
||||
pluggable = new Pluggable({config: config});
|
||||
} catch (exception) {
|
||||
console.error(exception);
|
||||
}
|
||||
} else {
|
||||
console.error(`File "${config.custom_func}" does not exist...`);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const startTime = Date.now();
|
||||
|
||||
config = parseEventData(config);
|
||||
if (config.debug === true) {
|
||||
console.log(config);
|
||||
@ -53,14 +56,14 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
|
||||
let USER_AGENT = '';
|
||||
|
||||
if (config.random_user_agent) {
|
||||
USER_AGENT = ua.random_user_agent();
|
||||
}
|
||||
|
||||
if (config.user_agent) {
|
||||
USER_AGENT = config.user_agent;
|
||||
}
|
||||
|
||||
if (config.random_user_agent === true) {
|
||||
USER_AGENT = ua.random_user_agent();
|
||||
}
|
||||
|
||||
if (USER_AGENT) {
|
||||
ADDITIONAL_CHROME_FLAGS.push(
|
||||
`--user-agent="${USER_AGENT}"`
|
||||
@ -90,11 +93,13 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
// block some assets to speed up scraping
|
||||
if (config.block_assets === true) {
|
||||
await page.setRequestInterception(true);
|
||||
|
||||
page.on('request', (req) => {
|
||||
if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') {
|
||||
let type = req.resourceType();
|
||||
const block = ['stylesheet', 'font', 'image', 'media'];
|
||||
if (block.includes(type)) {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
@ -120,7 +125,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
reuters: tickersearch.scrape_reuters_finance_pup,
|
||||
cnbc: tickersearch.scrape_cnbc_finance_pup,
|
||||
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
||||
}[config.search_engine](page, config, context);
|
||||
}[config.search_engine](page, config, context, pluggable);
|
||||
|
||||
let metadata = {};
|
||||
|
||||
@ -223,6 +228,10 @@ function parseEventData(config) {
|
||||
config.log_http_headers = _bool(config.log_http_headers);
|
||||
}
|
||||
|
||||
if (config.random_user_agent) {
|
||||
config.random_user_agent = _bool(config.random_user_agent);
|
||||
}
|
||||
|
||||
if (config.compress) {
|
||||
config.compress = _bool(config.compress);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user