From 89441070cd9c03ccf94e894882d965ad973a49e2 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Tue, 29 Jan 2019 13:29:24 +0100 Subject: [PATCH] before_keyword_scraped() hook supported --- TODO.txt | 8 +- data.json | 2 +- examples/pluggable.js | 13 ++++ index.js | 6 +- package.json | 2 +- run.js | 4 +- src/modules/baidu.js | 11 ++- src/modules/bing.js | 26 +++++-- src/modules/duckduckgo.js | 11 ++- src/modules/google.js | 68 ++++++++++++++-- src/modules/infospace.js | 22 +++++- src/modules/ticker_search.js | 58 ++++++++++++-- src/modules/user_agents.js | 147 +++++++++++++++++------------------ src/modules/youtube.js | 11 ++- src/node_scraper.js | 37 +++++---- 15 files changed, 309 insertions(+), 117 deletions(-) diff --git a/TODO.txt b/TODO.txt index 7899adb..669c4be 100644 --- a/TODO.txt +++ b/TODO.txt @@ -15,6 +15,12 @@ https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/ TODO: + - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes - add proxy support - add captcha service solving support - - check if news instances run the same browser and if we can have one proxy per tab wokers \ No newline at end of file + - check if news instances run the same browser and if we can have one proxy per tab wokers + +TODO: + - think whether it makes sense to introduce a generic scraping class? + - is scraping abstractable or is every scraper too unique? + - dont make the same mistakes as with GoogleScraper \ No newline at end of file diff --git a/data.json b/data.json index 04e0994..7c48102 100644 --- a/data.json +++ b/data.json @@ -1 +1 @@ -{"incolumitas.com news":{"time":"Sun, 27 Jan 2019 19:07:41 GMT","num_results":"Ungefähr 691.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/url?q=https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggUMAA&usg=AOvVaw1PmFDpPlIFYilxfQb1ym1W","title":"Coding, Learning and Business Ideas – Tutorial ... - Incolumitas","snippet":"29 Oct 2018 ... If you are a scientist, you might be interested in the spreading of fake news for \nexample. You want to monitor the sources of fake news.","visible_link":"","date":"","rank":1},{"link":"/url?q=https://incolumitas.com/2018/11/18/introduction-machine-learning-2019/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggaMAE&usg=AOvVaw37PW4eBuCGv2zkvXt0sZkw","title":"Coding, Learning and Business Ideas – Introduction to ... - Incolumitas","snippet":"18 Nov 2018 ... I want to recognize news articles soley based on one input: A link to the \ndocument. The algorithm should automatically recognize whether the ...","visible_link":"","date":"","rank":2},{"link":"/url?q=https://incolumitas.com/pages/about/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgggMAI&usg=AOvVaw2-XXZq6XPgh5orgEd_2rr6","title":"Coding, Learning and Business Ideas – About - Incolumitas","snippet":"As far as I know, this security vulnerability is still unfixed in late 2018. See my \noriginal blog post here. An news article from Heise and an article from ars \ntechnica.","visible_link":"","date":"","rank":3},{"link":"/url?q=https://incolumitas.com/uploads/2013/12/links.txt&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgglMAM&usg=AOvVaw0cAkft0dN7WcT6DCDbDYZM","title":"http://www.urbandictionary.com/define.php?term=holy%20shit http ...","snippet":"... http://www.tumblr.com/tagged/holy-shit http://pitchfork.com/news/52370-watch-\nholy-shit-perform-at-ryan-mcginley-gallery-opening-in-san-francisco/ ...","visible_link":"","date":"","rank":4},{"link":"/url?q=https://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggqMAQ&usg=AOvVaw0I45HMQSEjMeuKozexkwcI","title":"Coding, Learning and Business Ideas – Scraping and ... - Incolumitas","snippet":"12 Nov 2014 ... The supported search types. For instance, Google supports Video Search, Image \nSearch, News search search_types = [] def __init__(self, html, ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get-search-results/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggwMAU&usg=AOvVaw39UsbJVwdj3lCe2hAGZlgq","title":"Coding, Learning and Business Ideas ... - Incolumitas","snippet":"18 Feb 2014 ... UPDATE on 18th February 2014: This python module has now its own github \nrepository! The plugin can extract All links Link titles The ...","visible_link":"","date":"","rank":6},{"link":"/url?q=https://www.npmjs.com/package/se-scraper&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg2MAY&usg=AOvVaw2qrTw_GOSWhggUYNgRdAPy","title":"se-scraper - npm","snippet":"3 days ago ... Google; Google News; Google News New (https://news.google.com) .... 'https://\nincolumitas.com/2018/10/29/youtube-puppeteer-scraping/',. title:.","visible_link":"","date":"","rank":7},{"link":"/url?q=https://pypi.org/project/CountryGoogleScraper/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg8MAc&usg=AOvVaw039KcMaYiUHgES64a5tW9F","title":"CountryGoogleScraper · PyPI","snippet":"News GoogleScraper becomes **finally mature!** In the last months I didn't ..... [4\n]: http://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get- ...","visible_link":"","date":"","rank":8},{"link":"/url?q=https://pycoders.com/issues/69&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghBMAg&usg=AOvVaw1sJXhzn4R2ihjximQOaA_W","title":"PyCoder's Weekly | Issue #69","snippet":"7 Jun 2013 ... To the keep up with all the breaking Python news follow @pycoders. Support us \non Gittip -- ... News and Developments. ... (incolumitas.com).","visible_link":"","date":"","rank":9},{"link":"/url?q=https://news.ycombinator.com/item%3Fid%3D11925325&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghHMAk&usg=AOvVaw1F_732iyOahhNsWaJyvYM2","title":"Typosquatting in Programming Language Package ... - Hacker News","snippet":"Typosquatting in Programming Language Package Managers [pdf] (incolumitas.\ncom). 3 points by henry_flower on June 17, 2016 | hide | past | web | favorite ...","visible_link":"","date":"","rank":10}]}} \ No newline at end of file +{"trump":{"time":"Tue, 29 Jan 2019 12:28:51 GMT","num_results":"Ungefähr 2.590.000.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQqAIIFA","title":"News zu trump","snippet":"Der 5. Februar soll es also sein. Dann darf US-Präsident Donald Trump im \nKongress seine Rede zur Lage der Nation halten, die State of the ...","visible_link":"","date":" - vor 7 Stunden - vor 5 Stunden - vor 8 Stunden","rank":1},{"link":"/url?q=https://de.wikipedia.org/wiki/Donald_Trump&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFggdMAM&usg=AOvVaw1_CRYjDXuT3uMta9qZxpzX","title":"Donald Trump – Wikipedia","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New \nYork City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"","date":"","rank":2},{"link":"/url?q=https://www.donaldjtrump.com/&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFggjMAQ&usg=AOvVaw2eVjVZURDKjiDEQx9tY2Np","title":"Donald J. Trump for President: Home","snippet":"Help continue our promise to Make America Great Again!","visible_link":"","date":"","rank":3},{"link":"/url?q=https://www.welt.de/politik/ausland/article187872656/Trump-und-Pelosi-einigen-sich-auf-Datum-fuer-Rede-zur-Lage-der-Nation.html&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFggpMAU&usg=AOvVaw26OtBVzws_828892r3VaMe","title":"Trump und Pelosi einigen sich auf Datum für Rede zur Lage der ...","snippet":"vor 5 Stunden ... Donald Trump will seine Rede zur Lage der Nation nun am kommenden \nDienstag halten. In einem Brief beschrieb er seine Vorfreude auf den ...","visible_link":"","date":"","rank":4},{"link":"/url?q=http://www.spiegel.de/thema/donald_trump/&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFggvMAY&usg=AOvVaw14924asIp5LdTx0DYTuK6B","title":"Donald Trump - SPIEGEL ONLINE","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als \nrepublikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend \ngewann der ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFgg0MAc&usg=AOvVaw0-NJodJtXSLP4fdhnWjjcw","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeit","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und \naußenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"","date":"","rank":6},{"link":"/url?q=https://www.handelsblatt.com/themen/donald-trump&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFgg2MAg&usg=AOvVaw3WFgaMFYaKw9Y8cj8UmRAY","title":"Donald Trump News: Aktuelle deutsche Nachrichten - Handelsblatt","snippet":"Lesen Sie Nachrichten, Exklusiv-Meldungen und Insights zum Thema Donald \nTrump auf Handelsblatt Online » Verfolgen Sie alle aktuellen Trends und ...","visible_link":"","date":"","rank":7},{"link":"/url?q=https://www.cnn.com/specials/politics/president-donald-trump-45&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFgg4MAk&usg=AOvVaw0RuvK0ef1Q7lACZD3QSSj_","title":"Donald Trump News - CNN - CNN.com","snippet":"The latest news on President Donald Trump, the White House and the first family.","visible_link":"","date":"","rank":8},{"link":"/url?q=https://www.faz.net/aktuell/politik/thema/donald-trump&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFgg6MAo&usg=AOvVaw3XEVCZQNWgrPKvkntPTAaF","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidenten","snippet":"Donald Trump ist der 45. US-Präsident. ▷ Lesen Sie hier alle Nachrichten der \nFAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"","date":"","rank":9},{"link":"/url?q=https://twitter.com/realdonaldtrump%3Flang%3Dde&sa=U&ved=0ahUKEwinifL7_pLgAhVGM-wKHcjSBlQQFgg8MAs&usg=AOvVaw28eoTuiFwEw7iWEuX6mzAw","title":"Donald J. Trump (@realDonaldTrump) | Twitter","snippet":"The latest Tweets from Donald J. Trump (@realDonaldTrump). 45th President of \nthe United States of America . Washington, DC.","visible_link":"","date":"","rank":10}]},"chief":{"time":"Tue, 29 Jan 2019 12:28:53 GMT","num_results":"Ungefähr 1.050.000.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/url?q=https://de.wikipedia.org/wiki/Chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFggaMAI&usg=AOvVaw0Nfgf6yXi4bgdXKUO2lN_H","title":"Chief – Wikipedia","snippet":"Chief (englisch „Chef“) steht für: Chief, Stammesführer, siehe Häuptling · Clan \nChief (Schottland), Oberhaupt eines Clans; Chief, leitender Schiffsingenieur ...","visible_link":"","date":"","rank":3},{"link":"/url?q=https://de.wikipedia.org/wiki/Kategorie:Chief_Officer&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFgggMAM&usg=AOvVaw3qdIV28ng89mrKFAcAusaE","title":"Kategorie:Chief Officer – Wikipedia","snippet":"In dieser Kategorie werden Chief-Officer-Positionen gelistet. Personen, die eine \nentsprechende Position bekleiden, werden in der Kategorie:Manager einsortiert\n ...","visible_link":"","date":"","rank":4},{"link":"/url?q=https://www.legrandav.com/products/chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFggmMAQ&usg=AOvVaw3xmZvLplR0_-BX23EOG-Yd","title":"Chief | TV Mounts and Projector Mounts | Legrand AV Brands","snippet":"Designed for rapid setup and teardown, most people can set up Chief's new ... \nChief is shipping three new flexible storage solutions for in-wall, on-wall and ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://www.dict.cc/englisch-deutsch/chief.html&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFggwMAU&usg=AOvVaw1Fjasjg4O6_c9ahC2e_q60","title":"dict.cc Wörterbuch :: chief :: Englisch-Deutsch-Übersetzung","snippet":"Englisch-Deutsch-Übersetzungen für chief im Online-Wörterbuch dict.cc (\nDeutschwörterbuch).","visible_link":"","date":"","rank":6},{"link":"/url?q=https://de.wiktionary.org/wiki/chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFgg1MAY&usg=AOvVaw2bw_tTRVvRIyqRZDultc7i","title":"chief – Wiktionary","snippet":"Referenzen und weiterführende Informationen: [1, 2] Oxford English Dictionary „\nchief“: [1, 2] Cambridge Dictionaries: „chief“ (britisch), „chief“ (US-amerikanisch) ...","visible_link":"","date":"","rank":7},{"link":"/url?q=https://en.wiktionary.org/wiki/chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFgg7MAc&usg=AOvVaw3cBWPnQpNzWwDGvPwz3AWE","title":"chief - Wiktionary","snippet":"From Middle English chef, borrowed from Old French chief (“leader”), from Vulgar \nLatin *capum (from which also captain, chieftain), from Latin caput (“head”) ...","visible_link":"","date":"","rank":8},{"link":"/url?q=https://www.urbandictionary.com/define.php%3Fterm%3Dc.h.i.e.f.&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFghBMAg&usg=AOvVaw0vxyR2h-bOXxYkecdegvMn","title":"Urban Dictionary: c.h.i.e.f.","snippet":"Council of High Intelligence and Education Findings. (C.H.I.E.F.)","visible_link":"","date":"","rank":9},{"link":"/url?q=https://en.oxforddictionaries.com/definition/chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFghGMAk&usg=AOvVaw0aA_7eNxeAK-Cc-BLxz6FL","title":"chief | Definition of chief in English by Oxford Dictionaries","snippet":"Definition of chief - a leader or ruler of a people or clan, an ordinary consisting of \na broad horizontal band across the top of the shield.","visible_link":"","date":"","rank":10},{"link":"/url?q=https://www.duden.de/rechtschreibung/Chief&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFghMMAo&usg=AOvVaw020XNtghFP3yi7G4aXbFQG","title":"Duden | Chief | Rechtschreibung, Bedeutung, Definition, Herkunft","snippet":"Definition, Rechtschreibung, Synonyme und Grammatik von 'Chief' auf Duden \nonline nachschlagen. Wörterbuch der deutschen Sprache.","visible_link":"","date":"","rank":11},{"link":"/url?q=http://www.chiefautomotive.com/&sa=U&ved=0ahUKEwi1zdn8_pLgAhVI3qQKHTTBBToQFghRMAs&usg=AOvVaw31s-PFzEeLhfa_25XbDbyn","title":"Chief Automotive Technologies: Collision Repair Tools","snippet":"At Chief we offer collision repair tools for measuring equipment, frame machines, \nspot welder, mig welder, plasma cutter, aluminum welder, aluminum repair…","visible_link":"","date":"","rank":12}]}} \ No newline at end of file diff --git a/examples/pluggable.js b/examples/pluggable.js index aea2e9b..0f640f1 100644 --- a/examples/pluggable.js +++ b/examples/pluggable.js @@ -26,14 +26,27 @@ module.exports = class Pluggable { await this.browser.close(); } + // Callback invoked after metadata has been gathered async handle_metadata(args) { // store scraping metadata somewhere } + // Callback invoked after all keywords have been scraped async handle_results(args) { // store the results somewhere } + // Callback invoked before a keyword is scraped. + async before_keyword_scraped(args) { + console.log('before keyword scraped.'); + } + + // Callback invoked after a keyword has been scraped. + // TODO: implement this + async after_keyword_scraped(args) { + console.log('after keyword scraped.') + } + async start_browser(args={}) { const puppeteer = require('puppeteer'); diff --git a/index.js b/index.js index 2904cd3..880b73b 100644 --- a/index.js +++ b/index.js @@ -22,20 +22,22 @@ exports.scrape = async function(config, callback) { compress: false, // compress debug: false, verbose: false, - keywords: ['test'], + keywords: ['scrapeulous.com'], // whether to start the browser in headless mode headless: true, // path to output file, data will be stored in JSON output_file: '', - // whether to prevent images, css, fonts from being loaded + // whether to prevent images, css, fonts and media from being loaded // will speed up scraping a great deal block_assets: true, // path to js module that extends functionality // this module should export the functions: // get_browser, handle_metadata, close_browser + //custom_func: resolve('examples/pluggable.js'), custom_func: '', }; + // overwrite default config for (var key in config) { event[key] = config[key]; } diff --git a/package.json b/package.json index 94309f8..8e9e44e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.1.5", + "version": "1.1.7", "description": "A simple module which uses puppeteer to scrape several search engines.", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/run.js b/run.js index 338d239..9b01c8d 100644 --- a/run.js +++ b/run.js @@ -10,7 +10,7 @@ let config = { write_meta_data: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. - sleep_range: '', + sleep_range: '[1,1]', // which search engine to scrape search_engine: 'google', // whether debug information should be printed @@ -20,7 +20,7 @@ let config = { // this output is informational verbose: false, // an array of keywords to scrape - keywords: ['incolumitas.com news', ], + keywords: ['trump', 'chief'], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: '', // whether to start the browser in headless mode diff --git a/src/modules/baidu.js b/src/modules/baidu.js index 9c2fcfa..8cb1a0d 100644 --- a/src/modules/baidu.js +++ b/src/modules/baidu.js @@ -5,7 +5,7 @@ module.exports = { scrape_baidu_pup: scrape_baidu_pup, }; -async function scrape_baidu_pup(page, event, context) { +async function scrape_baidu_pup(page, event, context, pluggable) { await page.goto('https://www.baidu.com/'); try { @@ -21,6 +21,15 @@ async function scrape_baidu_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[name="wd"]'); // overwrites last text in input diff --git a/src/modules/bing.js b/src/modules/bing.js index c250d71..dc28e74 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -6,7 +6,7 @@ module.exports = { scrape_bing_news_pup: scrape_bing_news_pup, }; -async function scrape_bing_pup(page, event, context) { +async function scrape_bing_pup(page, event, context, pluggable) { await page.goto('https://www.bing.com/'); try { @@ -22,6 +22,15 @@ async function scrape_bing_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[name="q"]'); // overwrites last text in input @@ -90,7 +99,7 @@ function parse(html) { } } -async function scrape_bing_news_pup(page, event, context) { +async function scrape_bing_news_pup(page, event, context, pluggable) { await page.goto('https://www.bing.com/news/search?'); if (event.set_manual_settings === true) { @@ -109,12 +118,17 @@ async function scrape_bing_news_pup(page, event, context) { for (var i = 0; i < keywords.length; i++) { - if (sfunctions.should_turn_down(context)) { - break; - } - keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[name="q"]'); // overwrites last text in input diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index 1713b56..86d2c07 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -5,7 +5,7 @@ module.exports = { scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup, }; -async function scrape_duckduckgo_news_pup(page, event, context) { +async function scrape_duckduckgo_news_pup(page, event, context, pluggable) { await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news'); try { @@ -21,6 +21,15 @@ async function scrape_duckduckgo_news_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[name="q"]'); // overwrites last text in input diff --git a/src/modules/google.js b/src/modules/google.js index 1c2a966..099948f 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -12,7 +12,7 @@ module.exports = { const STANDARD_TIMEOUT = 8000; const SOLVE_CAPTCHA_TIME = 45000; -async function scrape_google_pup(page, event, context) { +async function scrape_google_pup(page, event, context, pluggable) { await page.goto('https://www.google.com/'); try { @@ -28,6 +28,15 @@ async function scrape_google_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + if (event.verbose === true) { console.log(`${event.search_engine} is scraping keyword: ${keyword}`); } @@ -81,7 +90,7 @@ async function scrape_google_pup(page, event, context) { return results; } -async function scrape_google_pup_dr(page, event, context) { +async function scrape_google_pup_dr(page, event, context, pluggable) { let keywords = event.keywords; first = keywords[0]; var year = first.slice(-5); @@ -106,6 +115,15 @@ async function scrape_google_pup_dr(page, event, context) { // strip the year at the end plus whitespace keyword = keywords[i].slice(0,-5); + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + if (event.verbose === true) { console.log(`${event.search_engine} is scraping keyword: ${keyword}`); } @@ -217,7 +235,7 @@ async function scraping_detected(page) { return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; } -async function scrape_google_news_old_pup(page, event, context) { +async function scrape_google_news_old_pup(page, event, context, pluggable) { let keywords = event.keywords; var results = {}; @@ -225,6 +243,15 @@ async function scrape_google_news_old_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + if (event.verbose === true) { console.log(`${event.search_engine} is scraping keyword: ${keyword}`); } @@ -326,7 +353,7 @@ function parse_google_news_results_se_format(html) { } } -async function scrape_google_image_pup(page, event, context) { +async function scrape_google_image_pup(page, event, context, pluggable) { let keywords = event.keywords; var results = {}; @@ -344,6 +371,15 @@ async function scrape_google_image_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + if (event.verbose === true) { console.log(`${event.search_engine} is scraping keyword: ${keyword}`); } @@ -452,9 +488,22 @@ function clean_image_url(url) { } } +function clean_google_url(url) { + // Example: + // /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF + // /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA + const regex = /url\?q=(.*?)&/gm; + let match = regex.exec(url); + if (match !== null) { + return decodeURIComponent(match[1]); + } else { + return url; + } +} + const all_results = new Set(); -async function scrape_google_news_pup(page, event, context) { +async function scrape_google_news_pup(page, event, context, pluggable) { let keywords = event.keywords; var results = {}; @@ -472,6 +521,15 @@ async function scrape_google_news_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + if (event.verbose === true) { console.log(`${event.search_engine} is scraping keyword: ${keyword}`); } diff --git a/src/modules/infospace.js b/src/modules/infospace.js index e1ddc2d..d96d39f 100644 --- a/src/modules/infospace.js +++ b/src/modules/infospace.js @@ -6,7 +6,7 @@ module.exports = { scrape_webcrawler_news_pup: scrape_webcrawler_news_pup, }; -async function scrape_infospace_pup(page, event, context) { +async function scrape_infospace_pup(page, event, context, pluggable) { await page.goto('http://infospace.com/index.html'); try { @@ -22,6 +22,15 @@ async function scrape_infospace_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[id="q"]'); // overwrites last text in input @@ -88,7 +97,7 @@ function parse(html) { } } -async function scrape_webcrawler_news_pup(page, event, context) { +async function scrape_webcrawler_news_pup(page, event, context, pluggable) { await page.goto('https://www.webcrawler.com/?qc=news'); try { @@ -104,6 +113,15 @@ async function scrape_webcrawler_news_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[name="q"]'); // overwrites last text in input diff --git a/src/modules/ticker_search.js b/src/modules/ticker_search.js index dae6d57..9177ab8 100644 --- a/src/modules/ticker_search.js +++ b/src/modules/ticker_search.js @@ -11,7 +11,7 @@ module.exports = { // https://www.google.com/search?q=MSFT&tbm=fin -async function scrape_yahoo_finance_pup(page, event, context) { +async function scrape_yahoo_finance_pup(page, event, context, pluggable) { var results = {}; await page.goto('https://finance.yahoo.com/'); @@ -21,6 +21,16 @@ async function scrape_yahoo_finance_pup(page, event, context) { } for (let keyword of event.keywords) { + + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { await page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`); @@ -61,9 +71,17 @@ function parse(html) { } } -async function scrape_marketwatch_finance_pup(page, event, context) { +async function scrape_marketwatch_finance_pup(page, event, context, pluggable) { var results = {}; for (let keyword of event.keywords) { + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } try { await page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`); await page.waitForSelector('.intraday__data', { timeout: 8000 }); @@ -108,12 +126,22 @@ async function scrape_marketwatch_finance_pup(page, event, context) { } -async function scrape_bloomberg_finance_pup(page, event, context) { +async function scrape_bloomberg_finance_pup(page, event, context, pluggable) { /* Bloomberg blocks after one request. what a shit hole. */ var results = {}; for (let keyword of event.keywords) { + + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { await page.goto(`https://www.bloomberg.com/quote/${keyword}:US`); await page.waitForSelector('.pseudoMainContent', { timeout: 8000 }); @@ -140,9 +168,19 @@ async function scrape_bloomberg_finance_pup(page, event, context) { return results; } -async function scrape_reuters_finance_pup(page, event, context) { +async function scrape_reuters_finance_pup(page, event, context, pluggable) { var results = {}; for (let keyword of event.keywords) { + + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { await page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`); await page.waitForSelector('#sectionHeader', { timeout: 8000 }); @@ -187,9 +225,19 @@ async function scrape_reuters_finance_pup(page, event, context) { return results; } -async function scrape_cnbc_finance_pup(page, event, context) { +async function scrape_cnbc_finance_pup(page, event, context, pluggable) { var results = {}; for (let keyword of event.keywords) { + + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { await page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`); await page.waitForSelector('#quote_title_and_chart', { timeout: 8000 }); diff --git a/src/modules/user_agents.js b/src/modules/user_agents.js index fbfb60a..9a0e252 100644 --- a/src/modules/user_agents.js +++ b/src/modules/user_agents.js @@ -6,80 +6,77 @@ function random_user_agent() { return user_agents[Math.floor(Math.random()*user_agents.length)]; } +// updated: 29 Jan 2019 const user_agents = [ - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0', - 'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', - 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', - 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', - 'Mozilla/5.0 (iPad; CPU OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0', - 'Mozilla/5.0 (iPad; CPU OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 YaBrowser/18.9.0.3467 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0', - 'Mozilla/5.0 (X11; CrOS x86_64 10895.56.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.95 Safari/537.36' + ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763', + 'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299', + 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', + 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', + 'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0', + 'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + ] ]; \ No newline at end of file diff --git a/src/modules/youtube.js b/src/modules/youtube.js index 8426c50..9e47ed2 100644 --- a/src/modules/youtube.js +++ b/src/modules/youtube.js @@ -7,7 +7,7 @@ module.exports = { const all_videos = new Set(); -async function scrape_youtube_pup(page, event, context) { +async function scrape_youtube_pup(page, event, context, pluggable) { await page.goto('https://www.youtube.com'); try { @@ -30,6 +30,15 @@ async function scrape_youtube_pup(page, event, context) { keyword = keywords[i]; + if (pluggable.before_keyword_scraped) { + await pluggable.before_keyword_scraped({ + keyword: keyword, + page: page, + event: event, + context: context, + }); + } + try { const input = await page.$('input[id="search"]'); // overwrites last text in input diff --git a/src/node_scraper.js b/src/node_scraper.js index 2e8f8db..e0ae7b1 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -23,18 +23,21 @@ function write_results(fname, data) { module.exports.handler = async function handler (event, context, callback) { config = event; pluggable = null; - if (config.custom_func && fs.existsSync(config.custom_func)) { - try { - Pluggable = require(config.custom_func); - pluggable = new Pluggable({config:config}); - } catch (exception) { - console.error(exception); + if (config.custom_func) { + if (fs.existsSync(config.custom_func)) { + try { + Pluggable = require(config.custom_func); + pluggable = new Pluggable({config: config}); + } catch (exception) { + console.error(exception); + } + } else { + console.error(`File "${config.custom_func}" does not exist...`); } } try { const startTime = Date.now(); - config = parseEventData(config); if (config.debug === true) { console.log(config); @@ -53,14 +56,14 @@ module.exports.handler = async function handler (event, context, callback) { let USER_AGENT = ''; - if (config.random_user_agent) { - USER_AGENT = ua.random_user_agent(); - } - if (config.user_agent) { USER_AGENT = config.user_agent; } + if (config.random_user_agent === true) { + USER_AGENT = ua.random_user_agent(); + } + if (USER_AGENT) { ADDITIONAL_CHROME_FLAGS.push( `--user-agent="${USER_AGENT}"` @@ -90,11 +93,13 @@ module.exports.handler = async function handler (event, context, callback) { const page = await browser.newPage(); + // block some assets to speed up scraping if (config.block_assets === true) { await page.setRequestInterception(true); - page.on('request', (req) => { - if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') { + let type = req.resourceType(); + const block = ['stylesheet', 'font', 'image', 'media']; + if (block.includes(type)) { req.abort(); } else { req.continue(); @@ -120,7 +125,7 @@ module.exports.handler = async function handler (event, context, callback) { reuters: tickersearch.scrape_reuters_finance_pup, cnbc: tickersearch.scrape_cnbc_finance_pup, marketwatch: tickersearch.scrape_marketwatch_finance_pup, - }[config.search_engine](page, config, context); + }[config.search_engine](page, config, context, pluggable); let metadata = {}; @@ -223,6 +228,10 @@ function parseEventData(config) { config.log_http_headers = _bool(config.log_http_headers); } + if (config.random_user_agent) { + config.random_user_agent = _bool(config.random_user_agent); + } + if (config.compress) { config.compress = _bool(config.compress); }