diff --git a/TODO.txt b/TODO.txt index 81697a8..b173720 100644 --- a/TODO.txt +++ b/TODO.txt @@ -25,6 +25,12 @@ - implement duckduckgo scraping + +30.1.2019 + + - modify all scrapers to use the generic class where it makes sense + - Bing, Baidu, Google, Duckduckgo + TODO: - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes - add proxy support diff --git a/data.json b/data.json index b4b7318..44781d9 100644 --- a/data.json +++ b/data.json @@ -1 +1 @@ -{"scraping scrapeulous.com":{"1":{"time":"Tue, 29 Jan 2019 21:46:30 GMT","num_results":"Ungefähr 139 Ergebnisse (0,29 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":1},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":2},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"24.12.2018 - ","rank":3},{"link":"https://github.com/NikolaiT/GoogleScraper/blob/master/README.md","title":"GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/blob/.../README.md","date":"","rank":4},{"link":"https://googlescraper.readthedocs.io/","title":"Welcome to GoogleScraper's documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen","snippet":"Welcome to GoogleScraper's documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...","visible_link":"https://googlescraper.readthedocs.io/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.","visible_link":"https://incolumitas.com/","date":"","rank":7},{"link":"https://en.wikipedia.org/wiki/Search_engine_scraping","title":"Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen","snippet":"Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...","visible_link":"https://en.wikipedia.org/wiki/Search_engine_scraping","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}} \ No newline at end of file +{"trump":{"1":{"time":"Wed, 30 Jan 2019 15:03:46 GMT","num_results":"Ungefähr 1.450.000.000 Ergebnisse (0,49 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://de.wikipedia.org/wiki/Donald_Trump","title":"Donald Trump – Wikipediahttps://de.wikipedia.org/wiki/Donald_TrumpIm CacheÄhnliche Seiten","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New York City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"https://de.wikipedia.org/wiki/Donald_Trump","date":"","rank":1},{"link":"https://www.merkur.de/politik/milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jagen-news-zr-11469011.html","title":"Milliardär will Trump mit unfassbarer Summe aus dem Amt jagen ...https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...Im Cache","snippet":"vor 1 Stunde - Donald Trump: Der längste Shutdown in der Geschichte der USA ist beendet. Die Rede zur Lage der Nation steht bevor und ein Milliardär fährt ...","visible_link":"https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...","date":"vor 1 Stunde - ","rank":2},{"link":"http://www.spiegel.de/wirtschaft/impeachment-tom-steyer-wirbt-fuer-amtsenthebung-von-donald-trump-a-1250683.html","title":"Impeachment: Tom Steyer wirbt für Amtsenthebung von Donald Trump ...www.spiegel.de › Wirtschaft › Donald Trump","snippet":"vor 5 Stunden - US-Milliardär Tom Steyer 100 Millionen Dollar, um Trump aus dem Amt zu jagen. Der Milliardär und Ex-Fondsmanager Tom Steyer ist ...","visible_link":"www.spiegel.de › Wirtschaft › Donald Trump","date":"vor 5 Stunden - ","rank":3},{"link":"http://www.spiegel.de/thema/donald_trump/","title":"Donald Trump - SPIEGEL ONLINEwww.spiegel.de › Politik › AuslandÄhnliche Seiten","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als republikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend gewann der ...","visible_link":"www.spiegel.de › Politik › Ausland","date":"","rank":4},{"link":"https://www.faz.net/aktuell/politik/ausland/gefahren-fuer-amerika-geheimdienste-widersprechen-trump-16015734.html","title":"Gefahren für Amerika: Geheimdienste widersprechen Trump - Fazhttps://www.faz.net › Politik › Ausland","snippet":"vor 1 Stunde - Nordkorea rüstet ab, Iran auf und der „IS“ ist besiegt – so sieht es Donald Trump. Ein Bericht der amerikanischen Geheimdienste über ...","visible_link":"https://www.faz.net › Politik › Ausland","date":"vor 1 Stunde - ","rank":5},{"link":"https://www.faz.net/aktuell/politik/thema/donald-trump","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidentenhttps://www.faz.net/aktuell/politik/thema/donald-trump","snippet":"Donald Trump ist der 45. US-Präsident. ▷ Lesen Sie hier alle Nachrichten der FAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"https://www.faz.net/aktuell/politik/thema/donald-trump","date":"","rank":6},{"link":"https://www.donaldjtrump.com/","title":"Donald J. Trump for President: Homehttps://www.donaldjtrump.com/Im CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"Help continue our promise to Make America Great Again!","visible_link":"https://www.donaldjtrump.com/","date":"","rank":7},{"link":"https://www.zeit.de/thema/donald-trump","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeithttps://www.zeit.de › Politik","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und außenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"https://www.zeit.de › Politik","date":"","rank":8}]}}} \ No newline at end of file diff --git a/run.js b/run.js index 6f2ca56..7de239c 100644 --- a/run.js +++ b/run.js @@ -15,18 +15,18 @@ let config = { search_engine: 'google', // whether debug information should be printed // debug info is useful for developers when debugging - debug: false, + debug: true, // whether verbose program output should be printed // this output is informational - verbose: false, + verbose: true, // an array of keywords to scrape - keywords: ['scraping scrapeulous.com'], + keywords: ['trump', ], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: '', // the number of pages to scrape for each keyword num_pages: 1, // whether to start the browser in headless mode - headless: true, + headless: false, // path to output file, data will be stored in JSON output_file: 'data.json', // whether to prevent images, css, fonts from being loaded diff --git a/src/modules/bing.js b/src/modules/bing.js index 32b790b..445b0e9 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -1,203 +1,164 @@ const cheerio = require('cheerio'); -const sfunctions = require('./functions.js'); +const Scraper = require('./se_scraper'); + +class BingScraper extends Scraper { + + parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); + + // perform queries + const results = []; + $('#b_content #b_results .b_algo').each((i, link) => { + results.push({ + link: $(link).find('h2 a').attr('href'), + title: $(link).find('h2').text(), + snippet: $(link).find('.b_caption p').text(), + visible_link: $(link).find('cite').text(), + }) + }); + + let no_results = this.no_results( + ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'], + $('#b_results').text() + ); + + let effective_query = $('#sp_requery a').first().text() || ''; + + const cleaned = []; + for (var i=0; i < results.length; i++) { + let res = results[i]; + if (res.link && res.link.trim() && res.title && res.title.trim()) { + res.rank = i+1; + cleaned.push(res); + } + } + + return { + time: (new Date()).toUTCString(), + no_results: no_results, + effective_query: effective_query, + num_results: $('#b_content .sb_count').text(), + results: cleaned, + } + } + + async load_start_page() { + try { + await this.page.goto('https://www.bing.com/'); + await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); + } catch (e) { + return false; + } + return true; + } + + async search_keyword(keyword) { + const input = await this.page.$('input[name="q"]'); + await this.set_input_value(`input[name="q"]`, keyword); + await this.sleep(50); + await input.focus(); + await this.page.keyboard.press("Enter"); + } + + async next_page() { + let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); + if (!next_page_link) { + return false; + } + await next_page_link.click(); + await this.page.waitForNavigation(); + + return true; + } + + async wait_for_results() { + await this.page.waitForSelector('#b_content', { timeout: 5000 }); + await this.sleep(500); + } + + async detected() { + // TODO: I was actually never detected by bing. those are good guys. + } +} + + +class BingNewsScraper extends Scraper { + + parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); + + // perform queries + const results = []; + $('#algocore .newsitem').each((i, link) => { + results.push({ + link: $(link).attr('url'), + title: $(link).find('a.title').text(), + snippet: $(link).find('.snippet').text(), + date: $(link).find('.source span').last().text(), + }) + }); + + const cleaned = []; + for (var i=0; i < results.length; i++) { + let res = results[i]; + if (res.link && res.link.trim() && res.title && res.title.trim()) { + res.rank = i+1; + cleaned.push(res); + } + } + + return { + time: (new Date()).toUTCString(), + results: cleaned, + } + } + + async load_start_page() { + try { + await this.page.goto('https://www.bing.com/news/search?'); + if (this.config.set_manual_settings === true) { + console.log('Sleeping 30 seconds. Set your settings now.'); + await this.sleep(30000); + } + await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); + } catch (e) { + return false; + } + return true; + } + + async search_keyword(keyword) { + const input = await this.page.$('input[name="q"]'); + await this.set_input_value(`input[name="q"]`, keyword); + await this.sleep(50); + await input.focus(); + await this.page.keyboard.press("Enter"); + } + + async next_page() { + let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); + if (!next_page_link) { + return false; + } + await next_page_link.click(); + await this.page.waitForNavigation(); + + return true; + } + + async wait_for_results() { + await this.page.waitForSelector('#news', { timeout: 5000 }); + await this.sleep(2000); + } + + async detected() { + // TODO: I was actually never detected by bing news. + } +} module.exports = { - scrape_bing_pup: scrape_bing_pup, - scrape_bing_news_pup: scrape_bing_news_pup, -}; - -async function scrape_bing_pup(page, event, context, pluggable) { - await page.goto('https://www.bing.com/'); - - try { - await page.waitForSelector('input[name="q"]', { timeout: 5000 }); - } catch (e) { - return results; - } - - let keywords = event.keywords; - var results = {}; - - for (var i = 0; i < keywords.length; i++) { - - keyword = keywords[i]; - results[keyword] = {}; - - if (pluggable.before_keyword_scraped) { - await pluggable.before_keyword_scraped({ - keyword: keyword, - page: page, - event: event, - context: context, - }); - } - - try { - const input = await page.$('input[name="q"]'); - await sfunctions.set_input_value(page, `input[name="q"]`, keyword); - await sfunctions.sleep(50); - await input.focus(); - await page.keyboard.press("Enter"); - - let page_num = 1; - - do { - if (event.verbose === true) { - console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`); - } - if (event.sleep_range) { - await sfunctions.random_sleep(event); - } - await page.waitForSelector('#b_content', { timeout: 5000 }); - await sfunctions.sleep(500); - let html = await page.content(); - results[keyword][page_num] = parse(html); - - page_num += 1; - - let next_page_link = await page.$('.sb_pagN', {timeout: 1000}); - if (!next_page_link) { - break; - } - await next_page_link.click(); - await page.waitForNavigation(); - - } while (page_num <= event.num_pages) - - } catch (e) { - console.error(`Problem with scraping ${keyword}: ${e}`); - } - } - - return results; -} - -function parse(html) { - // load the page source into cheerio - const $ = cheerio.load(html); - - // perform queries - const results = []; - $('#b_content #b_results .b_algo').each((i, link) => { - results.push({ - link: $(link).find('h2 a').attr('href'), - title: $(link).find('h2').text(), - snippet: $(link).find('.b_caption p').text(), - visible_link: $(link).find('cite').text(), - }) - }); - - let no_results = sfunctions.no_results( - ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'], - $('#b_results').text() - ); - - let effective_query = $('#sp_requery a').first().text() || ''; - - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = i+1; - cleaned.push(res); - } - } - - return { - time: (new Date()).toUTCString(), - no_results: no_results, - effective_query: effective_query, - num_results: $('#b_content .sb_count').text(), - results: cleaned, - } -} - -async function scrape_bing_news_pup(page, event, context, pluggable) { - await page.goto('https://www.bing.com/news/search?'); - - if (event.set_manual_settings === true) { - console.log('Sleeping 30 seconds. Set your settings now.'); - await sfunctions.sleep(30000); - } - - try { - await page.waitForSelector('input[name="q"]', { timeout: 5000 }); - } catch (e) { - return results; - } - - let keywords = event.keywords; - var results = {}; - - for (var i = 0; i < keywords.length; i++) { - - keyword = keywords[i]; - - if (pluggable.before_keyword_scraped) { - await pluggable.before_keyword_scraped({ - keyword: keyword, - page: page, - event: event, - context: context, - }); - } - - try { - const input = await page.$('input[name="q"]'); - // overwrites last text in input - await input.click({ clickCount: 3 }); - await input.type(keyword); - await input.focus(); - await page.keyboard.press("Enter"); - - if (event.sleep_range) { - await sfunctions.random_sleep(event); - } - - await page.waitForSelector('#news', { timeout: 5000 }); - await sfunctions.sleep(2000); - - if (event.debug === true && event.is_local === true) { - await page.screenshot({path: `debug/${keyword}.png`}); - } - - let html = await page.content(); - results[keyword] = parse_bing_news(html); - - } catch (e) { - console.error(`Problem with scraping ${keyword}: ${e}`); - } - } - - return results; -} - -function parse_bing_news(html) { - // load the page source into cheerio - const $ = cheerio.load(html); - - // perform queries - const results = []; - $('#algocore .newsitem').each((i, link) => { - results.push({ - link: $(link).attr('url'), - title: $(link).find('a.title').text(), - snippet: $(link).find('.snippet').text(), - date: $(link).find('.source span').last().text(), - }) - }); - - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim() && res.title && res.title.trim()) { - res.rank = i+1; - cleaned.push(res); - } - } - - return { - time: (new Date()).toUTCString(), - results: cleaned, - } -} \ No newline at end of file + BingNewsScraper: BingNewsScraper, + BingScraper: BingScraper, +}; \ No newline at end of file diff --git a/src/modules/google.js b/src/modules/google.js index 22d8c8e..5e0d468 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -1,104 +1,98 @@ const cheerio = require('cheerio'); const sfunctions = require('./functions.js'); +const Scraper = require('./se_scraper'); -module.exports = { - scrape_google_news_old_pup: scrape_google_news_old_pup, - scrape_google_pup: scrape_google_pup, - scrape_google_image_pup: scrape_google_image_pup, - scrape_google_news_pup: scrape_google_news_pup, - scrape_google_pup_dr: scrape_google_pup_dr, -}; +class GoogleScraper extends Scraper { -const STANDARD_TIMEOUT = 8000; -const SOLVE_CAPTCHA_TIME = 45000; + parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); -async function scrape_google_pup(page, event, context, pluggable) { - await page.goto('https://www.google.com/'); + // perform queries + const results = []; + $('#center_col .g').each((i, link) => { + results.push({ + link: $(link).find('.r a').attr('href'), + title: $(link).find('.r a').text(), + snippet: $(link).find('span.st').text(), + visible_link: $(link).find('.r cite').text(), + date: $(link).find('span.f').text() || '', + }) + }); - try { - await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT }); - } catch (e) { - return results; - } + let no_results = sfunctions.no_results( + ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für', + 'No results found for', 'Ergebnisse für', 'Showing results for'], + $('#main').text() + ); - let keywords = event.keywords; - var results = {}; - - for (var i = 0; i < keywords.length; i++) { - keyword = keywords[i]; - results[keyword] = {}; - - if (pluggable.before_keyword_scraped) { - await pluggable.before_keyword_scraped({ - keyword: keyword, - page: page, - event: event, - context: context, - }); + let effective_query = $('#fprsl').text() || ''; + if (!effective_query) { + effective_query = $('#fprs a').text() } - try { - - const input = await page.$('input[name="q"]'); - await sfunctions.set_input_value(page, `input[name="q"]`, keyword); - await sfunctions.sleep(50); - await input.focus(); - await page.keyboard.press("Enter"); - - let page_num = 1; - - do { - if (event.verbose === true) { - console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`); - } - if (event.sleep_range) { - await sfunctions.random_sleep(event); - } - await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT}); - await sfunctions.sleep(500); - let html = await page.content(); - results[keyword][page_num] = parse_google_results(html); - - page_num += 1; - - let next_page_link = await page.$('#pnnext', {timeout: 1000}); - if (!next_page_link) { - break; - } - await next_page_link.click(); - await page.waitForNavigation(); - - } while (page_num <= event.num_pages) - - } catch (e) { - console.error(`Problem with scraping ${keyword}.`); - console.error(e); - - if (await scraping_detected(page) === true) { - console.error('Google detected the scraping. Aborting.'); - - if (event.is_local === true) { - await sfunctions.sleep(SOLVE_CAPTCHA_TIME); - console.error('You have 45 seconds to enter the captcha.'); - // expect that user filled out necessary captcha - } else { - return results; - } - } else { - // some other error, quit scraping process if stuff is broken - if (event.is_local === true) { - console.error('You have 30 seconds to fix this.'); - await sfunctions.sleep(30000); - } else { - return results; - } + const cleaned = []; + for (var i=0; i < results.length; i++) { + let res = results[i]; + if (res.link && res.link.trim() && res.title && res.title.trim()) { + res.rank = i+1; + cleaned.push(res); } } + + return { + time: (new Date()).toUTCString(), + num_results: $('#resultStats').text(), + no_results: no_results, + effective_query: effective_query, + results: cleaned + } } - return results; + async load_start_page() { + await this.page.goto('https://www.google.com/'); + + try { + await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); + } catch (e) { + return false; + } + + return true; + } + + async search_keyword(keyword) { + const input = await this.page.$('input[name="q"]'); + await this.set_input_value(`input[name="q"]`, keyword); + await this.sleep(50); + await input.focus(); + await this.page.keyboard.press("Enter"); + } + + async next_page() { + let next_page_link = await this.page.$('#pnnext', {timeout: 1000}); + if (!next_page_link) { + return false; + } + await next_page_link.click(); + await this.page.waitForNavigation(); + + return true; + } + + async wait_for_results() { + await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT }); + await this.sleep(500); + } + + async detected() { + const title = await this.page.title(); + let html = await this.page.content(); + return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1; + } } + async function scrape_google_pup_dr(page, event, context, pluggable) { let keywords = event.keywords; first = keywords[0]; @@ -650,4 +644,13 @@ function parse_google_news_results(html) { no_results: no_results, effective_query: effective_query, } -} \ No newline at end of file +} + + +module.exports = { + scrape_google_news_old_pup: scrape_google_news_old_pup, + GoogleScraper: GoogleScraper, + scrape_google_image_pup: scrape_google_image_pup, + scrape_google_news_pup: scrape_google_news_pup, + scrape_google_pup_dr: scrape_google_pup_dr, +}; \ No newline at end of file diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 1d3916d..51bd5d5 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -3,34 +3,214 @@ const start_url = { }; /* + Get useful JS knowledge and get awesome... + Read this shit: https://javascript.info/class-inheritance + And this: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e */ module.exports = class Scraper { constructor(options = {}) { const { - searchEngine = 'google', - numPages = 1, + browser = null, + config = {}, + context = {}, pluggable = null, } = options; this.pluggable = pluggable; - this.searchEngine = searchEngine; - this.numPages = numPages; - this.results = {} + this.browser = browser; + this.config = config; + this.context = context; + + this.STANDARD_TIMEOUT = 8000; + // longer timeout when using proxies + this.PROXY_TIMEOUT = 15000; + this.SOLVE_CAPTCHA_TIME = 45000; + + this.results = {}; } + async run() { + + let do_continue = await this.load_search_engine(); + + if (!do_continue) { + console.error('Failed to load the search engine: load_search_engine()'); + return this.results; + } + + await this.scraping_loop(); + + return this.results; + } + + /** + * Action that runs only once in the beginning of the + * scraping procedure. + * + * @returns {Promise} true if everything is correct. + */ async load_search_engine() { + + this.page = await this.browser.newPage(); + + // block some assets to speed up scraping + if (this.config.block_assets === true) { + await this.page.setRequestInterception(true); + this.page.on('request', (req) => { + let type = req.resourceType(); + const block = ['stylesheet', 'font', 'image', 'media']; + if (block.includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + } + + return await this.load_start_page(); } - async search_keyword() { + /** + * Each scraper basically iterates over a list of + * keywords and a list of pages. This is the generic + * method for that. + * + * @returns {Promise} + */ + async scraping_loop() { + + for (let keyword of this.config.keywords) { + + this.results[keyword] = {}; + + if (this.pluggable.before_keyword_scraped) { + await this.pluggable.before_keyword_scraped({ + keyword: keyword, + page: this.page, + event: this.config, + context: this.context, + }); + } + + let page_num = 1; + + try { + + await this.search_keyword(keyword); + + do { + + if (this.config.verbose === true) { + console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`); + } + + await this.wait_for_results(); + + if (event.sleep_range) { + await this.random_sleep(); + } + + let html = await this.page.content(); + this.results[keyword][page_num] = this.parse(html); + + page_num += 1; + + if (await this.next_page() === false) { + break; + } + + } while (page_num < event.num_pages); + + } catch (e) { + + console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`); + + if (await this.detected() === true) { + console.error(`${this.config.search_engine} DETECTED the scraping!`); + + if (this.config.is_local === true) { + await this.sleep(this.SOLVE_CAPTCHA_TIME); + console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); + // expect that user filled out necessary captcha + } else { + break; + } + } else { + // some other error, quit scraping process if stuff is broken + if (this.config.is_local === true) { + console.error('You have 30 seconds to fix this.'); + await this.sleep(30000); + } else { + break; + } + } + + } + } } - parse() { + sleep(ms) { + return new Promise(resolve => { + setTimeout(resolve, ms) + }) + } + + async random_sleep() { + const [min, max] = this.config.sleep_range; + let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number + if (this.config.debug === true) { + console.log(`Sleeping for ${rand}s`); + } + await this.sleep(rand * 1000); + } + + async set_input_value(selector, value) { + await this.page.waitFor(selector); + await this.page.evaluate((value, selector) => { + return document.querySelector(selector).value = value; + }, value, selector); + } + + no_results(needles, html) { + return !needles.map((needle) => { return html.indexOf(needle)}) + .every((res) => { return res == -1}); + } + + parse(html) { } + /** + * + * @returns true if startpage was loaded correctly. + */ + async load_start_page() { + + } + + /** + * Searches the keyword by inputting it into the form and hitting enter + * or something similar. + * + * @param keyword + * @returns {Promise} + */ + async search_keyword(keyword) { + + } + + /** + * + * @returns true if the next page was loaded correctely + */ async next_page() { + + } + + async wait_for_results() { + } async detected() { diff --git a/src/node_scraper.js b/src/node_scraper.js index 61e7865..0c44a02 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -129,29 +129,13 @@ module.exports.handler = async function handler (event, context, callback) { } } - const page = await browser.newPage(); - - // block some assets to speed up scraping - if (config.block_assets === true) { - await page.setRequestInterception(true); - page.on('request', (req) => { - let type = req.resourceType(); - const block = ['stylesheet', 'font', 'image', 'media']; - if (block.includes(type)) { - req.abort(); - } else { - req.continue(); - } - }); - } - - results = await { - google: google.scrape_google_pup, + Scraper = { + google: google.GoogleScraper, google_news_old: google.scrape_google_news_old_pup, google_news: google.scrape_google_news_pup, google_image: google.scrape_google_image_pup, - bing: bing.scrape_bing_pup, - bing_news: bing.scrape_bing_news_pup, + bing: bing.BingScraper, + bing_news: bing.BingNewsScraper, infospace: infospace.scrape_infospace_pup, webcrawler: infospace.scrape_webcrawler_news_pup, baidu: baidu.scrape_baidu_pup, @@ -163,7 +147,16 @@ module.exports.handler = async function handler (event, context, callback) { reuters: tickersearch.scrape_reuters_finance_pup, cnbc: tickersearch.scrape_cnbc_finance_pup, marketwatch: tickersearch.scrape_marketwatch_finance_pup, - }[config.search_engine](page, config, context, pluggable); + }[config.search_engine]; + + let scraper = new Scraper({ + browser: browser, + config: config, + context: context, + pluggable: pluggable, + }); + + let results = await scraper.run(); if (pluggable.close_browser) {