implemented generic scraping class

This commit is contained in:
Nikolai Tschacher 2019-01-30 16:05:08 +01:00
parent 9e62f23451
commit 4306848657
7 changed files with 463 additions and 320 deletions

View File

@ -25,6 +25,12 @@
- implement duckduckgo scraping
30.1.2019
- modify all scrapers to use the generic class where it makes sense
- Bing, Baidu, Google, Duckduckgo
TODO:
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
- add proxy support

View File

@ -1 +1 @@
{"scraping scrapeulous.com":{"1":{"time":"Tue, 29 Jan 2019 21:46:30 GMT","num_results":"Ungefähr 139 Ergebnisse (0,29 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":1},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":2},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"24.12.2018 - ","rank":3},{"link":"https://github.com/NikolaiT/GoogleScraper/blob/master/README.md","title":"GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/blob/.../README.md","date":"","rank":4},{"link":"https://googlescraper.readthedocs.io/","title":"Welcome to GoogleScraper's documentation! 
— GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen","snippet":"Welcome to GoogleScraper's documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...","visible_link":"https://googlescraper.readthedocs.io/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.","visible_link":"https://incolumitas.com/","date":"","rank":7},{"link":"https://en.wikipedia.org/wiki/Search_engine_scraping","title":"Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen","snippet":"Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...","visible_link":"https://en.wikipedia.org/wiki/Search_engine_scraping","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... 
For this reason, I created the web service scrapeulous.com.","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}} {"trump":{"1":{"time":"Wed, 30 Jan 2019 15:03:46 GMT","num_results":"Ungefähr 1.450.000.000 Ergebnisse (0,49 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://de.wikipedia.org/wiki/Donald_Trump","title":"Donald Trump Wikipediahttps://de.wikipedia.org/wiki/Donald_TrumpIm CacheÄhnliche Seiten","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New York City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"https://de.wikipedia.org/wiki/Donald_Trump","date":"","rank":1},{"link":"https://www.merkur.de/politik/milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jagen-news-zr-11469011.html","title":"Milliardär will Trump mit unfassbarer Summe aus dem Amt jagen ...https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...Im Cache","snippet":"vor 1 Stunde - Donald Trump: Der längste Shutdown in der Geschichte der USA ist beendet. 
Die Rede zur Lage der Nation steht bevor und ein Milliardär fährt ...","visible_link":"https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...","date":"vor 1 Stunde - ","rank":2},{"link":"http://www.spiegel.de/wirtschaft/impeachment-tom-steyer-wirbt-fuer-amtsenthebung-von-donald-trump-a-1250683.html","title":"Impeachment: Tom Steyer wirbt für Amtsenthebung von Donald Trump ...www.spiegel.de Wirtschaft Donald Trump","snippet":"vor 5 Stunden - US-Milliardär Tom Steyer 100 Millionen Dollar, um Trump aus dem Amt zu jagen. Der Milliardär und Ex-Fondsmanager Tom Steyer ist ...","visible_link":"www.spiegel.de Wirtschaft Donald Trump","date":"vor 5 Stunden - ","rank":3},{"link":"http://www.spiegel.de/thema/donald_trump/","title":"Donald Trump - SPIEGEL ONLINEwww.spiegel.de Politik AuslandÄhnliche Seiten","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als republikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend gewann der ...","visible_link":"www.spiegel.de Politik Ausland","date":"","rank":4},{"link":"https://www.faz.net/aktuell/politik/ausland/gefahren-fuer-amerika-geheimdienste-widersprechen-trump-16015734.html","title":"Gefahren für Amerika: Geheimdienste widersprechen Trump - Fazhttps://www.faz.net Politik Ausland","snippet":"vor 1 Stunde - Nordkorea rüstet ab, Iran auf und der „IS“ ist besiegt so sieht es Donald Trump. Ein Bericht der amerikanischen Geheimdienste über ...","visible_link":"https://www.faz.net Politik Ausland","date":"vor 1 Stunde - ","rank":5},{"link":"https://www.faz.net/aktuell/politik/thema/donald-trump","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidentenhttps://www.faz.net/aktuell/politik/thema/donald-trump","snippet":"Donald Trump ist der 45. US-Präsident. 
▷ Lesen Sie hier alle Nachrichten der FAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"https://www.faz.net/aktuell/politik/thema/donald-trump","date":"","rank":6},{"link":"https://www.donaldjtrump.com/","title":"Donald J. Trump for President: Homehttps://www.donaldjtrump.com/Im CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"Help continue our promise to Make America Great Again!","visible_link":"https://www.donaldjtrump.com/","date":"","rank":7},{"link":"https://www.zeit.de/thema/donald-trump","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeithttps://www.zeit.de Politik","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und außenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"https://www.zeit.de Politik","date":"","rank":8}]}}}

8
run.js
View File

@ -15,18 +15,18 @@ let config = {
search_engine: 'google', search_engine: 'google',
// whether debug information should be printed // whether debug information should be printed
// debug info is useful for developers when debugging // debug info is useful for developers when debugging
debug: false, debug: true,
// whether verbose program output should be printed // whether verbose program output should be printed
// this output is informational // this output is informational
verbose: false, verbose: true,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['scraping scrapeulous.com'], keywords: ['trump', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '', keyword_file: '',
// the number of pages to scrape for each keyword // the number of pages to scrape for each keyword
num_pages: 1, num_pages: 1,
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: false,
// path to output file, data will be stored in JSON // path to output file, data will be stored in JSON
output_file: 'data.json', output_file: 'data.json',
// whether to prevent images, css, fonts from being loaded // whether to prevent images, css, fonts from being loaded

View File

@ -1,203 +1,164 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const sfunctions = require('./functions.js'); const Scraper = require('./se_scraper');
class BingScraper extends Scraper {
    /**
     * Parses a Bing results page into a structured result object.
     *
     * @param {string} html - raw page source of the SERP
     * @returns {object} timestamp, result count, flags and cleaned results
     */
    parse(html) {
        const $ = cheerio.load(html);

        // Gather every organic result element.
        const found = [];
        $('#b_content #b_results .b_algo').each((idx, element) => {
            const el = $(element);
            found.push({
                link: el.find('h2 a').attr('href'),
                title: el.find('h2').text(),
                snippet: el.find('.b_caption p').text(),
                visible_link: el.find('cite').text(),
            });
        });

        const no_results = this.no_results(
            ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
            $('#b_results').text()
        );

        const effective_query = $('#sp_requery a').first().text() || '';

        // Keep only entries with a usable link and title; rank is the
        // 1-based position within the raw result list.
        const cleaned = [];
        found.forEach((entry, idx) => {
            if (entry.link && entry.link.trim() && entry.title && entry.title.trim()) {
                entry.rank = idx + 1;
                cleaned.push(entry);
            }
        });

        return {
            time: (new Date()).toUTCString(),
            no_results: no_results,
            effective_query: effective_query,
            num_results: $('#b_content .sb_count').text(),
            results: cleaned,
        };
    }

    /**
     * Navigates to bing.com and waits for the search box.
     * @returns {Promise<boolean>} true when the start page is ready.
     */
    async load_start_page() {
        try {
            await this.page.goto('https://www.bing.com/');
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
            return true;
        } catch (e) {
            return false;
        }
    }

    /**
     * Types the keyword into the search box and submits with Enter.
     */
    async search_keyword(keyword) {
        const searchBox = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await searchBox.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Follows the pagination link when one exists.
     * @returns {Promise<boolean>} false when there is no further page.
     */
    async next_page() {
        const pagination = await this.page.$('.sb_pagN', {timeout: 1000});
        if (!pagination) {
            return false;
        }
        await pagination.click();
        await this.page.waitForNavigation();
        return true;
    }

    // Waits until the result container is rendered, plus a small settle delay.
    async wait_for_results() {
        await this.page.waitForSelector('#b_content', { timeout: 5000 });
        await this.sleep(500);
    }

    async detected() {
        // TODO: I was actually never detected by bing. those are good guys.
    }
}
class BingNewsScraper extends Scraper {
    /**
     * Parses a Bing News results page into a structured result object.
     *
     * @param {string} html - raw page source of the news SERP
     * @returns {object} timestamp and cleaned results
     */
    parse(html) {
        const $ = cheerio.load(html);

        // Gather every news item element.
        const found = [];
        $('#algocore .newsitem').each((idx, element) => {
            const el = $(element);
            found.push({
                link: el.attr('url'),
                title: el.find('a.title').text(),
                snippet: el.find('.snippet').text(),
                date: el.find('.source span').last().text(),
            });
        });

        // Keep only entries with a usable link and title; rank is the
        // 1-based position within the raw result list.
        const cleaned = [];
        found.forEach((entry, idx) => {
            if (entry.link && entry.link.trim() && entry.title && entry.title.trim()) {
                entry.rank = idx + 1;
                cleaned.push(entry);
            }
        });

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
        };
    }

    /**
     * Navigates to bing.com/news and waits for the search box. Optionally
     * pauses so the operator can adjust search settings manually.
     * @returns {Promise<boolean>} true when the start page is ready.
     */
    async load_start_page() {
        try {
            await this.page.goto('https://www.bing.com/news/search?');
            if (this.config.set_manual_settings === true) {
                console.log('Sleeping 30 seconds. Set your settings now.');
                await this.sleep(30000);
            }
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
            return true;
        } catch (e) {
            return false;
        }
    }

    /**
     * Types the keyword into the search box and submits with Enter.
     */
    async search_keyword(keyword) {
        const searchBox = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await searchBox.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Follows the pagination link when one exists.
     * @returns {Promise<boolean>} false when there is no further page.
     */
    async next_page() {
        const pagination = await this.page.$('.sb_pagN', {timeout: 1000});
        if (!pagination) {
            return false;
        }
        await pagination.click();
        await this.page.waitForNavigation();
        return true;
    }

    // Waits until the news container is rendered, plus a settle delay.
    async wait_for_results() {
        await this.page.waitForSelector('#news', { timeout: 5000 });
        await this.sleep(2000);
    }

    async detected() {
        // TODO: I was actually never detected by bing news.
    }
}
module.exports = { module.exports = {
scrape_bing_pup: scrape_bing_pup, BingNewsScraper: BingNewsScraper,
scrape_bing_news_pup: scrape_bing_news_pup, BingScraper: BingScraper,
}; };
/**
 * Scrapes bing.com organic results for every keyword in `event.keywords`,
 * paginating up to `event.num_pages` pages per keyword.
 *
 * @param {object} page - puppeteer page object
 * @param {object} event - scraping configuration (keywords, num_pages, verbose, sleep_range, ...)
 * @param {object} context - opaque user context forwarded to pluggable hooks
 * @param {object} pluggable - optional hooks (e.g. before_keyword_scraped)
 * @returns {Promise<object>} results keyed by keyword, then by page number
 */
async function scrape_bing_pup(page, event, context, pluggable) {
    // Fix: initialize before the early return below — previously the catch
    // branch returned `results` while the hoisted var was still undefined.
    var results = {};
    await page.goto('https://www.bing.com/');
    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }
    let keywords = event.keywords;
    for (var i = 0; i < keywords.length; i++) {
        // Fix: `keyword` was assigned without declaration (implicit global).
        let keyword = keywords[i];
        results[keyword] = {};
        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }
        try {
            const input = await page.$('input[name="q"]');
            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
            await sfunctions.sleep(50);
            await input.focus();
            await page.keyboard.press("Enter");
            let page_num = 1;
            do {
                if (event.verbose === true) {
                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
                }
                if (event.sleep_range) {
                    await sfunctions.random_sleep(event);
                }
                await page.waitForSelector('#b_content', { timeout: 5000 });
                await sfunctions.sleep(500);
                let html = await page.content();
                results[keyword][page_num] = parse(html);
                page_num += 1;
                // Stop when there is no further pagination link.
                let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
                if (!next_page_link) {
                    break;
                }
                await next_page_link.click();
                await page.waitForNavigation();
            } while (page_num <= event.num_pages)
        } catch (e) {
            // Best-effort: log and continue with the next keyword.
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}
/**
 * Extracts organic search results from a Bing SERP html string.
 *
 * @param {string} html - raw page source
 * @returns {object} timestamp, result count, flags and cleaned results
 */
function parse(html) {
    const $ = cheerio.load(html);

    // Gather every organic result element.
    const found = [];
    $('#b_content #b_results .b_algo').each((idx, element) => {
        const el = $(element);
        found.push({
            link: el.find('h2 a').attr('href'),
            title: el.find('h2').text(),
            snippet: el.find('.b_caption p').text(),
            visible_link: el.find('cite').text(),
        });
    });

    const no_results = sfunctions.no_results(
        ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
        $('#b_results').text()
    );

    const effective_query = $('#sp_requery a').first().text() || '';

    // Keep only entries with a usable link and title; rank is the
    // 1-based position within the raw result list.
    const cleaned = [];
    found.forEach((entry, idx) => {
        if (entry.link && entry.link.trim() && entry.title && entry.title.trim()) {
            entry.rank = idx + 1;
            cleaned.push(entry);
        }
    });

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        effective_query: effective_query,
        num_results: $('#b_content .sb_count').text(),
        results: cleaned,
    };
}
/**
 * Scrapes bing.com/news results for every keyword in `event.keywords`
 * (first results page only).
 *
 * @param {object} page - puppeteer page object
 * @param {object} event - scraping configuration (keywords, debug, sleep_range, ...)
 * @param {object} context - opaque user context forwarded to pluggable hooks
 * @param {object} pluggable - optional hooks (e.g. before_keyword_scraped)
 * @returns {Promise<object>} parsed results keyed by keyword
 */
async function scrape_bing_news_pup(page, event, context, pluggable) {
    // Fix: initialize before the early return below — previously the catch
    // branch returned `results` while the hoisted var was still undefined.
    var results = {};
    await page.goto('https://www.bing.com/news/search?');
    if (event.set_manual_settings === true) {
        console.log('Sleeping 30 seconds. Set your settings now.');
        await sfunctions.sleep(30000);
    }
    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }
    let keywords = event.keywords;
    for (var i = 0; i < keywords.length; i++) {
        // Fix: `keyword` was assigned without declaration (implicit global).
        let keyword = keywords[i];
        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }
        try {
            const input = await page.$('input[name="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");
            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }
            await page.waitForSelector('#news', { timeout: 5000 });
            await sfunctions.sleep(2000);
            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }
            let html = await page.content();
            results[keyword] = parse_bing_news(html);
        } catch (e) {
            // Best-effort: log and continue with the next keyword.
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }
    return results;
}
/**
 * Extracts news items from a Bing News SERP html string.
 *
 * @param {string} html - raw page source
 * @returns {object} timestamp and cleaned results
 */
function parse_bing_news(html) {
    const $ = cheerio.load(html);

    // Gather every news item element.
    const found = [];
    $('#algocore .newsitem').each((idx, element) => {
        const el = $(element);
        found.push({
            link: el.attr('url'),
            title: el.find('a.title').text(),
            snippet: el.find('.snippet').text(),
            date: el.find('.source span').last().text(),
        });
    });

    // Keep only entries with a usable link and title; rank is the
    // 1-based position within the raw result list.
    const cleaned = [];
    found.forEach((entry, idx) => {
        if (entry.link && entry.link.trim() && entry.title && entry.title.trim()) {
            entry.rank = idx + 1;
            cleaned.push(entry);
        }
    });

    return {
        time: (new Date()).toUTCString(),
        results: cleaned,
    };
}

View File

@ -1,104 +1,98 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const sfunctions = require('./functions.js'); const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
// Reconstructed from the interleaved diff text: this is the new-side
// GoogleScraper built on the generic Scraper base class (the deleted
// old-side scrape_google_pup residue that was fused into these lines
// is removed).
class GoogleScraper extends Scraper {
    /**
     * Parses a Google results page into a structured result object.
     *
     * @param {string} html - raw page source of the SERP
     * @returns {object} timestamp, result count, flags and cleaned results
     */
    parse(html) {
        const $ = cheerio.load(html);

        const results = [];
        $('#center_col .g').each((i, link) => {
            results.push({
                link: $(link).find('.r a').attr('href'),
                title: $(link).find('.r a').text(),
                snippet: $(link).find('span.st').text(),
                visible_link: $(link).find('.r cite').text(),
                date: $(link).find('span.f').text() || '',
            });
        });

        // Consistency fix: use the helper inherited from the generic
        // Scraper class (as BingScraper does) instead of the identical
        // file-local sfunctions.no_results.
        let no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for', 'Ergebnisse für', 'Showing results for'],
            $('#main').text()
        );

        // Google's "showing results for" correction, when present.
        let effective_query = $('#fprsl').text() || '';
        if (!effective_query) {
            effective_query = $('#fprs a').text();
        }

        // Keep only entries with a usable link and title; rank is the
        // 1-based position within the raw result list.
        const cleaned = [];
        for (let i = 0; i < results.length; i++) {
            let res = results[i];
            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = i + 1;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            num_results: $('#resultStats').text(),
            no_results: no_results,
            effective_query: effective_query,
            results: cleaned
        };
    }

    /**
     * Opens google.com and waits for the search input.
     * @returns {Promise<boolean>} true when the start page is ready.
     */
    async load_start_page() {
        await this.page.goto('https://www.google.com/');
        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
        return true;
    }

    /**
     * Types the keyword into the search box and submits with Enter.
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Follows the "next" pagination link when one exists.
     * @returns {Promise<boolean>} false when there is no further page.
     */
    async next_page() {
        let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();
        await this.page.waitForNavigation();
        return true;
    }

    // Waits until the result column is rendered, plus a small settle delay.
    async wait_for_results() {
        await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * Heuristic detection of Google's scraping block page.
     * NOTE(review): '/sorry/' is part of the block page's URL, not normally
     * its title — checking this.page.url() may be the intended behavior;
     * confirm before changing.
     */
    async detected() {
        const title = await this.page.title();
        let html = await this.page.content();
        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
    }
}
async function scrape_google_pup_dr(page, event, context, pluggable) { async function scrape_google_pup_dr(page, event, context, pluggable) {
let keywords = event.keywords; let keywords = event.keywords;
first = keywords[0]; first = keywords[0];
@ -650,4 +644,13 @@ function parse_google_news_results(html) {
no_results: no_results, no_results: no_results,
effective_query: effective_query, effective_query: effective_query,
} }
} }
module.exports = {
scrape_google_news_old_pup: scrape_google_news_old_pup,
GoogleScraper: GoogleScraper,
scrape_google_image_pup: scrape_google_image_pup,
scrape_google_news_pup: scrape_google_news_pup,
scrape_google_pup_dr: scrape_google_pup_dr,
};

View File

@ -3,34 +3,214 @@ const start_url = {
}; };
/* /*
Get useful JS knowledge and get awesome...
Read this shit: https://javascript.info/class-inheritance Read this shit: https://javascript.info/class-inheritance
And this: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e
*/ */
module.exports = class Scraper { module.exports = class Scraper {
constructor(options = {}) { constructor(options = {}) {
const { const {
searchEngine = 'google', browser = null,
numPages = 1, config = {},
context = {},
pluggable = null, pluggable = null,
} = options; } = options;
this.pluggable = pluggable; this.pluggable = pluggable;
this.searchEngine = searchEngine; this.browser = browser;
this.numPages = numPages; this.config = config;
this.results = {} this.context = context;
this.STANDARD_TIMEOUT = 8000;
// longer timeout when using proxies
this.PROXY_TIMEOUT = 15000;
this.SOLVE_CAPTCHA_TIME = 45000;
this.results = {};
} }
async run() {
let do_continue = await this.load_search_engine();
if (!do_continue) {
console.error('Failed to load the search engine: load_search_engine()');
return this.results;
}
await this.scraping_loop();
return this.results;
}
/**
* Action that runs only once in the beginning of the
* scraping procedure.
*
* @returns {Promise<void>} true if everything is correct.
*/
async load_search_engine() { async load_search_engine() {
this.page = await this.browser.newPage();
// block some assets to speed up scraping
if (this.config.block_assets === true) {
await this.page.setRequestInterception(true);
this.page.on('request', (req) => {
let type = req.resourceType();
const block = ['stylesheet', 'font', 'image', 'media'];
if (block.includes(type)) {
req.abort();
} else {
req.continue();
}
});
}
return await this.load_start_page();
} }
async search_keyword() { /**
* Each scraper basically iterates over a list of
* keywords and a list of pages. This is the generic
* method for that.
*
* @returns {Promise<void>}
*/
async scraping_loop() {
for (let keyword of this.config.keywords) {
this.results[keyword] = {};
if (this.pluggable.before_keyword_scraped) {
await this.pluggable.before_keyword_scraped({
keyword: keyword,
page: this.page,
event: this.config,
context: this.context,
});
}
let page_num = 1;
try {
await this.search_keyword(keyword);
do {
if (this.config.verbose === true) {
console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
}
await this.wait_for_results();
if (event.sleep_range) {
await this.random_sleep();
}
let html = await this.page.content();
this.results[keyword][page_num] = this.parse(html);
page_num += 1;
if (await this.next_page() === false) {
break;
}
} while (page_num < event.num_pages);
} catch (e) {
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
if (await this.detected() === true) {
console.error(`${this.config.search_engine} DETECTED the scraping!`);
if (this.config.is_local === true) {
await this.sleep(this.SOLVE_CAPTCHA_TIME);
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
// expect that user filled out necessary captcha
} else {
break;
}
} else {
// some other error, quit scraping process if stuff is broken
if (this.config.is_local === true) {
console.error('You have 30 seconds to fix this.');
await this.sleep(30000);
} else {
break;
}
}
}
}
} }
parse() { sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
async random_sleep() {
const [min, max] = this.config.sleep_range;
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
if (this.config.debug === true) {
console.log(`Sleeping for ${rand}s`);
}
await this.sleep(rand * 1000);
}
async set_input_value(selector, value) {
await this.page.waitFor(selector);
await this.page.evaluate((value, selector) => {
return document.querySelector(selector).value = value;
}, value, selector);
}
no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1});
}
parse(html) {
} }
/**
*
* @returns true if startpage was loaded correctly.
*/
async load_start_page() {
}
/**
* Searches the keyword by inputting it into the form and hitting enter
* or something similar.
*
* @param keyword
* @returns {Promise<void>}
*/
async search_keyword(keyword) {
}
/**
*
* @returns true if the next page was loaded correctely
*/
async next_page() { async next_page() {
}
async wait_for_results() {
} }
async detected() { async detected() {

View File

@ -129,29 +129,13 @@ module.exports.handler = async function handler (event, context, callback) {
} }
} }
const page = await browser.newPage(); Scraper = {
google: google.GoogleScraper,
// block some assets to speed up scraping
if (config.block_assets === true) {
await page.setRequestInterception(true);
page.on('request', (req) => {
let type = req.resourceType();
const block = ['stylesheet', 'font', 'image', 'media'];
if (block.includes(type)) {
req.abort();
} else {
req.continue();
}
});
}
results = await {
google: google.scrape_google_pup,
google_news_old: google.scrape_google_news_old_pup, google_news_old: google.scrape_google_news_old_pup,
google_news: google.scrape_google_news_pup, google_news: google.scrape_google_news_pup,
google_image: google.scrape_google_image_pup, google_image: google.scrape_google_image_pup,
bing: bing.scrape_bing_pup, bing: bing.BingScraper,
bing_news: bing.scrape_bing_news_pup, bing_news: bing.BingNewsScraper,
infospace: infospace.scrape_infospace_pup, infospace: infospace.scrape_infospace_pup,
webcrawler: infospace.scrape_webcrawler_news_pup, webcrawler: infospace.scrape_webcrawler_news_pup,
baidu: baidu.scrape_baidu_pup, baidu: baidu.scrape_baidu_pup,
@ -163,7 +147,16 @@ module.exports.handler = async function handler (event, context, callback) {
reuters: tickersearch.scrape_reuters_finance_pup, reuters: tickersearch.scrape_reuters_finance_pup,
cnbc: tickersearch.scrape_cnbc_finance_pup, cnbc: tickersearch.scrape_cnbc_finance_pup,
marketwatch: tickersearch.scrape_marketwatch_finance_pup, marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine](page, config, context, pluggable); }[config.search_engine];
let scraper = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});
let results = await scraper.run();
if (pluggable.close_browser) { if (pluggable.close_browser) {