mirror of https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 09:38:06 +02:00

implemented generic scraping class

This commit is contained in:
parent 9e62f23451
commit 4306848657

TODO.txt | 6
@@ -25,6 +25,12 @@
 - implement duckduckgo scraping
 
+30.1.2019
+
+- modify all scrapers to use the generic class where it makes sense
+    - Bing, Baidu, Google, Duckduckgo
+
 TODO:
 - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
 - add proxy support
data.json
@@ -1 +1 @@
-{"scraping scrapeulous.com":{"1":{"time":"Tue, 29 Jan 2019 21:46:30 GMT","num_results":"Ungefähr 139 Ergebnisse (0,29 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":1},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":2},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"24.12.2018 - ","rank":3},{"link":"https://github.com/NikolaiT/GoogleScraper/blob/master/README.md","title":"GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/blob/.../README.md","date":"","rank":4},{"link":"https://googlescraper.readthedocs.io/","title":"Welcome to GoogleScraper's documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen","snippet":"Welcome to GoogleScraper's documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...","visible_link":"https://googlescraper.readthedocs.io/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.","visible_link":"https://incolumitas.com/","date":"","rank":7},{"link":"https://en.wikipedia.org/wiki/Search_engine_scraping","title":"Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen","snippet":"Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...","visible_link":"https://en.wikipedia.org/wiki/Search_engine_scraping","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}}
+{"trump":{"1":{"time":"Wed, 30 Jan 2019 15:03:46 GMT","num_results":"Ungefähr 1.450.000.000 Ergebnisse (0,49 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://de.wikipedia.org/wiki/Donald_Trump","title":"Donald Trump – Wikipediahttps://de.wikipedia.org/wiki/Donald_TrumpIm CacheÄhnliche Seiten","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New York City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"https://de.wikipedia.org/wiki/Donald_Trump","date":"","rank":1},{"link":"https://www.merkur.de/politik/milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jagen-news-zr-11469011.html","title":"Milliardär will Trump mit unfassbarer Summe aus dem Amt jagen ...https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...Im Cache","snippet":"vor 1 Stunde - Donald Trump: Der längste Shutdown in der Geschichte der USA ist beendet. Die Rede zur Lage der Nation steht bevor und ein Milliardär fährt ...","visible_link":"https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...","date":"vor 1 Stunde - ","rank":2},{"link":"http://www.spiegel.de/wirtschaft/impeachment-tom-steyer-wirbt-fuer-amtsenthebung-von-donald-trump-a-1250683.html","title":"Impeachment: Tom Steyer wirbt für Amtsenthebung von Donald Trump ...www.spiegel.de › Wirtschaft › Donald Trump","snippet":"vor 5 Stunden - US-Milliardär Tom Steyer 100 Millionen Dollar, um Trump aus dem Amt zu jagen. Der Milliardär und Ex-Fondsmanager Tom Steyer ist ...","visible_link":"www.spiegel.de › Wirtschaft › Donald Trump","date":"vor 5 Stunden - ","rank":3},{"link":"http://www.spiegel.de/thema/donald_trump/","title":"Donald Trump - SPIEGEL ONLINEwww.spiegel.de › Politik › AuslandÄhnliche Seiten","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als republikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend gewann der ...","visible_link":"www.spiegel.de › Politik › Ausland","date":"","rank":4},{"link":"https://www.faz.net/aktuell/politik/ausland/gefahren-fuer-amerika-geheimdienste-widersprechen-trump-16015734.html","title":"Gefahren für Amerika: Geheimdienste widersprechen Trump - Fazhttps://www.faz.net › Politik › Ausland","snippet":"vor 1 Stunde - Nordkorea rüstet ab, Iran auf und der „IS“ ist besiegt – so sieht es Donald Trump. Ein Bericht der amerikanischen Geheimdienste über ...","visible_link":"https://www.faz.net › Politik › Ausland","date":"vor 1 Stunde - ","rank":5},{"link":"https://www.faz.net/aktuell/politik/thema/donald-trump","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidentenhttps://www.faz.net/aktuell/politik/thema/donald-trump","snippet":"Donald Trump ist der 45. US-Präsident. ▷ Lesen Sie hier alle Nachrichten der FAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"https://www.faz.net/aktuell/politik/thema/donald-trump","date":"","rank":6},{"link":"https://www.donaldjtrump.com/","title":"Donald J. Trump for President: Homehttps://www.donaldjtrump.com/Im CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"Help continue our promise to Make America Great Again!","visible_link":"https://www.donaldjtrump.com/","date":"","rank":7},{"link":"https://www.zeit.de/thema/donald-trump","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeithttps://www.zeit.de › Politik","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und außenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"https://www.zeit.de › Politik","date":"","rank":8}]}}}
run.js | 8

@@ -15,18 +15,18 @@ let config = {
     search_engine: 'google',
     // whether debug information should be printed
     // debug info is useful for developers when debugging
-    debug: false,
+    debug: true,
     // whether verbose program output should be printed
     // this output is informational
-    verbose: false,
+    verbose: true,
     // an array of keywords to scrape
-    keywords: ['scraping scrapeulous.com'],
+    keywords: ['trump', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // the number of pages to scrape for each keyword
     num_pages: 1,
     // whether to start the browser in headless mode
-    headless: true,
+    headless: false,
     // path to output file, data will be stored in JSON
     output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded
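Note: a minimal sketch of how run.js typically consumes this config block; the scrape() entry point and callback signature follow the package's documented usage, not this diff:

    // sketch, assuming the module root exports scrape(config, callback)
    const se_scraper = require('./index.js');

    se_scraper.scrape(config, (err, response) => {
        if (err) {
            console.error(err);
        }
        // results are keyed by keyword, then by page number,
        // and also written to config.output_file ('data.json')
        console.log(JSON.stringify(response, null, 2));
    });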
bing.js

@@ -1,78 +1,9 @@
 const cheerio = require('cheerio');
-const sfunctions = require('./functions.js');
+const Scraper = require('./se_scraper');
 
-module.exports = {
-    scrape_bing_pup: scrape_bing_pup,
-    scrape_bing_news_pup: scrape_bing_news_pup,
-};
+class BingScraper extends Scraper {
 
-async function scrape_bing_pup(page, event, context, pluggable) {
-    await page.goto('https://www.bing.com/');
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-
-        keyword = keywords[i];
-        results[keyword] = {};
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            let page_num = 1;
-
-            do {
-                if (event.verbose === true) {
-                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
-                }
-                if (event.sleep_range) {
-                    await sfunctions.random_sleep(event);
-                }
-                await page.waitForSelector('#b_content', { timeout: 5000 });
-                await sfunctions.sleep(500);
-                let html = await page.content();
-                results[keyword][page_num] = parse(html);
-
-                page_num += 1;
-
-                let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
-                if (!next_page_link) {
-                    break;
-                }
-                await next_page_link.click();
-                await page.waitForNavigation();
-
-            } while (page_num <= event.num_pages)
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}: ${e}`);
-        }
-    }
-
-    return results;
-}
-
-function parse(html) {
+    parse(html) {
         // load the page source into cheerio
         const $ = cheerio.load(html);
 
@@ -87,7 +18,7 @@ function parse(html) {
             })
         });
 
-    let no_results = sfunctions.no_results(
+        let no_results = this.no_results(
             ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
             $('#b_results').text()
         );
@@ -110,69 +41,51 @@ function parse(html) {
             num_results: $('#b_content .sb_count').text(),
             results: cleaned,
         }
     }
 
-async function scrape_bing_news_pup(page, event, context, pluggable) {
-    await page.goto('https://www.bing.com/news/search?');
-
-    if (event.set_manual_settings === true) {
-        console.log('Sleeping 30 seconds. Set your settings now.');
-        await sfunctions.sleep(30000);
-    }
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-
-        keyword = keywords[i];
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            // overwrites last text in input
-            await input.click({ clickCount: 3 });
-            await input.type(keyword);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            if (event.sleep_range) {
-                await sfunctions.random_sleep(event);
-            }
-
-            await page.waitForSelector('#news', { timeout: 5000 });
-            await sfunctions.sleep(2000);
-
-            if (event.debug === true && event.is_local === true) {
-                await page.screenshot({path: `debug/${keyword}.png`});
-            }
-
-            let html = await page.content();
-            results[keyword] = parse_bing_news(html);
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}: ${e}`);
-        }
-    }
-
-    return results;
-}
+    async load_start_page() {
+        try {
+            await this.page.goto('https://www.bing.com/');
+            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+        } catch (e) {
+            return false;
+        }
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('.sb_pagN');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#b_content', { timeout: 5000 });
+        await this.sleep(500);
+    }
+
+    async detected() {
+        // TODO: I was never actually detected by Bing. Those are good guys.
+    }
+}
 
-function parse_bing_news(html) {
+class BingNewsScraper extends Scraper {
 
+    parse(html) {
         // load the page source into cheerio
         const $ = cheerio.load(html);
 
@@ -200,4 +113,52 @@ function parse_bing_news(html) {
             time: (new Date()).toUTCString(),
             results: cleaned,
         }
     }
+
+    async load_start_page() {
+        try {
+            await this.page.goto('https://www.bing.com/news/search?');
+            if (this.config.set_manual_settings === true) {
+                console.log('Sleeping 30 seconds. Set your settings now.');
+                await this.sleep(30000);
+            }
+            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+        } catch (e) {
+            return false;
+        }
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('.sb_pagN');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#news', { timeout: 5000 });
+        await this.sleep(2000);
+    }
+
+    async detected() {
+        // TODO: I was never actually detected by Bing News.
+    }
+}
+
+module.exports = {
+    BingNewsScraper: BingNewsScraper,
+    BingScraper: BingScraper,
+};
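Note: the new classes can also be driven standalone. A minimal sketch (this wiring is normally done by the handler, not manually; the config fields mirror run.js):

    const puppeteer = require('puppeteer');
    const { BingScraper } = require('./bing');

    (async () => {
        const browser = await puppeteer.launch({ headless: true });
        const scraper = new BingScraper({
            browser: browser,
            config: {
                search_engine: 'bing',
                keywords: ['scraping'],
                num_pages: 1,
                verbose: true,
            },
        });
        // run() opens a page, loads bing.com, searches every keyword
        // and parses each result page
        const results = await scraper.run();
        console.log(JSON.stringify(results, null, 2));
        await browser.close();
    })();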
google.js

@@ -1,104 +1,98 @@
 const cheerio = require('cheerio');
 const sfunctions = require('./functions.js');
+const Scraper = require('./se_scraper');
 
-module.exports = {
-    scrape_google_news_old_pup: scrape_google_news_old_pup,
-    scrape_google_pup: scrape_google_pup,
-    scrape_google_image_pup: scrape_google_image_pup,
-    scrape_google_news_pup: scrape_google_news_pup,
-    scrape_google_pup_dr: scrape_google_pup_dr,
-};
+class GoogleScraper extends Scraper {
 
-const STANDARD_TIMEOUT = 8000;
-const SOLVE_CAPTCHA_TIME = 45000;
-
-async function scrape_google_pup(page, event, context, pluggable) {
-    await page.goto('https://www.google.com/');
-
-    try {
-        await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
-    } catch (e) {
-        return results;
-    }
-
-    let keywords = event.keywords;
-    var results = {};
-
-    for (var i = 0; i < keywords.length; i++) {
-        keyword = keywords[i];
-        results[keyword] = {};
-
-        if (pluggable.before_keyword_scraped) {
-            await pluggable.before_keyword_scraped({
-                keyword: keyword,
-                page: page,
-                event: event,
-                context: context,
-            });
-        }
-
-        try {
-            const input = await page.$('input[name="q"]');
-            await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
-            await sfunctions.sleep(50);
-            await input.focus();
-            await page.keyboard.press("Enter");
-
-            let page_num = 1;
-
-            do {
-                if (event.verbose === true) {
-                    console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
-                }
-                if (event.sleep_range) {
-                    await sfunctions.random_sleep(event);
-                }
-                await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
-                await sfunctions.sleep(500);
-                let html = await page.content();
-                results[keyword][page_num] = parse_google_results(html);
-
-                page_num += 1;
-
-                let next_page_link = await page.$('#pnnext', {timeout: 1000});
-                if (!next_page_link) {
-                    break;
-                }
-                await next_page_link.click();
-                await page.waitForNavigation();
-
-            } while (page_num <= event.num_pages)
-
-        } catch (e) {
-            console.error(`Problem with scraping ${keyword}.`);
-            console.error(e);
-
-            if (await scraping_detected(page) === true) {
-                console.error('Google detected the scraping. Aborting.');
-
-                if (event.is_local === true) {
-                    await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
-                    console.error('You have 45 seconds to enter the captcha.');
-                    // expect that user filled out necessary captcha
-                } else {
-                    return results;
-                }
-            } else {
-                // some other error, quit scraping process if stuff is broken
-                if (event.is_local === true) {
-                    console.error('You have 30 seconds to fix this.');
-                    await sfunctions.sleep(30000);
-                } else {
-                    return results;
-                }
-            }
-        }
-    }
-
-    return results;
-}
+    parse(html) {
+        // load the page source into cheerio
+        const $ = cheerio.load(html);
+
+        // perform queries
+        const results = [];
+        $('#center_col .g').each((i, link) => {
+            results.push({
+                link: $(link).find('.r a').attr('href'),
+                title: $(link).find('.r a').text(),
+                snippet: $(link).find('span.st').text(),
+                visible_link: $(link).find('.r cite').text(),
+                date: $(link).find('span.f').text() || '',
+            })
+        });
+
+        let no_results = sfunctions.no_results(
+            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
+             'No results found for', 'Ergebnisse für', 'Showing results for'],
+            $('#main').text()
+        );
+
+        let effective_query = $('#fprsl').text() || '';
+        if (!effective_query) {
+            effective_query = $('#fprs a').text();
+        }
+
+        const cleaned = [];
+        for (var i=0; i < results.length; i++) {
+            let res = results[i];
+            if (res.link && res.link.trim() && res.title && res.title.trim()) {
+                res.rank = i+1;
+                cleaned.push(res);
+            }
+        }
+
+        return {
+            time: (new Date()).toUTCString(),
+            num_results: $('#resultStats').text(),
+            no_results: no_results,
+            effective_query: effective_query,
+            results: cleaned
+        }
+    }
+
+    async load_start_page() {
+        await this.page.goto('https://www.google.com/');
+
+        try {
+            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+        } catch (e) {
+            return false;
+        }
+
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('input[name="q"]');
+        await this.set_input_value(`input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    async next_page() {
+        let next_page_link = await this.page.$('#pnnext');
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+        await this.page.waitForNavigation();
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
+        await this.sleep(500);
+    }
+
+    async detected() {
+        const title = await this.page.title();
+        let html = await this.page.content();
+        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
+    }
+}
 
 
 async function scrape_google_pup_dr(page, event, context, pluggable) {
     let keywords = event.keywords;
     first = keywords[0];
@@ -651,3 +645,12 @@ function parse_google_news_results(html) {
         effective_query: effective_query,
     }
 }
+
+
+module.exports = {
+    scrape_google_news_old_pup: scrape_google_news_old_pup,
+    GoogleScraper: GoogleScraper,
+    scrape_google_image_pup: scrape_google_image_pup,
+    scrape_google_news_pup: scrape_google_news_pup,
+    scrape_google_pup_dr: scrape_google_pup_dr,
+};
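Note: because parse() is a pure function of the page HTML, it can be exercised offline. A sketch (the saved-HTML path is a placeholder, not part of the repo):

    const fs = require('fs');
    const { GoogleScraper } = require('./google');

    const scraper = new GoogleScraper({ config: {} });
    const html = fs.readFileSync('google_serp.html', 'utf8');
    const parsed = scraper.parse(html);
    // parsed has the shape:
    // { time, num_results, no_results, effective_query,
    //   results: [{ link, title, snippet, visible_link, date, rank }, ...] }
    console.log(parsed.num_results, parsed.results.length);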
se_scraper.js

@@ -3,34 +3,214 @@ const start_url = {
 };
 
+/*
+    Useful JS background reading:
+
+    Class inheritance: https://javascript.info/class-inheritance
+    New ES2016-2018 features: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e
+*/
+
+module.exports = class Scraper {
+    constructor(options = {}) {
+        const {
+            searchEngine = 'google',
+            numPages = 1,
+            browser = null,
+            config = {},
+            context = {},
+            pluggable = null,
+        } = options;
+
+        this.pluggable = pluggable;
+        this.searchEngine = searchEngine;
+        this.numPages = numPages;
+        this.browser = browser;
+        this.config = config;
+        this.context = context;
+
+        this.STANDARD_TIMEOUT = 8000;
+        // longer timeout when using proxies
+        this.PROXY_TIMEOUT = 15000;
+        this.SOLVE_CAPTCHA_TIME = 45000;
+
+        this.results = {};
+    }
+
+    async run() {
+        let do_continue = await this.load_search_engine();
+
+        if (!do_continue) {
+            console.error('Failed to load the search engine: load_search_engine()');
+            return this.results;
+        }
+
+        await this.scraping_loop();
+
+        return this.results;
+    }
+
+    /**
+     * Action that runs only once at the beginning of the
+     * scraping procedure.
+     *
+     * @returns {Promise<boolean>} true if everything went well.
+     */
+    async load_search_engine() {
+        this.page = await this.browser.newPage();
+
+        // block some assets to speed up scraping
+        if (this.config.block_assets === true) {
+            await this.page.setRequestInterception(true);
+            this.page.on('request', (req) => {
+                let type = req.resourceType();
+                const block = ['stylesheet', 'font', 'image', 'media'];
+                if (block.includes(type)) {
+                    req.abort();
+                } else {
+                    req.continue();
+                }
+            });
+        }
+
-    async search_keyword() {
+        return await this.load_start_page();
     }
 
-    parse() {
+    /**
+     * Each scraper basically iterates over a list of
+     * keywords and a list of pages. This is the generic
+     * method for that.
+     *
+     * @returns {Promise<void>}
+     */
+    async scraping_loop() {
+        for (let keyword of this.config.keywords) {
+            this.results[keyword] = {};
+
+            if (this.pluggable && this.pluggable.before_keyword_scraped) {
+                await this.pluggable.before_keyword_scraped({
+                    keyword: keyword,
+                    page: this.page,
+                    event: this.config,
+                    context: this.context,
+                });
+            }
+
+            let page_num = 1;
+
+            try {
+                await this.search_keyword(keyword);
+
+                do {
+                    if (this.config.verbose === true) {
+                        console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
+                    }
+
+                    await this.wait_for_results();
+
+                    if (this.config.sleep_range) {
+                        await this.random_sleep();
+                    }
+
+                    let html = await this.page.content();
+                    this.results[keyword][page_num] = this.parse(html);
+
+                    page_num += 1;
+
+                    if (await this.next_page() === false) {
+                        break;
+                    }
+
+                } while (page_num <= this.config.num_pages);
+
+            } catch (e) {
+                console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
+
+                if (await this.detected() === true) {
+                    console.error(`${this.config.search_engine} DETECTED the scraping!`);
+
+                    if (this.config.is_local === true) {
+                        console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
+                        await this.sleep(this.SOLVE_CAPTCHA_TIME);
+                        // expect that the user filled out the necessary captcha
+                    } else {
+                        break;
+                    }
+                } else {
+                    // some other error; quit the scraping process if stuff is broken
+                    if (this.config.is_local === true) {
+                        console.error('You have 30 seconds to fix this.');
+                        await this.sleep(30000);
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    sleep(ms) {
+        return new Promise(resolve => {
+            setTimeout(resolve, ms);
+        });
+    }
+
+    async random_sleep() {
+        const [min, max] = this.config.sleep_range;
+        // generate a random number of seconds in [min, max]
+        let rand = Math.floor(Math.random() * (max - min + 1) + min);
+        if (this.config.debug === true) {
+            console.log(`Sleeping for ${rand}s`);
+        }
+        await this.sleep(rand * 1000);
+    }
+
+    async set_input_value(selector, value) {
+        await this.page.waitFor(selector);
+        await this.page.evaluate((value, selector) => {
+            return document.querySelector(selector).value = value;
+        }, value, selector);
+    }
+
+    no_results(needles, html) {
+        return !needles.map((needle) => { return html.indexOf(needle); })
+            .every((res) => { return res === -1; });
+    }
+
+    parse(html) {
+
+    }
+
+    /**
+     * @returns true if the start page was loaded correctly.
+     */
+    async load_start_page() {
+
+    }
+
+    /**
+     * Searches the keyword by inputting it into the form and hitting enter
+     * or something similar.
+     *
+     * @param keyword
+     * @returns {Promise<void>}
+     */
+    async search_keyword(keyword) {
+
+    }
+
+    /**
+     * @returns true if the next page was loaded correctly.
+     */
+    async next_page() {
+
+    }
+
+    async wait_for_results() {
+
+    }
+
+    async detected() {
+
+    }
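Note: with this base class, the TODO item "implement duckduckgo scraping" reduces to overriding the five hooks. A hypothetical sketch; the DuckDuckGo URL, selectors and pagination mechanism are assumptions, untested:

    const cheerio = require('cheerio');
    const Scraper = require('./se_scraper');

    class DuckduckgoScraper extends Scraper {

        parse(html) {
            const $ = cheerio.load(html);
            const results = [];
            // selectors are guesses; verify against the live page
            $('.result').each((i, el) => {
                results.push({
                    link: $(el).find('.result__a').attr('href'),
                    title: $(el).find('.result__a').text(),
                    snippet: $(el).find('.result__snippet').text(),
                    rank: i + 1,
                });
            });
            return {
                time: (new Date()).toUTCString(),
                results: results,
            };
        }

        async load_start_page() {
            try {
                await this.page.goto('https://duckduckgo.com/html/');
                await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
            } catch (e) {
                return false;
            }
            return true;
        }

        async search_keyword(keyword) {
            const input = await this.page.$('input[name="q"]');
            await this.set_input_value(`input[name="q"]`, keyword);
            await this.sleep(50);
            await input.focus();
            await this.page.keyboard.press("Enter");
        }

        async next_page() {
            // the HTML version paginates with a submit button (assumed selector)
            let next = await this.page.$('input[type="submit"][value="Next"]');
            if (!next) {
                return false;
            }
            await next.click();
            await this.page.waitForNavigation();
            return true;
        }

        async wait_for_results() {
            await this.page.waitForSelector('.results', { timeout: this.STANDARD_TIMEOUT });
            await this.sleep(500);
        }

        async detected() {
            // no known detection page for DuckDuckGo; assumed not applicable
            return false;
        }
    }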
@@ -129,29 +129,13 @@ module.exports.handler = async function handler (event, context, callback) {
         }
     }
 
-    const page = await browser.newPage();
-
-    // block some assets to speed up scraping
-    if (config.block_assets === true) {
-        await page.setRequestInterception(true);
-        page.on('request', (req) => {
-            let type = req.resourceType();
-            const block = ['stylesheet', 'font', 'image', 'media'];
-            if (block.includes(type)) {
-                req.abort();
-            } else {
-                req.continue();
-            }
-        });
-    }
-
-    results = await {
-        google: google.scrape_google_pup,
+    let Scraper = {
+        google: google.GoogleScraper,
         google_news_old: google.scrape_google_news_old_pup,
         google_news: google.scrape_google_news_pup,
         google_image: google.scrape_google_image_pup,
-        bing: bing.scrape_bing_pup,
-        bing_news: bing.scrape_bing_news_pup,
+        bing: bing.BingScraper,
+        bing_news: bing.BingNewsScraper,
         infospace: infospace.scrape_infospace_pup,
         webcrawler: infospace.scrape_webcrawler_news_pup,
         baidu: baidu.scrape_baidu_pup,
@@ -163,7 +147,16 @@ module.exports.handler = async function handler (event, context, callback) {
         reuters: tickersearch.scrape_reuters_finance_pup,
         cnbc: tickersearch.scrape_cnbc_finance_pup,
         marketwatch: tickersearch.scrape_marketwatch_finance_pup,
-    }[config.search_engine](page, config, context, pluggable);
+    }[config.search_engine];
+
+    let scraper = new Scraper({
+        browser: browser,
+        config: config,
+        context: context,
+        pluggable: pluggable,
+    });
+
+    let results = await scraper.run();
 
     if (pluggable.close_browser) {