mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-20 17:47:49 +02:00
implemented generic scraping class
This commit is contained in:
parent
9e62f23451
commit
4306848657
6
TODO.txt
6
TODO.txt
@ -25,6 +25,12 @@
|
|||||||
|
|
||||||
- implement duckduckgo scraping
|
- implement duckduckgo scraping
|
||||||
|
|
||||||
|
|
||||||
|
30.1.2019
|
||||||
|
|
||||||
|
- modify all scrapers to use the generic class where it makes sense
|
||||||
|
- Bing, Baidu, Google, Duckduckgo
|
||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
|
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
|
||||||
- add proxy support
|
- add proxy support
|
||||||
|
@ -1 +1 @@
|
|||||||
{"scraping scrapeulous.com":{"1":{"time":"Tue, 29 Jan 2019 21:46:30 GMT","num_results":"Ungefähr 139 Ergebnisse (0,29 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/about/","title":"About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":1},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":2},{"link":"https://github.com/NikolaiT/se-scraper","title":"GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen","snippet":"24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.","visible_link":"https://github.com/NikolaiT/se-scraper","date":"24.12.2018 - ","rank":3},{"link":"https://github.com/NikolaiT/GoogleScraper/blob/master/README.md","title":"GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...","visible_link":"https://github.com/NikolaiT/GoogleScraper/blob/.../README.md","date":"","rank":4},{"link":"https://googlescraper.readthedocs.io/","title":"Welcome to GoogleScraper's documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen","snippet":"Welcome to GoogleScraper's documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...","visible_link":"https://googlescraper.readthedocs.io/","date":"","rank":5},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen","snippet":"A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":6},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen","snippet":"Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.","visible_link":"https://incolumitas.com/","date":"","rank":7},{"link":"https://en.wikipedia.org/wiki/Search_engine_scraping","title":"Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen","snippet":"Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...","visible_link":"https://en.wikipedia.org/wiki/Search_engine_scraping","date":"","rank":8},{"link":"https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/","title":"GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen","snippet":"23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.","visible_link":"https://readthedocs.org/projects/googlescraper/downloads/.../latest...","date":"23.12.2018 - ","rank":9},{"link":"https://pypi.org/project/CountryGoogleScraper/","title":"CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen","snippet":"A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.","visible_link":"https://pypi.org/project/CountryGoogleScraper/","date":"","rank":10}]}}}
|
{"trump":{"1":{"time":"Wed, 30 Jan 2019 15:03:46 GMT","num_results":"Ungefähr 1.450.000.000 Ergebnisse (0,49 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://de.wikipedia.org/wiki/Donald_Trump","title":"Donald Trump – Wikipediahttps://de.wikipedia.org/wiki/Donald_TrumpIm CacheÄhnliche Seiten","snippet":"Donald John Trump /dɒnəld d͡ʒɒn trʌmp/ (* 14. Juni 1946 in Queens, New York City, New York) ist ein amerikanischer Unternehmer, Entertainer und seit ...","visible_link":"https://de.wikipedia.org/wiki/Donald_Trump","date":"","rank":1},{"link":"https://www.merkur.de/politik/milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jagen-news-zr-11469011.html","title":"Milliardär will Trump mit unfassbarer Summe aus dem Amt jagen ...https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...Im Cache","snippet":"vor 1 Stunde - Donald Trump: Der längste Shutdown in der Geschichte der USA ist beendet. Die Rede zur Lage der Nation steht bevor und ein Milliardär fährt ...","visible_link":"https://www.merkur.de/.../milliardaer-will-trump-mit-unfassbarer-summe-aus-amt-jage...","date":"vor 1 Stunde - ","rank":2},{"link":"http://www.spiegel.de/wirtschaft/impeachment-tom-steyer-wirbt-fuer-amtsenthebung-von-donald-trump-a-1250683.html","title":"Impeachment: Tom Steyer wirbt für Amtsenthebung von Donald Trump ...www.spiegel.de › Wirtschaft › Donald Trump","snippet":"vor 5 Stunden - US-Milliardär Tom Steyer 100 Millionen Dollar, um Trump aus dem Amt zu jagen. Der Milliardär und Ex-Fondsmanager Tom Steyer ist ...","visible_link":"www.spiegel.de › Wirtschaft › Donald Trump","date":"vor 5 Stunden - ","rank":3},{"link":"http://www.spiegel.de/thema/donald_trump/","title":"Donald Trump - SPIEGEL ONLINEwww.spiegel.de › Politik › AuslandÄhnliche Seiten","snippet":"Der Unternehmer Donald Trump war schon vor seiner Bewerbung als republikanischer Präsidentschaftskandidat weltweit bekannt. Überraschend gewann der ...","visible_link":"www.spiegel.de › Politik › Ausland","date":"","rank":4},{"link":"https://www.faz.net/aktuell/politik/ausland/gefahren-fuer-amerika-geheimdienste-widersprechen-trump-16015734.html","title":"Gefahren für Amerika: Geheimdienste widersprechen Trump - Fazhttps://www.faz.net › Politik › Ausland","snippet":"vor 1 Stunde - Nordkorea rüstet ab, Iran auf und der „IS“ ist besiegt – so sieht es Donald Trump. Ein Bericht der amerikanischen Geheimdienste über ...","visible_link":"https://www.faz.net › Politik › Ausland","date":"vor 1 Stunde - ","rank":5},{"link":"https://www.faz.net/aktuell/politik/thema/donald-trump","title":"Donald Trump: Aktuelle News der FAZ zum US-Präsidentenhttps://www.faz.net/aktuell/politik/thema/donald-trump","snippet":"Donald Trump ist der 45. US-Präsident. ▷ Lesen Sie hier alle Nachrichten der FAZ rund um die Politik und Entscheidungen des Republikaners.","visible_link":"https://www.faz.net/aktuell/politik/thema/donald-trump","date":"","rank":6},{"link":"https://www.donaldjtrump.com/","title":"Donald J. Trump for President: Homehttps://www.donaldjtrump.com/Im CacheÄhnliche SeitenDiese Seite übersetzen","snippet":"Help continue our promise to Make America Great Again!","visible_link":"https://www.donaldjtrump.com/","date":"","rank":7},{"link":"https://www.zeit.de/thema/donald-trump","title":"Donald Trump: Präsident der USA | ZEIT ONLINE - Die Zeithttps://www.zeit.de › Politik","snippet":"Importzölle, Atomabkommen, Einreiseverbot: Donald Trump sorgt innen- und außenpolitisch für Schlagzeilen. Hier lesen Sie Nachrichten und Analysen zum ...","visible_link":"https://www.zeit.de › Politik","date":"","rank":8}]}}}
|
8
run.js
8
run.js
@ -15,18 +15,18 @@ let config = {
|
|||||||
search_engine: 'google',
|
search_engine: 'google',
|
||||||
// whether debug information should be printed
|
// whether debug information should be printed
|
||||||
// debug info is useful for developers when debugging
|
// debug info is useful for developers when debugging
|
||||||
debug: false,
|
debug: true,
|
||||||
// whether verbose program output should be printed
|
// whether verbose program output should be printed
|
||||||
// this output is informational
|
// this output is informational
|
||||||
verbose: false,
|
verbose: true,
|
||||||
// an array of keywords to scrape
|
// an array of keywords to scrape
|
||||||
keywords: ['scraping scrapeulous.com'],
|
keywords: ['trump', ],
|
||||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||||
keyword_file: '',
|
keyword_file: '',
|
||||||
// the number of pages to scrape for each keyword
|
// the number of pages to scrape for each keyword
|
||||||
num_pages: 1,
|
num_pages: 1,
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: true,
|
headless: false,
|
||||||
// path to output file, data will be stored in JSON
|
// path to output file, data will be stored in JSON
|
||||||
output_file: 'data.json',
|
output_file: 'data.json',
|
||||||
// whether to prevent images, css, fonts from being loaded
|
// whether to prevent images, css, fonts from being loaded
|
||||||
|
@ -1,78 +1,9 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
module.exports = {
|
class BingScraper extends Scraper {
|
||||||
scrape_bing_pup: scrape_bing_pup,
|
|
||||||
scrape_bing_news_pup: scrape_bing_news_pup,
|
|
||||||
};
|
|
||||||
|
|
||||||
async function scrape_bing_pup(page, event, context, pluggable) {
|
parse(html) {
|
||||||
await page.goto('https://www.bing.com/');
|
|
||||||
|
|
||||||
try {
|
|
||||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
|
||||||
} catch (e) {
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
let keywords = event.keywords;
|
|
||||||
var results = {};
|
|
||||||
|
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
|
||||||
|
|
||||||
keyword = keywords[i];
|
|
||||||
results[keyword] = {};
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const input = await page.$('input[name="q"]');
|
|
||||||
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
|
|
||||||
await sfunctions.sleep(50);
|
|
||||||
await input.focus();
|
|
||||||
await page.keyboard.press("Enter");
|
|
||||||
|
|
||||||
let page_num = 1;
|
|
||||||
|
|
||||||
do {
|
|
||||||
if (event.verbose === true) {
|
|
||||||
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
|
|
||||||
}
|
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
|
||||||
await page.waitForSelector('#b_content', { timeout: 5000 });
|
|
||||||
await sfunctions.sleep(500);
|
|
||||||
let html = await page.content();
|
|
||||||
results[keyword][page_num] = parse(html);
|
|
||||||
|
|
||||||
page_num += 1;
|
|
||||||
|
|
||||||
let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
|
|
||||||
if (!next_page_link) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
await next_page_link.click();
|
|
||||||
await page.waitForNavigation();
|
|
||||||
|
|
||||||
} while (page_num <= event.num_pages)
|
|
||||||
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
function parse(html) {
|
|
||||||
// load the page source into cheerio
|
// load the page source into cheerio
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
@ -87,7 +18,7 @@ function parse(html) {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
let no_results = this.no_results(
|
||||||
['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
|
['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
|
||||||
$('#b_results').text()
|
$('#b_results').text()
|
||||||
);
|
);
|
||||||
@ -110,69 +41,51 @@ function parse(html) {
|
|||||||
num_results: $('#b_content .sb_count').text(),
|
num_results: $('#b_content .sb_count').text(),
|
||||||
results: cleaned,
|
results: cleaned,
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
async function scrape_bing_news_pup(page, event, context, pluggable) {
|
|
||||||
await page.goto('https://www.bing.com/news/search?');
|
|
||||||
|
|
||||||
if (event.set_manual_settings === true) {
|
|
||||||
console.log('Sleeping 30 seconds. Set your settings now.');
|
|
||||||
await sfunctions.sleep(30000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async load_start_page() {
|
||||||
try {
|
try {
|
||||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
await this.page.goto('https://www.bing.com/');
|
||||||
|
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return results;
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
let keywords = event.keywords;
|
async search_keyword(keyword) {
|
||||||
var results = {};
|
const input = await this.page.$('input[name="q"]');
|
||||||
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
await this.sleep(50);
|
||||||
|
|
||||||
keyword = keywords[i];
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const input = await page.$('input[name="q"]');
|
|
||||||
// overwrites last text in input
|
|
||||||
await input.click({ clickCount: 3 });
|
|
||||||
await input.type(keyword);
|
|
||||||
await input.focus();
|
await input.focus();
|
||||||
await page.keyboard.press("Enter");
|
await this.page.keyboard.press("Enter");
|
||||||
|
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.waitForSelector('#news', { timeout: 5000 });
|
async next_page() {
|
||||||
await sfunctions.sleep(2000);
|
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||||
|
if (!next_page_link) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
await next_page_link.click();
|
||||||
|
await this.page.waitForNavigation();
|
||||||
|
|
||||||
if (event.debug === true && event.is_local === true) {
|
return true;
|
||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let html = await page.content();
|
async wait_for_results() {
|
||||||
results[keyword] = parse_bing_news(html);
|
await this.page.waitForSelector('#b_content', { timeout: 5000 });
|
||||||
|
await this.sleep(500);
|
||||||
} catch (e) {
|
|
||||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
async detected() {
|
||||||
|
// TODO: I was actually never detected by bing. those are good guys.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function parse_bing_news(html) {
|
|
||||||
|
class BingNewsScraper extends Scraper {
|
||||||
|
|
||||||
|
parse(html) {
|
||||||
// load the page source into cheerio
|
// load the page source into cheerio
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
@ -200,4 +113,52 @@ function parse_bing_news(html) {
|
|||||||
time: (new Date()).toUTCString(),
|
time: (new Date()).toUTCString(),
|
||||||
results: cleaned,
|
results: cleaned,
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async load_start_page() {
|
||||||
|
try {
|
||||||
|
await this.page.goto('https://www.bing.com/news/search?');
|
||||||
|
if (this.config.set_manual_settings === true) {
|
||||||
|
console.log('Sleeping 30 seconds. Set your settings now.');
|
||||||
|
await this.sleep(30000);
|
||||||
|
}
|
||||||
|
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[name="q"]');
|
||||||
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
|
await this.sleep(50);
|
||||||
|
await input.focus();
|
||||||
|
await this.page.keyboard.press("Enter");
|
||||||
|
}
|
||||||
|
|
||||||
|
async next_page() {
|
||||||
|
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||||
|
if (!next_page_link) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
await next_page_link.click();
|
||||||
|
await this.page.waitForNavigation();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
await this.page.waitForSelector('#news', { timeout: 5000 });
|
||||||
|
await this.sleep(2000);
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
// TODO: I was actually never detected by bing news.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
BingNewsScraper: BingNewsScraper,
|
||||||
|
BingScraper: BingScraper,
|
||||||
|
};
|
@ -1,104 +1,98 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const sfunctions = require('./functions.js');
|
||||||
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
module.exports = {
|
class GoogleScraper extends Scraper {
|
||||||
scrape_google_news_old_pup: scrape_google_news_old_pup,
|
|
||||||
scrape_google_pup: scrape_google_pup,
|
|
||||||
scrape_google_image_pup: scrape_google_image_pup,
|
|
||||||
scrape_google_news_pup: scrape_google_news_pup,
|
|
||||||
scrape_google_pup_dr: scrape_google_pup_dr,
|
|
||||||
};
|
|
||||||
|
|
||||||
const STANDARD_TIMEOUT = 8000;
|
parse(html) {
|
||||||
const SOLVE_CAPTCHA_TIME = 45000;
|
// load the page source into cheerio
|
||||||
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
async function scrape_google_pup(page, event, context, pluggable) {
|
// perform queries
|
||||||
await page.goto('https://www.google.com/');
|
const results = [];
|
||||||
|
$('#center_col .g').each((i, link) => {
|
||||||
try {
|
results.push({
|
||||||
await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
|
link: $(link).find('.r a').attr('href'),
|
||||||
} catch (e) {
|
title: $(link).find('.r a').text(),
|
||||||
return results;
|
snippet: $(link).find('span.st').text(),
|
||||||
}
|
visible_link: $(link).find('.r cite').text(),
|
||||||
|
date: $(link).find('span.f').text() || '',
|
||||||
let keywords = event.keywords;
|
})
|
||||||
var results = {};
|
|
||||||
|
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
|
||||||
keyword = keywords[i];
|
|
||||||
results[keyword] = {};
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let no_results = sfunctions.no_results(
|
||||||
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
|
'No results found for', 'Ergebnisse für', 'Showing results for'],
|
||||||
|
$('#main').text()
|
||||||
|
);
|
||||||
|
|
||||||
|
let effective_query = $('#fprsl').text() || '';
|
||||||
|
if (!effective_query) {
|
||||||
|
effective_query = $('#fprs a').text()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const cleaned = [];
|
||||||
|
for (var i=0; i < results.length; i++) {
|
||||||
|
let res = results[i];
|
||||||
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
|
res.rank = i+1;
|
||||||
|
cleaned.push(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
time: (new Date()).toUTCString(),
|
||||||
|
num_results: $('#resultStats').text(),
|
||||||
|
no_results: no_results,
|
||||||
|
effective_query: effective_query,
|
||||||
|
results: cleaned
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async load_start_page() {
|
||||||
|
await this.page.goto('https://www.google.com/');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
const input = await page.$('input[name="q"]');
|
return true;
|
||||||
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
|
}
|
||||||
await sfunctions.sleep(50);
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[name="q"]');
|
||||||
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
|
await this.sleep(50);
|
||||||
await input.focus();
|
await input.focus();
|
||||||
await page.keyboard.press("Enter");
|
await this.page.keyboard.press("Enter");
|
||||||
|
|
||||||
let page_num = 1;
|
|
||||||
|
|
||||||
do {
|
|
||||||
if (event.verbose === true) {
|
|
||||||
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
|
|
||||||
}
|
}
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
|
||||||
await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
|
|
||||||
await sfunctions.sleep(500);
|
|
||||||
let html = await page.content();
|
|
||||||
results[keyword][page_num] = parse_google_results(html);
|
|
||||||
|
|
||||||
page_num += 1;
|
async next_page() {
|
||||||
|
let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
|
||||||
let next_page_link = await page.$('#pnnext', {timeout: 1000});
|
|
||||||
if (!next_page_link) {
|
if (!next_page_link) {
|
||||||
break;
|
return false;
|
||||||
}
|
}
|
||||||
await next_page_link.click();
|
await next_page_link.click();
|
||||||
await page.waitForNavigation();
|
await this.page.waitForNavigation();
|
||||||
|
|
||||||
} while (page_num <= event.num_pages)
|
return true;
|
||||||
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`Problem with scraping ${keyword}.`);
|
|
||||||
console.error(e);
|
|
||||||
|
|
||||||
if (await scraping_detected(page) === true) {
|
|
||||||
console.error('Google detected the scraping. Aborting.');
|
|
||||||
|
|
||||||
if (event.is_local === true) {
|
|
||||||
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
|
|
||||||
console.error('You have 45 seconds to enter the captcha.');
|
|
||||||
// expect that user filled out necessary captcha
|
|
||||||
} else {
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// some other error, quit scraping process if stuff is broken
|
|
||||||
if (event.is_local === true) {
|
|
||||||
console.error('You have 30 seconds to fix this.');
|
|
||||||
await sfunctions.sleep(30000);
|
|
||||||
} else {
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
async wait_for_results() {
|
||||||
|
await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
|
||||||
|
await this.sleep(500);
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
const title = await this.page.title();
|
||||||
|
let html = await this.page.content();
|
||||||
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async function scrape_google_pup_dr(page, event, context, pluggable) {
|
async function scrape_google_pup_dr(page, event, context, pluggable) {
|
||||||
let keywords = event.keywords;
|
let keywords = event.keywords;
|
||||||
first = keywords[0];
|
first = keywords[0];
|
||||||
@ -651,3 +645,12 @@ function parse_google_news_results(html) {
|
|||||||
effective_query: effective_query,
|
effective_query: effective_query,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
scrape_google_news_old_pup: scrape_google_news_old_pup,
|
||||||
|
GoogleScraper: GoogleScraper,
|
||||||
|
scrape_google_image_pup: scrape_google_image_pup,
|
||||||
|
scrape_google_news_pup: scrape_google_news_pup,
|
||||||
|
scrape_google_pup_dr: scrape_google_pup_dr,
|
||||||
|
};
|
@ -3,34 +3,214 @@ const start_url = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Get useful JS knowledge and get awesome...
|
||||||
|
|
||||||
Read this shit: https://javascript.info/class-inheritance
|
Read this shit: https://javascript.info/class-inheritance
|
||||||
|
And this: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e
|
||||||
*/
|
*/
|
||||||
|
|
||||||
module.exports = class Scraper {
|
module.exports = class Scraper {
|
||||||
constructor(options = {}) {
|
constructor(options = {}) {
|
||||||
const {
|
const {
|
||||||
searchEngine = 'google',
|
browser = null,
|
||||||
numPages = 1,
|
config = {},
|
||||||
|
context = {},
|
||||||
pluggable = null,
|
pluggable = null,
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
this.pluggable = pluggable;
|
this.pluggable = pluggable;
|
||||||
this.searchEngine = searchEngine;
|
this.browser = browser;
|
||||||
this.numPages = numPages;
|
this.config = config;
|
||||||
this.results = {}
|
this.context = context;
|
||||||
|
|
||||||
|
this.STANDARD_TIMEOUT = 8000;
|
||||||
|
// longer timeout when using proxies
|
||||||
|
this.PROXY_TIMEOUT = 15000;
|
||||||
|
this.SOLVE_CAPTCHA_TIME = 45000;
|
||||||
|
|
||||||
|
this.results = {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async run() {
|
||||||
|
|
||||||
|
let do_continue = await this.load_search_engine();
|
||||||
|
|
||||||
|
if (!do_continue) {
|
||||||
|
console.error('Failed to load the search engine: load_search_engine()');
|
||||||
|
return this.results;
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.scraping_loop();
|
||||||
|
|
||||||
|
return this.results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Action that runs only once in the beginning of the
|
||||||
|
* scraping procedure.
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>} true if everything is correct.
|
||||||
|
*/
|
||||||
async load_search_engine() {
|
async load_search_engine() {
|
||||||
|
|
||||||
|
this.page = await this.browser.newPage();
|
||||||
|
|
||||||
|
// block some assets to speed up scraping
|
||||||
|
if (this.config.block_assets === true) {
|
||||||
|
await this.page.setRequestInterception(true);
|
||||||
|
this.page.on('request', (req) => {
|
||||||
|
let type = req.resourceType();
|
||||||
|
const block = ['stylesheet', 'font', 'image', 'media'];
|
||||||
|
if (block.includes(type)) {
|
||||||
|
req.abort();
|
||||||
|
} else {
|
||||||
|
req.continue();
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async search_keyword() {
|
return await this.load_start_page();
|
||||||
}
|
}
|
||||||
|
|
||||||
parse() {
|
/**
|
||||||
|
* Each scraper basically iterates over a list of
|
||||||
|
* keywords and a list of pages. This is the generic
|
||||||
|
* method for that.
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
|
async scraping_loop() {
|
||||||
|
|
||||||
|
for (let keyword of this.config.keywords) {
|
||||||
|
|
||||||
|
this.results[keyword] = {};
|
||||||
|
|
||||||
|
if (this.pluggable.before_keyword_scraped) {
|
||||||
|
await this.pluggable.before_keyword_scraped({
|
||||||
|
keyword: keyword,
|
||||||
|
page: this.page,
|
||||||
|
event: this.config,
|
||||||
|
context: this.context,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let page_num = 1;
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
await this.search_keyword(keyword);
|
||||||
|
|
||||||
|
do {
|
||||||
|
|
||||||
|
if (this.config.verbose === true) {
|
||||||
|
console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.wait_for_results();
|
||||||
|
|
||||||
|
if (event.sleep_range) {
|
||||||
|
await this.random_sleep();
|
||||||
|
}
|
||||||
|
|
||||||
|
let html = await this.page.content();
|
||||||
|
this.results[keyword][page_num] = this.parse(html);
|
||||||
|
|
||||||
|
page_num += 1;
|
||||||
|
|
||||||
|
if (await this.next_page() === false) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (page_num < event.num_pages);
|
||||||
|
|
||||||
|
} catch (e) {
|
||||||
|
|
||||||
|
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
|
||||||
|
|
||||||
|
if (await this.detected() === true) {
|
||||||
|
console.error(`${this.config.search_engine} DETECTED the scraping!`);
|
||||||
|
|
||||||
|
if (this.config.is_local === true) {
|
||||||
|
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
||||||
|
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
|
||||||
|
// expect that user filled out necessary captcha
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// some other error, quit scraping process if stuff is broken
|
||||||
|
if (this.config.is_local === true) {
|
||||||
|
console.error('You have 30 seconds to fix this.');
|
||||||
|
await this.sleep(30000);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(ms) {
|
||||||
|
return new Promise(resolve => {
|
||||||
|
setTimeout(resolve, ms)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async random_sleep() {
|
||||||
|
const [min, max] = this.config.sleep_range;
|
||||||
|
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
|
||||||
|
if (this.config.debug === true) {
|
||||||
|
console.log(`Sleeping for ${rand}s`);
|
||||||
|
}
|
||||||
|
await this.sleep(rand * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
async set_input_value(selector, value) {
|
||||||
|
await this.page.waitFor(selector);
|
||||||
|
await this.page.evaluate((value, selector) => {
|
||||||
|
return document.querySelector(selector).value = value;
|
||||||
|
}, value, selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
no_results(needles, html) {
|
||||||
|
return !needles.map((needle) => { return html.indexOf(needle)})
|
||||||
|
.every((res) => { return res == -1});
|
||||||
|
}
|
||||||
|
|
||||||
|
parse(html) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns true if startpage was loaded correctly.
|
||||||
|
*/
|
||||||
|
async load_start_page() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Searches the keyword by inputting it into the form and hitting enter
|
||||||
|
* or something similar.
|
||||||
|
*
|
||||||
|
* @param keyword
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns true if the next page was loaded correctely
|
||||||
|
*/
|
||||||
async next_page() {
|
async next_page() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async detected() {
|
async detected() {
|
||||||
|
@ -129,29 +129,13 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const page = await browser.newPage();
|
Scraper = {
|
||||||
|
google: google.GoogleScraper,
|
||||||
// block some assets to speed up scraping
|
|
||||||
if (config.block_assets === true) {
|
|
||||||
await page.setRequestInterception(true);
|
|
||||||
page.on('request', (req) => {
|
|
||||||
let type = req.resourceType();
|
|
||||||
const block = ['stylesheet', 'font', 'image', 'media'];
|
|
||||||
if (block.includes(type)) {
|
|
||||||
req.abort();
|
|
||||||
} else {
|
|
||||||
req.continue();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
results = await {
|
|
||||||
google: google.scrape_google_pup,
|
|
||||||
google_news_old: google.scrape_google_news_old_pup,
|
google_news_old: google.scrape_google_news_old_pup,
|
||||||
google_news: google.scrape_google_news_pup,
|
google_news: google.scrape_google_news_pup,
|
||||||
google_image: google.scrape_google_image_pup,
|
google_image: google.scrape_google_image_pup,
|
||||||
bing: bing.scrape_bing_pup,
|
bing: bing.BingScraper,
|
||||||
bing_news: bing.scrape_bing_news_pup,
|
bing_news: bing.BingNewsScraper,
|
||||||
infospace: infospace.scrape_infospace_pup,
|
infospace: infospace.scrape_infospace_pup,
|
||||||
webcrawler: infospace.scrape_webcrawler_news_pup,
|
webcrawler: infospace.scrape_webcrawler_news_pup,
|
||||||
baidu: baidu.scrape_baidu_pup,
|
baidu: baidu.scrape_baidu_pup,
|
||||||
@ -163,7 +147,16 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
reuters: tickersearch.scrape_reuters_finance_pup,
|
reuters: tickersearch.scrape_reuters_finance_pup,
|
||||||
cnbc: tickersearch.scrape_cnbc_finance_pup,
|
cnbc: tickersearch.scrape_cnbc_finance_pup,
|
||||||
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
||||||
}[config.search_engine](page, config, context, pluggable);
|
}[config.search_engine];
|
||||||
|
|
||||||
|
let scraper = new Scraper({
|
||||||
|
browser: browser,
|
||||||
|
config: config,
|
||||||
|
context: context,
|
||||||
|
pluggable: pluggable,
|
||||||
|
});
|
||||||
|
|
||||||
|
let results = await scraper.run();
|
||||||
|
|
||||||
|
|
||||||
if (pluggable.close_browser) {
|
if (pluggable.close_browser) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user