mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-03-13 12:38:16 +01:00
1378 lines
44 KiB
JavaScript
1378 lines
44 KiB
JavaScript
'use strict';
|
|
|
|
const cheerio = require('cheerio');
|
|
const Scraper = require('./se_scraper');
|
|
|
|
class GoogleScraper extends Scraper {
|
|
|
|
constructor(...args) {
|
|
super(...args);
|
|
}
|
|
|
|
async parse_async(html) {
|
|
|
|
const results = await this.page.evaluate(() => {
|
|
|
|
let _text = (el, s, onlyFirstTextNode) => {
|
|
let n = el.querySelector(s);
|
|
|
|
if (n) {
|
|
return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
|
|
} else {
|
|
return;
|
|
}
|
|
};
|
|
|
|
let _attr = (el, s, attr) => {
|
|
let n = el.querySelector(s);
|
|
|
|
if (n) {
|
|
return n.getAttribute(attr);
|
|
} else {
|
|
return;
|
|
}
|
|
};
|
|
|
|
let results = {
|
|
num_results: '',
|
|
no_results: false,
|
|
effective_query: '',
|
|
right_info: {},
|
|
results: [],
|
|
top_products: [],
|
|
right_products: [],
|
|
top_ads: [],
|
|
bottom_ads: [],
|
|
places: [],
|
|
};
|
|
|
|
let num_results_el = document.getElementById('resultStats');
|
|
|
|
if (num_results_el) {
|
|
results.num_results = num_results_el.innerText;
|
|
}
|
|
|
|
let organic_results = document.querySelectorAll('#center_col .g');
|
|
|
|
organic_results.forEach((el) => {
|
|
|
|
let serp_obj = {
|
|
link: _attr(el, '.r a', 'href'),
|
|
title: _text(el, '.r a h3'),
|
|
snippet: _text(el, 'span.st'),
|
|
visible_link: _text(el, '.r cite'),
|
|
date: _text(el, 'span.f'),
|
|
};
|
|
|
|
if (serp_obj.date) {
|
|
serp_obj.date = serp_obj.date.replace(' - ', '');
|
|
}
|
|
|
|
results.results.push(serp_obj);
|
|
});
|
|
|
|
// check if no results
|
|
results.no_results = (results.results.length === 0);
|
|
|
|
let parseAds = (container, selector) => {
|
|
document.querySelectorAll(selector).forEach((el) => {
|
|
let ad_obj = {
|
|
visible_link: _text(el, '.ads-visurl cite'),
|
|
tracking_link: _attr(el, 'a:first-child', 'href'),
|
|
link: _attr(el, 'a:nth-child(2)', 'href'),
|
|
title: _text(el, 'a h3'),
|
|
snippet: _text(el, '.ads-creative'),
|
|
links: [],
|
|
};
|
|
el.querySelectorAll('ul li a').forEach((node) => {
|
|
ad_obj.links.push({
|
|
tracking_link: node.getAttribute('data-arwt'),
|
|
link: node.getAttribute('href'),
|
|
title: node.innerText,
|
|
})
|
|
});
|
|
container.push(ad_obj);
|
|
});
|
|
};
|
|
|
|
parseAds(results.top_ads, '#tads .ads-ad');
|
|
parseAds(results.bottom_ads, '#tadsb .ads-ad');
|
|
|
|
// parse google places
|
|
document.querySelectorAll('.rllt__link').forEach((el) => {
|
|
results.places.push({
|
|
heading: _text(el, '[role="heading"] span'),
|
|
rating: _text(el, '.rllt__details div:first-child'),
|
|
contact: _text(el, '.rllt__details div:nth-child(2)'),
|
|
hours: _text(el, '.rllt__details div:nth-child(3)'),
|
|
})
|
|
});
|
|
|
|
// parse right side product information
|
|
results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');
|
|
|
|
let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
|
|
if (title_el) {
|
|
results.right_info.title = title_el.innerText;
|
|
}
|
|
|
|
let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
|
|
if (num_reviews_el) {
|
|
results.right_info.num_reviews = num_reviews_el.innerText;
|
|
}
|
|
|
|
results.right_info.vendors = [];
|
|
results.right_info.info = _text(document, '#rhs_block > div > div > div > div:nth-child(5) > div > div');
|
|
|
|
document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
|
|
results.right_info.vendors.push({
|
|
price: _text(el, 'span:nth-of-type(1)'),
|
|
merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
|
|
merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
|
|
merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore
|
|
source_name: _text(el, 'span:nth-child(4) a'),
|
|
source_link: _attr(el, 'span:nth-child(4) a', 'href'),
|
|
info: _text(el, '.SdBHnc.e2CF7c'),
|
|
shipping: _text(el, '.JfwJme'),
|
|
})
|
|
});
|
|
|
|
let right_side_info_el = document.getElementById('rhs');
|
|
|
|
if (right_side_info_el) {
|
|
let right_side_info_text = right_side_info_el.innerText;
|
|
|
|
if (right_side_info_text && right_side_info_text.length > 0) {
|
|
results.right_side_info_text = right_side_info_text;
|
|
}
|
|
}
|
|
|
|
// Parse Google Shopping top or left
|
|
document.querySelectorAll('.pla-unit').forEach((el) => {
|
|
let top_product = {
|
|
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
|
|
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
|
|
title: _text(el, '.pla-unit-title a:nth-child(2) span'),
|
|
price: _text(el, '.pla-unit-title + div', true),
|
|
originalPrice: _text(el, '.pla-unit-title + div > span'),
|
|
shipping: _text(el, '.pla-extensions-container .cYBBsb'),
|
|
vendor_link: _attr(el,'.pla-extensions-container a.FfKHB', 'href'),
|
|
merchant_name: _text(el,'.LbUacb span:nth-child(1)'),
|
|
};
|
|
|
|
results.top_products.push(top_product);
|
|
});
|
|
|
|
// parse top right product information
|
|
// #tvcap .pla-unit
|
|
document.querySelectorAll('#rhs_block .pla-unit').forEach((el) => {
|
|
let right_product = {
|
|
tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
|
|
link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
|
|
title: _text(el, '.pla-unit-title a:nth-child(2) span:first-child'),
|
|
price: _text(el,'.pla-unit-title + div'),
|
|
shipping: _text(el,'.pla-extensions-container > div'),
|
|
vendor_link: _text(el,'.pla-extensions-container div > a'),
|
|
vendor_name: _text(el,'.pla-extensions-container div > a > div'),
|
|
};
|
|
|
|
let merchant_node = el.querySelector('.pla-unit-title');
|
|
if (merchant_node) {
|
|
let node = merchant_node.parentNode.querySelector('div > span:first-child');
|
|
if (node) {
|
|
right_product.merchant_name = node.innerText;
|
|
}
|
|
}
|
|
|
|
results.right_products.push(right_product);
|
|
});
|
|
|
|
let effective_query_el = document.getElementById('fprsl');
|
|
|
|
if (effective_query_el) {
|
|
|
|
results.effective_query = effective_query_el.innerText;
|
|
if (!results.effective_query) {
|
|
let effective_query_el2 = document.querySelector('#fprs a');
|
|
if (effective_query_el2) {
|
|
results.effective_query = document.querySelector('#fprs a').innerText;
|
|
}
|
|
}
|
|
}
|
|
|
|
return results;
|
|
});
|
|
|
|
// clean some results
|
|
results.top_products = this.clean_results(results.top_products, ['title', 'link']);
|
|
results.right_products = this.clean_results(results.right_products, ['title', 'link']);
|
|
results.results = this.clean_results(results.results, ['title', 'link' , 'snippet']);
|
|
|
|
results.time = (new Date()).toUTCString();
|
|
return results;
|
|
}
|
|
|
|
async load_start_page() {
|
|
let startUrl = 'https://www.google.com';
|
|
|
|
if (this.config.google_settings) {
|
|
startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
|
|
if (this.config.google_settings.google_domain) {
|
|
startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
|
|
} else {
|
|
startUrl = `https://www.google.com/search?`;
|
|
}
|
|
|
|
for (var key in this.config.google_settings) {
|
|
if (key !== 'google_domain') {
|
|
startUrl += `${key}=${this.config.google_settings[key]}&`
|
|
}
|
|
}
|
|
}
|
|
|
|
this.logger.info('Using startUrl: ' + startUrl);
|
|
|
|
this.last_response = await this.page.goto(startUrl);
|
|
|
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
const input = await this.page.$('input[name="q"]');
|
|
await this.set_input_value(`input[name="q"]`, keyword);
|
|
await this.sleep(50);
|
|
await input.focus();
|
|
await this.page.keyboard.press("Enter");
|
|
}
|
|
|
|
async next_page() {
|
|
let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
|
|
if (!next_page_link) {
|
|
return false;
|
|
}
|
|
await next_page_link.click();
|
|
|
|
return true;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
class GoogleNewsOldScraper extends Scraper {
|
|
|
|
parse(html) {
|
|
const $ = cheerio.load(html);
|
|
// perform queries
|
|
const results = [];
|
|
|
|
$('g-card').each((i, result) => {
|
|
results.push({
|
|
link: $(result).find('a').attr('href'),
|
|
title: $(result).find('a div div:nth-child(2) div:nth-child(2)').text(),
|
|
snippet: $(result).find('a div div:nth-child(2) div:nth-child(3) div:nth-child(1)').text(),
|
|
date: $(result).find('a div div:nth-child(2) div:nth-child(3) div:nth-child(2)').text(),
|
|
})
|
|
});
|
|
|
|
let no_results = this.no_results(
|
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
|
'No results found for', 'did not match any news results'],
|
|
$('#main').text()
|
|
);
|
|
|
|
let effective_query = $('#fprsl').text() || '';
|
|
if (!effective_query) {
|
|
effective_query = $('#fprs a').text()
|
|
}
|
|
|
|
const cleaned = this.clean_results(results, ['link']);
|
|
|
|
return {
|
|
time: (new Date()).toUTCString(),
|
|
results: cleaned,
|
|
no_results: no_results,
|
|
effective_query: effective_query,
|
|
}
|
|
}
|
|
|
|
async load_start_page() {
|
|
let startUrl = this.build_start_url('https://www.google.com/search?source=lnms&tbm=nws&') || 'https://www.google.com/search?source=lnms&tbm=nws';
|
|
await this.page.goto(startUrl);
|
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
|
|
let url = this.build_start_url(`https://www.google.com/search?q=${keyword}&source=lnms&tbm=nws&`) ||
|
|
`https://www.google.com/search?q=${keyword}&hl=en&source=lnms&tbm=nws`;
|
|
|
|
this.last_response = await this.page.goto(url, {
|
|
referer: 'https://www.google.com/'
|
|
});
|
|
|
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
}
|
|
|
|
async next_page() {
|
|
let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
|
|
if (!next_page_link) {
|
|
return false;
|
|
}
|
|
await next_page_link.click();
|
|
|
|
return true;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector('#rso', { timeout: this.STANDARD_TIMEOUT });
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
|
|
class GoogleImageScraper extends Scraper {
|
|
|
|
parse(html) {
|
|
// load the page source into cheerio
|
|
const $ = cheerio.load(html);
|
|
|
|
// perform queries
|
|
const results = [];
|
|
$('.rg_bx').each((i, link) => {
|
|
let link_element = $(link).find('a.rg_l').attr('href');
|
|
let clean_link = clean_image_url(link_element);
|
|
results.push({
|
|
link: link_element,
|
|
clean_link: clean_link,
|
|
snippet: $(link).find('.a-no-hover-decoration').text(),
|
|
})
|
|
});
|
|
|
|
let no_results = this.no_results(
|
|
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',],
|
|
$('#main').text()
|
|
);
|
|
|
|
let effective_query = $('#fprsl').text() || '';
|
|
if (!effective_query) {
|
|
effective_query = $('#fprs a').text();
|
|
}
|
|
|
|
const cleaned = this.clean_results(results, ['link']);
|
|
|
|
return {
|
|
time: (new Date()).toUTCString(),
|
|
no_results: no_results,
|
|
results: cleaned,
|
|
effective_query: effective_query
|
|
}
|
|
}
|
|
|
|
async load_start_page() {
|
|
try {
|
|
this.last_response = await this.page.goto(`https://www.google.com/imghp?tbm=isch`, {
|
|
referer: 'https://www.google.com/'
|
|
});
|
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
} catch (e) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
const input = await this.page.$('input[name="q"]');
|
|
await this.set_input_value(`input[name="q"]`, keyword);
|
|
await this.sleep(50);
|
|
await input.focus();
|
|
await this.page.keyboard.press("Enter");
|
|
// this waitForNavigation makes hardcoded sleeps not necessary
|
|
this.last_response = await this.page.waitForNavigation();
|
|
}
|
|
|
|
async next_page() {
|
|
return false;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector('.rg_bx .a-no-hover-decoration div', {timeout: this.STANDARD_TIMEOUT});
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
|
|
class GoogleNewsScraper extends Scraper {
|
|
|
|
parse(html) {
|
|
const $ = cheerio.load(html);
|
|
// perform queries
|
|
const results = [];
|
|
|
|
$('article > h3').each((i, headline) => {
|
|
|
|
let title = $(headline).find('a').text();
|
|
|
|
try {
|
|
var snippet = $(headline).parent().find('p').text();
|
|
var link = $(headline).find('a').attr('href');
|
|
var date = $(headline).parent().parent().parent().find('time').text();
|
|
var ts = $(headline).parent().parent().parent().find('time').attr('datetime');
|
|
} catch(e) {
|
|
|
|
}
|
|
|
|
if (!this.all_results.has(title)) {
|
|
results.push({
|
|
rank: i+1,
|
|
title: title,
|
|
snippet: snippet,
|
|
link: link,
|
|
date: date,
|
|
ts: ts,
|
|
});
|
|
}
|
|
this.all_results.add(title);
|
|
});
|
|
|
|
let no_results = this.no_results(
|
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
|
'No results found for', 'did not match any news results'],
|
|
$('body').text()
|
|
);
|
|
|
|
let effective_query = $('#fprsl').text() || '';
|
|
|
|
const cleaned = this.clean_results(results, ['title',]);
|
|
|
|
return {
|
|
time: (new Date()).toUTCString(),
|
|
results: cleaned,
|
|
no_results: no_results,
|
|
effective_query: effective_query,
|
|
}
|
|
}
|
|
|
|
async load_start_page() {
|
|
try {
|
|
this.all_results = new Set();
|
|
this.last_response = await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
|
|
referer: 'https://news.google.com'
|
|
});
|
|
|
|
await this.page.waitForSelector('div input:nth-child(2)', {timeout: this.STANDARD_TIMEOUT});
|
|
await this.sleep(1000);
|
|
|
|
// parse here front page results
|
|
let html = await this.page.content();
|
|
this.results['frontpage'] = this.parse(html);
|
|
this.result_rank = 1;
|
|
} catch(e) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
await this.page.waitForSelector('div input:nth-child(2)', { timeout: this.STANDARD_TIMEOUT });
|
|
const input = await this.page.$('div input:nth-child(2)');
|
|
// overwrites last text in input
|
|
await input.click({ clickCount: 3 });
|
|
await input.type(keyword);
|
|
await this.sleep(50);
|
|
await input.focus();
|
|
await this.page.keyboard.press("Enter");
|
|
}
|
|
|
|
async next_page() {
|
|
// google news app does not have next pages
|
|
return false;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector(`[data-n-q="${this.keyword}"]`, { timeout: this.STANDARD_TIMEOUT });
|
|
await this.sleep(1000);
|
|
// TODO: fix googlenewsscraper
|
|
// let nodes = await this.page.evaluate(() => {
|
|
// var res = [];
|
|
// document.querySelectorAll('article > h3').forEach((node) => {
|
|
// try {
|
|
// let title = node.querySelector('a span').innerHTML;
|
|
// var snippet = node.querySelector('p').innerHTML;
|
|
// var link = node.querySelector('a').getAttribute('href');
|
|
// res.push({
|
|
// title: tile,
|
|
// snippet: snippet,
|
|
// link: link
|
|
// });
|
|
// } catch(e) {
|
|
// }
|
|
// return res;
|
|
// });
|
|
// });
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
|
|
class GoogleMapsScraper extends Scraper {
|
|
|
|
constructor(...args) {
|
|
super(...args);
|
|
}
|
|
|
|
async parse_async(html) {
|
|
let results = await this.page.evaluate(() => {
|
|
var res = [];
|
|
document.querySelectorAll('.section-listbox-root .section-result').forEach((node) => {
|
|
try {
|
|
let score = node.querySelector('.cards-rating-score').innerHTML;
|
|
let num_ratings = node.querySelector('.section-result-num-ratings').innerHTML;
|
|
let type = node.querySelector('.section-result-details').innerHTML;
|
|
let title = node.querySelector('.section-result-title span').innerHTML;
|
|
let location = node.querySelector('.section-result-location').innerHTML;
|
|
let opening_hours = node.querySelector('.section-result-opening-hours').innerHTML;
|
|
res.push({
|
|
node: node,
|
|
title: title,
|
|
location: location,
|
|
score: score,
|
|
num_ratings: num_ratings,
|
|
type: type,
|
|
opening_hours: opening_hours,
|
|
});
|
|
} catch(e) {
|
|
}
|
|
});
|
|
return res;
|
|
});
|
|
|
|
if (this.scrape_in_detail) {
|
|
let profiles = await this.page.$$('.section-listbox-root .section-result');
|
|
console.log(`Profiles to visit: ${profiles.length}`);
|
|
for (var profile of profiles) {
|
|
try {
|
|
let additional_info = await this.visit_profile(profile);
|
|
console.log(additional_info);
|
|
} catch(e) {
|
|
console.error(e);
|
|
}
|
|
profiles = await this.page.$$('.section-listbox-root .section-result');
|
|
}
|
|
}
|
|
|
|
return {
|
|
time: (new Date()).toUTCString(),
|
|
results: results
|
|
}
|
|
}
|
|
|
|
/*
|
|
https://stackoverflow.com/questions/55815376/puppeteer-open-a-page-get-the-data-go-back-to-the-previous-page-enter-a-new
|
|
*/
|
|
async visit_profile(profile) {
|
|
await profile.click();
|
|
await this.page.waitForFunction('document.querySelectorAll(".section-info-line .section-info-text").length > 0', {timeout: 5000});
|
|
|
|
let results = await this.page.evaluate(() => {
|
|
let res = [];
|
|
document.querySelectorAll('.section-info-line .section-info-text .widget-pane-link').forEach((node) => {
|
|
try {
|
|
let info = node.innerHTML.trim();
|
|
if (info) {
|
|
res.push(info);
|
|
}
|
|
} catch(e) {
|
|
}
|
|
});
|
|
return res;
|
|
});
|
|
|
|
let back_button = await this.page.$('.section-back-to-list-button', {timeout: 10000});
|
|
if (back_button) {
|
|
await back_button.click();
|
|
}
|
|
return results;
|
|
}
|
|
|
|
async load_start_page() {
|
|
let startUrl = 'https://www.google.com/maps';
|
|
|
|
if (this.config.google_maps_settings) {
|
|
// whether to visit each result and get all available information
|
|
// including customer reviews
|
|
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
|
|
}
|
|
|
|
this.logger.info('Using startUrl: ' + startUrl);
|
|
|
|
this.last_response = await this.page.goto(startUrl);
|
|
|
|
try {
|
|
await this.page.waitForSelector('#searchbox input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
} catch (e) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
const input = await this.page.$('#searchbox input[name="q"]');
|
|
await this.set_input_value(`#searchbox input[name="q"]`, keyword);
|
|
await this.sleep(50);
|
|
await input.focus();
|
|
await this.page.keyboard.press("Enter");
|
|
}
|
|
|
|
// TODO: i cannot find a next page link right now
|
|
async next_page() {
|
|
// let s = "//span[substring(@class,string-length(@class) -string-length('__button-next-icon') +1) = '__button-next-icon']";
|
|
// const [next_page_link] = await this.page.$x(s, {timeout: 2000});
|
|
|
|
let next_page_link = await this.page.$('[jsaction="pane.paginationSection.nextPage"] span', {timeout: 10000});
|
|
if (!next_page_link) {
|
|
return false;
|
|
}
|
|
await next_page_link.click();
|
|
|
|
// because google maps loads all location results dynamically, its hard to check when
|
|
// results have been updated
|
|
// as a quick hack, we will wait until the last title of the last search
|
|
// differs from the last result in the dom
|
|
|
|
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
|
|
|
|
this.logger.info(`Waiting until new last serp title differs from: "${last_title_last_result}"`);
|
|
|
|
await this.page.waitForFunction((last_title) => {
|
|
const res = document.querySelectorAll('.section-result .section-result-title span');
|
|
return res[res.length-1].innerHTML !== last_title;
|
|
}, {timeout: 7000}, this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title);
|
|
|
|
return true;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector('.section-listbox-root .section-result', { timeout: this.STANDARD_TIMEOUT });
|
|
// more than 1 result
|
|
await this.page.waitForFunction("document.querySelectorAll('.section-result').length > 0", { timeout: 5000 });
|
|
await this.page.waitForNavigation();
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
|
|
class GoogleShoppingScraper extends Scraper {
|
|
|
|
constructor(...args) {
|
|
super(...args);
|
|
}
|
|
|
|
parse(html) {
|
|
// load the page source into cheerio
|
|
const $ = cheerio.load(html);
|
|
|
|
const results = [];
|
|
$('.sh-dlr__list-result').each((i, link) => {
|
|
results.push({
|
|
price: $(link).find('.sh-dlr__content div:nth-child(2) span > span').text(),
|
|
link: $(link).find('.sh-dlr__thumbnail a').attr('href'),
|
|
title: $(link).find('div > div > a[data-what="1"]').text(),
|
|
info1: $(link).find('.sh-dlr__content div:nth-child(2)').text(),
|
|
info2: $(link).find('.sh-dlr__content div:nth-child(3)').text(),
|
|
info3: $(link).find('.sh-dlr__content div:nth-child(4)').text(),
|
|
})
|
|
});
|
|
|
|
const grid_results = [];
|
|
|
|
$('.sh-pr__product-results-grid .sh-dgr__grid-result').each((i, link) => {
|
|
grid_results.push({
|
|
price: $(link).find('.sh-dgr__content div:nth-child(2) span').text(),
|
|
link: $(link).find('.sh-dgr__content a').attr('href'),
|
|
title: $(link).find('.sh-dgr__content a').text(),
|
|
info: $(link).find('.sh-dgr__content').text(),
|
|
})
|
|
});
|
|
|
|
// 'Ergebnisse für', 'Showing results for'
|
|
let no_results = this.no_results(
|
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
|
'No results found for'],
|
|
$('#main').text()
|
|
);
|
|
|
|
const cleaned = this.clean_results(results, ['title', 'link']);
|
|
|
|
return {
|
|
time: (new Date()).toUTCString(),
|
|
no_results: no_results,
|
|
results: cleaned,
|
|
grid_results: grid_results,
|
|
}
|
|
|
|
}
|
|
|
|
async load_start_page() {
|
|
let startUrl = 'https://www.google.com/shopping?';
|
|
|
|
if (this.config.google_settings) {
|
|
startUrl = `https://www.${this.config.google_settings.google_domain}/shopping?q=`;
|
|
if (this.config.google_settings.google_domain) {
|
|
startUrl = `https://www.${this.config.google_settings.google_domain}/shopping?`;
|
|
} else {
|
|
startUrl = `https://www.google.com/shopping?`;
|
|
}
|
|
|
|
for (var key in this.config.google_settings) {
|
|
if (key !== 'google_domain') {
|
|
startUrl += `${key}=${this.config.google_settings[key]}&`
|
|
}
|
|
}
|
|
}
|
|
|
|
this.logger.info('Using startUrl: ' + startUrl);
|
|
|
|
this.last_response = await this.page.goto(startUrl);
|
|
|
|
try {
|
|
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
|
} catch (e) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
async search_keyword(keyword) {
|
|
const input = await this.page.$('input[name="q"]');
|
|
await this.set_input_value(`input[name="q"]`, keyword);
|
|
await this.sleep(50);
|
|
await input.focus();
|
|
await this.page.keyboard.press("Enter");
|
|
}
|
|
|
|
async next_page() {
|
|
let next_page_link = await this.page.$('#pnnext', {timeout: 1000});
|
|
if (!next_page_link) {
|
|
return false;
|
|
}
|
|
await next_page_link.click();
|
|
|
|
return true;
|
|
}
|
|
|
|
async wait_for_results() {
|
|
await this.page.waitForSelector('#fbar', { timeout: this.STANDARD_TIMEOUT });
|
|
}
|
|
|
|
async detected() {
|
|
const title = await this.page.title();
|
|
let html = await this.page.content();
|
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
function clean_image_url(url) {
|
|
// Example:
|
|
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
|
|
const regex = /imgurl=(.*?)&/gm;
|
|
let match = regex.exec(url);
|
|
if (match !== null) {
|
|
return decodeURIComponent(match[1]);
|
|
}
|
|
}
|
|
|
|
function clean_google_url(url) {
|
|
// Example:
|
|
// /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF
|
|
// /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA
|
|
const regex = /url\?q=(.*?)&/gm;
|
|
let match = regex.exec(url);
|
|
if (match !== null) {
|
|
return decodeURIComponent(match[1]);
|
|
} else {
|
|
return url;
|
|
}
|
|
}
|
|
|
|
|
|
module.exports = {
|
|
GoogleShoppingScraper: GoogleShoppingScraper,
|
|
GoogleNewsOldScraper: GoogleNewsOldScraper,
|
|
GoogleScraper: GoogleScraper,
|
|
GoogleImageScraper: GoogleImageScraper,
|
|
GoogleNewsScraper: GoogleNewsScraper,
|
|
GoogleMapsScraper: GoogleMapsScraper,
|
|
};
|
|
|
|
|
|
// https://developers.google.com/custom-search/v1/cse/list
|
|
|
|
const GOOGLE_DOMAINS = {
|
|
'Samoa': 'google.ws',
|
|
'Vanuatu': 'google.vu',
|
|
'British Virgin Islands': 'google.vg',
|
|
'Trinidad and Tobago': 'google.tt',
|
|
'Tonga': 'google.to',
|
|
'Tunisia': 'google.tn',
|
|
'Turkmenistan': 'google.tm',
|
|
'Timor-Leste': 'google.tl',
|
|
'Tokelau': 'google.tk',
|
|
'Togo': 'google.tg',
|
|
'Chad': 'google.td',
|
|
'São Tomé and Príncipe': 'google.st',
|
|
'Suriname': 'google.sr',
|
|
'Somalia': 'google.so',
|
|
'Senegal': 'google.sn',
|
|
'San Marino': 'google.sm',
|
|
'Slovakia': 'google.sk',
|
|
'Slovenia': 'google.si',
|
|
'Saint Helena, Ascension and Tristan da Cunha': 'google.sh',
|
|
'Sweden': 'google.se',
|
|
'Seychelles': 'google.sc',
|
|
'Rwanda': 'google.rw',
|
|
'Russia': 'google.ru',
|
|
'Serbia': 'google.rs',
|
|
'Romania': 'google.ro',
|
|
'Portugal': 'google.pt',
|
|
'Palestine[3]': 'google.ps',
|
|
'Pitcairn Islands': 'google.co.pn',
|
|
'Poland': 'google.pl',
|
|
'Niue': 'google.nu',
|
|
'Nauru': 'google.nr',
|
|
'Norway': 'google.no',
|
|
'Netherlands': 'google.nl',
|
|
'Niger': 'google.ne',
|
|
'Malawi': 'google.mw',
|
|
'Maldives': 'google.mv',
|
|
'Mauritius': 'google.mu',
|
|
'Montserrat': 'google.ms',
|
|
'Mongolia': 'google.mn',
|
|
'Mali': 'google.ml',
|
|
'Macedonia': 'google.mk',
|
|
'Madagascar': 'google.mg',
|
|
'Montenegro': 'google.me',
|
|
'Moldova': 'google.md',
|
|
'Latvia': 'google.lv',
|
|
'Luxembourg': 'google.lu',
|
|
'Lithuania': 'google.lt',
|
|
'Sri Lanka': 'google.lk',
|
|
'Liechtenstein': 'google.li',
|
|
'Laos': 'google.la',
|
|
'Kazakhstan': 'google.kz',
|
|
'Kiribati': 'google.ki',
|
|
'Kyrgyzstan': 'google.kg',
|
|
'Jordan': 'google.jo',
|
|
'Jersey': 'google.je',
|
|
'Italy': 'google.it',
|
|
'Iceland': 'google.is',
|
|
'Iraq': 'google.iq',
|
|
'British Indian Ocean Territory': 'google.io',
|
|
'Isle of Man': 'google.im',
|
|
'Ireland': 'google.ie',
|
|
'Hungary': 'google.hu',
|
|
'Haiti': 'google.ht',
|
|
'Croatia': 'google.hr',
|
|
'Honduras': 'google.hn',
|
|
'Guyana': 'google.gy',
|
|
'Greece': 'google.gr',
|
|
'Guadeloupe': 'google.gp',
|
|
'Gambia': 'google.gm',
|
|
'Greenland': 'google.gl',
|
|
'Guernsey': 'google.gg',
|
|
'French Guiana': 'google.gf',
|
|
'Georgia': 'google.ge',
|
|
'Gabon': 'google.ga',
|
|
'France': 'google.fr',
|
|
'Federated States of Micronesia': 'google.fm',
|
|
'Finland': 'google.fi',
|
|
'Spain': 'google.es',
|
|
'Estonia': 'google.ee',
|
|
'Algeria': 'google.dz',
|
|
'Dominica': 'google.dm',
|
|
'Denmark': 'google.dk',
|
|
'Djibouti': 'google.dj',
|
|
'Germany': 'google.de',
|
|
'Czech Republic': 'google.cz',
|
|
'Cape Verde': 'google.cv',
|
|
'Vietnam': 'google.com.vn',
|
|
'Saint Vincent and the Grenadines': 'google.com.vc',
|
|
'Uruguay': 'google.com.uy',
|
|
'Ukraine': 'google.com.ua',
|
|
'Taiwan': 'google.com.tw',
|
|
'Turkey': 'google.com.tr',
|
|
'Tajikistan': 'google.com.tj',
|
|
'El Salvador': 'google.com.sv',
|
|
'Sierra Leone': 'google.com.sl',
|
|
'Singapore': 'google.com.sg',
|
|
'Solomon Islands': 'google.com.sb',
|
|
'Saudi Arabia': 'google.com.sa',
|
|
'Qatar': 'google.com.qa',
|
|
'Paraguay': 'google.com.py',
|
|
'Puerto Rico': 'google.com.pr',
|
|
'Pakistan': 'google.com.pk',
|
|
'Philippines': 'google.com.ph',
|
|
'Papua New Guinea': 'google.com.pg',
|
|
'Peru': 'google.com.pe',
|
|
'Panama': 'google.com.pa',
|
|
'Oman': 'google.com.om',
|
|
'Nepal': 'google.com.np',
|
|
'Nicaragua': 'google.com.ni',
|
|
'Nigeria': 'google.com.ng',
|
|
'Norfolk Island': 'google.com.nf',
|
|
'Namibia': 'google.com.na',
|
|
'Malaysia': 'google.com.my',
|
|
'Mexico': 'google.com.mx',
|
|
'Malta': 'google.com.mt',
|
|
'Myanmar': 'google.com.mm',
|
|
'Libya': 'google.com.ly',
|
|
'Saint Lucia': 'google.com.lc',
|
|
'Lebanon': 'google.com.lb',
|
|
'Kuwait': 'google.com.kw',
|
|
'Cambodia': 'google.com.kh',
|
|
'Jamaica': 'google.com.jm',
|
|
'Hong Kong': 'google.com.hk',
|
|
'Guatemala': 'google.com.gt',
|
|
'Gibraltar': 'google.com.gi',
|
|
'Ghana': 'google.com.gh',
|
|
'Fiji': 'google.com.fj',
|
|
'Ethiopia': 'google.com.et',
|
|
'Egypt': 'google.com.eg',
|
|
'Ecuador': 'google.com.ec',
|
|
'Dominican Republic': 'google.com.do',
|
|
'Cyprus': 'google.com.cy',
|
|
'Cuba': 'google.com.cu',
|
|
'Colombia': 'google.com.co',
|
|
'Belize': 'google.com.bz',
|
|
'Brazil': 'google.com.br',
|
|
'Bolivia': 'google.com.bo',
|
|
'Brunei': 'google.com.bn',
|
|
'Bahrain': 'google.com.bh',
|
|
'Bangladesh': 'google.com.bd',
|
|
'Australia': 'google.com.au',
|
|
'Argentina': 'google.com.ar',
|
|
'Anguilla': 'google.com.ai',
|
|
'Antigua and Barbuda': 'google.com.ag',
|
|
'Afghanistan': 'google.com.af',
|
|
'Worldwide (Original for the United States)': 'google.com',
|
|
'United States': 'google.com',
|
|
'Zimbabwe': 'google.co.zw',
|
|
'Zambia': 'google.co.zm',
|
|
'South Africa': 'google.co.za',
|
|
'United States Virgin Islands': 'google.co.vi',
|
|
'Venezuela': 'google.co.ve',
|
|
'Uzbekistan': 'google.co.uz',
|
|
'United Kingdom': 'google.co.uk',
|
|
'Uganda': 'google.co.ug',
|
|
'Tanzania': 'google.co.tz',
|
|
'Thailand': 'google.co.th',
|
|
'New Zealand': 'google.co.nz',
|
|
'Mozambique': 'google.co.mz',
|
|
'Morocco': 'google.co.ma',
|
|
'Lesotho': 'google.co.ls',
|
|
'South Korea': 'google.co.kr',
|
|
'Kenya': 'google.co.ke',
|
|
'Japan': 'google.co.jp',
|
|
'India': 'google.co.in',
|
|
'Israel': 'google.co.il',
|
|
'Indonesia': 'google.co.id',
|
|
'Costa Rica': 'google.co.cr',
|
|
'Cook Islands': 'google.co.ck',
|
|
'Botswana': 'google.co.bw',
|
|
'Angola': 'google.co.ao',
|
|
'China': 'google.cn',
|
|
'Cameroon': 'google.cm',
|
|
'Chile': 'google.cl',
|
|
'Ivory Coast': 'google.ci',
|
|
'Switzerland': 'google.ch',
|
|
'Republic of the Congo': 'google.cg',
|
|
'Central African Republic': 'google.cf',
|
|
'Democratic Republic of the Congo': 'google.cd',
|
|
'Cocos (Keeling) Islands': 'google.cc',
|
|
'Catalan Countries': 'google.cat',
|
|
'Canada': 'google.ca',
|
|
'Belarus': 'google.by',
|
|
'Bhutan': 'google.bt',
|
|
'Bahamas': 'google.bs',
|
|
'Benin': 'google.bj',
|
|
'Burundi': 'google.bi',
|
|
'Bulgaria': 'google.bg',
|
|
'Burkina Faso': 'google.bf',
|
|
'Belgium': 'google.be',
|
|
'Bosnia and Herzegovina': 'google.ba',
|
|
'Azerbaijan': 'google.az',
|
|
'Austria': 'google.at',
|
|
'American Samoa': 'google.as',
|
|
'Armenia': 'google.am',
|
|
'Albania': 'google.al',
|
|
'United Arab Emirates': 'google.ae',
|
|
'Andorra': 'google.ad',
|
|
'Ascension Island': 'google.ac'
|
|
};
|
|
|
|
// https://developers.google.com/custom-search/docs/xml_results_appendices#countryCodes
|
|
// The gl parameter determines the Google country to use for the query.
|
|
const GOOGLE_GL = {'af': 'Afghanistan',
|
|
'al': 'Albania',
|
|
'dz': 'Algeria',
|
|
'as': 'American Samoa',
|
|
'ad': 'Andorra',
|
|
'ao': 'Angola',
|
|
'ai': 'Anguilla',
|
|
'aq': 'Antarctica',
|
|
'ag': 'Antigua and Barbuda',
|
|
'ar': 'Argentina',
|
|
'am': 'Armenia',
|
|
'aw': 'Aruba',
|
|
'au': 'Australia',
|
|
'at': 'Austria',
|
|
'az': 'Azerbaijan',
|
|
'bs': 'Bahamas',
|
|
'bh': 'Bahrain',
|
|
'bd': 'Bangladesh',
|
|
'bb': 'Barbados',
|
|
'by': 'Belarus',
|
|
'be': 'Belgium',
|
|
'bz': 'Belize',
|
|
'bj': 'Benin',
|
|
'bm': 'Bermuda',
|
|
'bt': 'Bhutan',
|
|
'bo': 'Bolivia',
|
|
'ba': 'Bosnia and Herzegovina',
|
|
'bw': 'Botswana',
|
|
'bv': 'Bouvet Island',
|
|
'br': 'Brazil',
|
|
'io': 'British Indian Ocean Territory',
|
|
'bn': 'Brunei Darussalam',
|
|
'bg': 'Bulgaria',
|
|
'bf': 'Burkina Faso',
|
|
'bi': 'Burundi',
|
|
'kh': 'Cambodia',
|
|
'cm': 'Cameroon',
|
|
'ca': 'Canada',
|
|
'cv': 'Cape Verde',
|
|
'ky': 'Cayman Islands',
|
|
'cf': 'Central African Republic',
|
|
'td': 'Chad',
|
|
'cl': 'Chile',
|
|
'cn': 'China',
|
|
'cx': 'Christmas Island',
|
|
'cc': 'Cocos (Keeling) Islands',
|
|
'co': 'Colombia',
|
|
'km': 'Comoros',
|
|
'cg': 'Congo',
|
|
'cd': 'Congo, the Democratic Republic of the',
|
|
'ck': 'Cook Islands',
|
|
'cr': 'Costa Rica',
|
|
'ci': "Cote D'ivoire",
|
|
'hr': 'Croatia',
|
|
'cu': 'Cuba',
|
|
'cy': 'Cyprus',
|
|
'cz': 'Czech Republic',
|
|
'dk': 'Denmark',
|
|
'dj': 'Djibouti',
|
|
'dm': 'Dominica',
|
|
'do': 'Dominican Republic',
|
|
'ec': 'Ecuador',
|
|
'eg': 'Egypt',
|
|
'sv': 'El Salvador',
|
|
'gq': 'Equatorial Guinea',
|
|
'er': 'Eritrea',
|
|
'ee': 'Estonia',
|
|
'et': 'Ethiopia',
|
|
'fk': 'Falkland Islands (Malvinas)',
|
|
'fo': 'Faroe Islands',
|
|
'fj': 'Fiji',
|
|
'fi': 'Finland',
|
|
'fr': 'France',
|
|
'gf': 'French Guiana',
|
|
'pf': 'French Polynesia',
|
|
'tf': 'French Southern Territories',
|
|
'ga': 'Gabon',
|
|
'gm': 'Gambia',
|
|
'ge': 'Georgia',
|
|
'de': 'Germany',
|
|
'gh': 'Ghana',
|
|
'gi': 'Gibraltar',
|
|
'gr': 'Greece',
|
|
'gl': 'Greenland',
|
|
'gd': 'Grenada',
|
|
'gp': 'Guadeloupe',
|
|
'gu': 'Guam',
|
|
'gt': 'Guatemala',
|
|
'gn': 'Guinea',
|
|
'gw': 'Guinea-Bissau',
|
|
'gy': 'Guyana',
|
|
'ht': 'Haiti',
|
|
'hm': 'Heard Island and Mcdonald Islands',
|
|
'va': 'Holy See (Vatican City State)',
|
|
'hn': 'Honduras',
|
|
'hk': 'Hong Kong',
|
|
'hu': 'Hungary',
|
|
'is': 'Iceland',
|
|
'in': 'India',
|
|
'id': 'Indonesia',
|
|
'ir': 'Iran, Islamic Republic of',
|
|
'iq': 'Iraq',
|
|
'ie': 'Ireland',
|
|
'il': 'Israel',
|
|
'it': 'Italy',
|
|
'jm': 'Jamaica',
|
|
'jp': 'Japan',
|
|
'jo': 'Jordan',
|
|
'kz': 'Kazakhstan',
|
|
'ke': 'Kenya',
|
|
'ki': 'Kiribati',
|
|
'kp': "Korea, Democratic People's Republic of",
|
|
'kr': 'Korea, Republic of',
|
|
'kw': 'Kuwait',
|
|
'kg': 'Kyrgyzstan',
|
|
'la': "Lao People's Democratic Republic",
|
|
'lv': 'Latvia',
|
|
'lb': 'Lebanon',
|
|
'ls': 'Lesotho',
|
|
'lr': 'Liberia',
|
|
'ly': 'Libyan Arab Jamahiriya',
|
|
'li': 'Liechtenstein',
|
|
'lt': 'Lithuania',
|
|
'lu': 'Luxembourg',
|
|
'mo': 'Macao',
|
|
'mk': 'Macedonia, the Former Yugosalv Republic of',
|
|
'mg': 'Madagascar',
|
|
'mw': 'Malawi',
|
|
'my': 'Malaysia',
|
|
'mv': 'Maldives',
|
|
'ml': 'Mali',
|
|
'mt': 'Malta',
|
|
'mh': 'Marshall Islands',
|
|
'mq': 'Martinique',
|
|
'mr': 'Mauritania',
|
|
'mu': 'Mauritius',
|
|
'yt': 'Mayotte',
|
|
'mx': 'Mexico',
|
|
'fm': 'Micronesia, Federated States of',
|
|
'md': 'Moldova, Republic of',
|
|
'mc': 'Monaco',
|
|
'mn': 'Mongolia',
|
|
'ms': 'Montserrat',
|
|
'ma': 'Morocco',
|
|
'mz': 'Mozambique',
|
|
'mm': 'Myanmar',
|
|
'na': 'Namibia',
|
|
'nr': 'Nauru',
|
|
'np': 'Nepal',
|
|
'nl': 'Netherlands',
|
|
'an': 'Netherlands Antilles',
|
|
'nc': 'New Caledonia',
|
|
'nz': 'New Zealand',
|
|
'ni': 'Nicaragua',
|
|
'ne': 'Niger',
|
|
'ng': 'Nigeria',
|
|
'nu': 'Niue',
|
|
'nf': 'Norfolk Island',
|
|
'mp': 'Northern Mariana Islands',
|
|
'no': 'Norway',
|
|
'om': 'Oman',
|
|
'pk': 'Pakistan',
|
|
'pw': 'Palau',
|
|
'ps': 'Palestinian Territory, Occupied',
|
|
'pa': 'Panama',
|
|
'pg': 'Papua New Guinea',
|
|
'py': 'Paraguay',
|
|
'pe': 'Peru',
|
|
'ph': 'Philippines',
|
|
'pn': 'Pitcairn',
|
|
'pl': 'Poland',
|
|
'pt': 'Portugal',
|
|
'pr': 'Puerto Rico',
|
|
'qa': 'Qatar',
|
|
're': 'Reunion',
|
|
'ro': 'Romania',
|
|
'ru': 'Russian Federation',
|
|
'rw': 'Rwanda',
|
|
'sh': 'Saint Helena',
|
|
'kn': 'Saint Kitts and Nevis',
|
|
'lc': 'Saint Lucia',
|
|
'pm': 'Saint Pierre and Miquelon',
|
|
'vc': 'Saint Vincent and the Grenadines',
|
|
'ws': 'Samoa',
|
|
'sm': 'San Marino',
|
|
'st': 'Sao Tome and Principe',
|
|
'sa': 'Saudi Arabia',
|
|
'sn': 'Senegal',
|
|
'cs': 'Serbia and Montenegro',
|
|
'sc': 'Seychelles',
|
|
'sl': 'Sierra Leone',
|
|
'sg': 'Singapore',
|
|
'sk': 'Slovakia',
|
|
'si': 'Slovenia',
|
|
'sb': 'Solomon Islands',
|
|
'so': 'Somalia',
|
|
'za': 'South Africa',
|
|
'gs': 'South Georgia and the South Sandwich Islands',
|
|
'es': 'Spain',
|
|
'lk': 'Sri Lanka',
|
|
'sd': 'Sudan',
|
|
'sr': 'Suriname',
|
|
'sj': 'Svalbard and Jan Mayen',
|
|
'sz': 'Swaziland',
|
|
'se': 'Sweden',
|
|
'ch': 'Switzerland',
|
|
'sy': 'Syrian Arab Republic',
|
|
'tw': 'Taiwan, Province of China',
|
|
'tj': 'Tajikistan',
|
|
'tz': 'Tanzania, United Republic of',
|
|
'th': 'Thailand',
|
|
'tl': 'Timor-Leste',
|
|
'tg': 'Togo',
|
|
'tk': 'Tokelau',
|
|
'to': 'Tonga',
|
|
'tt': 'Trinidad and Tobago',
|
|
'tn': 'Tunisia',
|
|
'tr': 'Turkey',
|
|
'tm': 'Turkmenistan',
|
|
'tc': 'Turks and Caicos Islands',
|
|
'tv': 'Tuvalu',
|
|
'ug': 'Uganda',
|
|
'ua': 'Ukraine',
|
|
'ae': 'United Arab Emirates',
|
|
'uk': 'United Kingdom',
|
|
'us': 'United States',
|
|
'um': 'United States Minor Outlying Islands',
|
|
'uy': 'Uruguay',
|
|
'uz': 'Uzbekistan',
|
|
'vu': 'Vanuatu',
|
|
've': 'Venezuela',
|
|
'vn': 'Viet Nam',
|
|
'vg': 'Virgin Islands, British',
|
|
'vi': 'Virgin Islands, U.S.',
|
|
'wf': 'Wallis and Futuna',
|
|
'eh': 'Western Sahara',
|
|
'ye': 'Yemen',
|
|
'zm': 'Zambia',
|
|
'zw': 'Zimbabwe'
|
|
};
|
|
|
|
|
|
// https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
|
|
// The hl parameter determines the Google UI language to return results.
|
|
const GOOGLE_HL = {
|
|
'af': 'Afrikaans',
|
|
'sq': 'Albanian',
|
|
'sm': 'Amharic',
|
|
'ar': 'Arabic',
|
|
'az': 'Azerbaijani',
|
|
'eu': 'Basque',
|
|
'be': 'Belarusian',
|
|
'bn': 'Bengali',
|
|
'bh': 'Bihari',
|
|
'bs': 'Bosnian',
|
|
'bg': 'Bulgarian',
|
|
'ca': 'Catalan',
|
|
'zh-CN': 'Chinese (Simplified)',
|
|
'zh-TW': 'Chinese (Traditional)',
|
|
'hr': 'Croatian',
|
|
'cs': 'Czech',
|
|
'da': 'Danish',
|
|
'nl': 'Dutch',
|
|
'en': 'English',
|
|
'eo': 'Esperanto',
|
|
'et': 'Estonian',
|
|
'fo': 'Faroese',
|
|
'fi': 'Finnish',
|
|
'fr': 'French',
|
|
'fy': 'Frisian',
|
|
'gl': 'Galician',
|
|
'ka': 'Georgian',
|
|
'de': 'German',
|
|
'el': 'Greek',
|
|
'gu': 'Gujarati',
|
|
'iw': 'Hebrew',
|
|
'hi': 'Hindi',
|
|
'hu': 'Hungarian',
|
|
'is': 'Icelandic',
|
|
'id': 'Indonesian',
|
|
'ia': 'Interlingua',
|
|
'ga': 'Irish',
|
|
'it': 'Italian',
|
|
'ja': 'Japanese',
|
|
'jw': 'Javanese',
|
|
'kn': 'Kannada',
|
|
'ko': 'Korean',
|
|
'la': 'Latin',
|
|
'lv': 'Latvian',
|
|
'lt': 'Lithuanian',
|
|
'mk': 'Macedonian',
|
|
'ms': 'Malay',
|
|
'ml': 'Malayam',
|
|
'mt': 'Maltese',
|
|
'mr': 'Marathi',
|
|
'ne': 'Nepali',
|
|
'no': 'Norwegian',
|
|
'nn': 'Norwegian (Nynorsk)',
|
|
'oc': 'Occitan',
|
|
'fa': 'Persian',
|
|
'pl': 'Polish',
|
|
'pt-BR': 'Portuguese (Brazil)',
|
|
'pt-PT': 'Portuguese (Portugal)',
|
|
'pa': 'Punjabi',
|
|
'ro': 'Romanian',
|
|
'ru': 'Russian',
|
|
'gd': 'Scots Gaelic',
|
|
'sr': 'Serbian',
|
|
'si': 'Sinhalese',
|
|
'sk': 'Slovak',
|
|
'sl': 'Slovenian',
|
|
'es': 'Spanish',
|
|
'su': 'Sudanese',
|
|
'sw': 'Swahili',
|
|
'sv': 'Swedish',
|
|
'tl': 'Tagalog',
|
|
'ta': 'Tamil',
|
|
'te': 'Telugu',
|
|
'th': 'Thai',
|
|
'ti': 'Tigrinya',
|
|
'tr': 'Turkish',
|
|
'uk': 'Ukrainian',
|
|
'ur': 'Urdu',
|
|
'uz': 'Uzbek',
|
|
'vi': 'Vietnamese',
|
|
'cy': 'Welsh',
|
|
'xh': 'Xhosa',
|
|
'zu': 'Zulu'
|
|
};
|