added support for custom query string parameters

This commit is contained in:
Nikolai Tschacher 2019-03-06 00:08:25 +01:00
parent 7239e23cba
commit 7b52b4e62f
12 changed files with 2203 additions and 655 deletions

View File

@ -16,6 +16,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
- [Scraping Model](#scraping-model) - [Scraping Model](#scraping-model)
- [Technical Notes](#technical-notes) - [Technical Notes](#technical-notes)
- [Advanced Usage](#advanced-usage) - [Advanced Usage](#advanced-usage)
- [Special Query String Parameters for Search Engines](#query-string-parameters)
Se-scraper supports the following search engines: Se-scraper supports the following search engines:
@ -307,4 +308,24 @@ function callback(err, response) {
se_scraper.scrape(config, callback); se_scraper.scrape(config, callback);
``` ```
[Output for the above script on my machine.](examples/results/advanced.json) [Output for the above script on my machine.](examples/results/advanced.json)
### Query String Parameters
You can add custom query string parameters to the configuration object by specifying a `google_settings` key. In general, the key is named `{{search engine}}_settings` (for example, `bing_settings` for Bing).
For example, you can customize your Google search with the following config:
```js
let config = {
search_engine: 'google',
// use specific search engine parameters for various search engines
google_settings: {
google_domain: 'google.com',
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'en', // The hl parameter determines the Google UI language to return results in.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
}
```

View File

@ -40,6 +40,12 @@
- make README.md nicer. https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template - make README.md nicer. https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template
### TODO: ### TODO:
- fix duckduckgo test case!!!
- add test case for infospace
- add test case for google parameters for
- num
- start
- some language settings
- write test case for proxy support and cluster support - write test case for proxy support and cluster support
- add captcha service solving support - add captcha service solving support
- check if news instances run the same browser and if we can have one proxy per tab workers - check if news instances run the same browser and if we can have one proxy per tab workers

File diff suppressed because it is too large Load Diff

2
package-lock.json generated
View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.1.14", "version": "1.2.7",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.2.7", "version": "1.2.8",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",
@ -23,6 +23,7 @@
"dependencies": { "dependencies": {
"chai": "^4.2.0", "chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",
"debug": "^4.1.1",
"got": "^9.6.0", "got": "^9.6.0",
"proxy-chain": "^0.2.7", "proxy-chain": "^0.2.7",
"puppeteer": "^1.12.2", "puppeteer": "^1.12.2",

24
run.js
View File

@ -10,6 +10,24 @@ let config = {
sleep_range: '[1,2]', sleep_range: '[1,2]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
// use specific search engine parameters for various search engines
google_settings: {
google_domain: 'google.com',
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'us', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
bing_settings: {
count: 50, // how many results per page
safeSearch: 'Off', // safe search (strict, moderate, off)
cc: 'us', // ISO 3166 country code
offset: 0, // The zero-based offset that indicates the number of search results to skip before returning results
},
// whether debug information should be printed // whether debug information should be printed
// debug info is useful for developers when debugging // debug info is useful for developers when debugging
debug: false, debug: false,
@ -17,7 +35,7 @@ let config = {
// this output is informational // this output is informational
verbose: true, verbose: true,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['scrapeulous.com', 'scraping search engines', 'scraping service scrapeulous', 'learn js'], keywords: ['good news'],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '', keyword_file: '',
// the number of pages to scrape for each keyword // the number of pages to scrape for each keyword
@ -54,7 +72,7 @@ let config = {
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
monitor: false, monitor: false,
concurrency: 1, // one scraper per tab concurrency: 1, // one scraper per tab
maxConcurrency: 2, // scrape with 2 tabs maxConcurrency: 1, // scrape with 1 tab
} }
}; };
@ -68,7 +86,7 @@ function callback(err, response) {
response.statusCode - status code of the scraping process response.statusCode - status code of the scraping process
*/ */
// console.dir(response.results, {depth: null, colors: true}); console.dir(response.results, {depth: null, colors: true});
} }
se_scraper.scrape(config, callback); se_scraper.scrape(config, callback);

View File

@ -35,8 +35,11 @@ class BaiduScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
let startUrl = this.build_start_url('https://www.baidu.com/s?') || 'https://www.baidu.com/';
try { try {
await this.page.goto('https://www.baidu.com/'); await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 }); await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
} catch (e) { } catch (e) {
return false; return false;

View File

@ -45,8 +45,10 @@ class BingScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
try { try {
await this.page.goto('https://www.bing.com/'); await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) { } catch (e) {
return false; return false;
@ -117,8 +119,10 @@ class BingNewsScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
let startUrl = 'https://www.bing.com/news/search?';
try { try {
await this.page.goto('https://www.bing.com/news/search?'); await this.page.goto(startUrl);
if (this.config.set_manual_settings === true) { if (this.config.set_manual_settings === true) {
console.log('Sleeping 30 seconds. Set your settings now.'); console.log('Sleeping 30 seconds. Set your settings now.');
await this.sleep(30000); await this.sleep(30000);
@ -127,6 +131,7 @@ class BingNewsScraper extends Scraper {
} catch (e) { } catch (e) {
return false; return false;
} }
return true; return true;
} }

View File

@ -38,8 +38,11 @@ class DuckduckgoScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
let startUrl = this.build_start_url('https://duckduckgo.com/?') || 'https://duckduckgo.com/';
try { try {
await this.page.goto('https://duckduckgo.com/'); await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) { } catch (e) {
return false; return false;
@ -56,19 +59,19 @@ class DuckduckgoScraper extends Scraper {
} }
async next_page() { async next_page() {
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000}); let next_page_link = await this.page.$('a.result--more__btn', {timeout: 5000});
if (!next_page_link) { if (!next_page_link) {
return false; return false;
} }
await next_page_link.click(); await next_page_link.click();
//await this.page.waitForNavigation(); await this.page.waitForNavigation();
return true; return true;
} }
async wait_for_results() { async wait_for_results() {
await this.page.waitForSelector('.serp__results', { timeout: 5000 }); await this.page.waitForSelector('.serp__results', { timeout: 5000 });
await this.sleep(250); await this.sleep(350);
} }
async detected() { async detected() {

View File

@ -1,7 +1,544 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const Scraper = require('./se_scraper'); const Scraper = require('./se_scraper');
class GoogleScraper extends Scraper { // https://developers.google.com/custom-search/v1/cse/list
const GOOGLE_DOMAINS = {
'Samoa': 'google.ws',
'Vanuatu': 'google.vu',
'British Virgin Islands': 'google.vg',
'Trinidad and Tobago': 'google.tt',
'Tonga': 'google.to',
'Tunisia': 'google.tn',
'Turkmenistan': 'google.tm',
'Timor-Leste': 'google.tl',
'Tokelau': 'google.tk',
'Togo': 'google.tg',
'Chad': 'google.td',
'São Tomé and Príncipe': 'google.st',
'Suriname': 'google.sr',
'Somalia': 'google.so',
'Senegal': 'google.sn',
'San Marino': 'google.sm',
'Slovakia': 'google.sk',
'Slovenia': 'google.si',
'Saint Helena, Ascension and Tristan da Cunha': 'google.sh',
'Sweden': 'google.se',
'Seychelles': 'google.sc',
'Rwanda': 'google.rw',
'Russia': 'google.ru',
'Serbia': 'google.rs',
'Romania': 'google.ro',
'Portugal': 'google.pt',
'Palestine[3]': 'google.ps',
'Pitcairn Islands': 'google.co.pn',
'Poland': 'google.pl',
'Niue': 'google.nu',
'Nauru': 'google.nr',
'Norway': 'google.no',
'Netherlands': 'google.nl',
'Niger': 'google.ne',
'Malawi': 'google.mw',
'Maldives': 'google.mv',
'Mauritius': 'google.mu',
'Montserrat': 'google.ms',
'Mongolia': 'google.mn',
'Mali': 'google.ml',
'Macedonia': 'google.mk',
'Madagascar': 'google.mg',
'Montenegro': 'google.me',
'Moldova': 'google.md',
'Latvia': 'google.lv',
'Luxembourg': 'google.lu',
'Lithuania': 'google.lt',
'Sri Lanka': 'google.lk',
'Liechtenstein': 'google.li',
'Laos': 'google.la',
'Kazakhstan': 'google.kz',
'Kiribati': 'google.ki',
'Kyrgyzstan': 'google.kg',
'Jordan': 'google.jo',
'Jersey': 'google.je',
'Italy': 'google.it',
'Iceland': 'google.is',
'Iraq': 'google.iq',
'British Indian Ocean Territory': 'google.io',
'Isle of Man': 'google.im',
'Ireland': 'google.ie',
'Hungary': 'google.hu',
'Haiti': 'google.ht',
'Croatia': 'google.hr',
'Honduras': 'google.hn',
'Guyana': 'google.gy',
'Greece': 'google.gr',
'Guadeloupe': 'google.gp',
'Gambia': 'google.gm',
'Greenland': 'google.gl',
'Guernsey': 'google.gg',
'French Guiana': 'google.gf',
'Georgia': 'google.ge',
'Gabon': 'google.ga',
'France': 'google.fr',
'Federated States of Micronesia': 'google.fm',
'Finland': 'google.fi',
'Spain': 'google.es',
'Estonia': 'google.ee',
'Algeria': 'google.dz',
'Dominica': 'google.dm',
'Denmark': 'google.dk',
'Djibouti': 'google.dj',
'Germany': 'google.de',
'Czech Republic': 'google.cz',
'Cape Verde': 'google.cv',
'Vietnam': 'google.com.vn',
'Saint Vincent and the Grenadines': 'google.com.vc',
'Uruguay': 'google.com.uy',
'Ukraine': 'google.com.ua',
'Taiwan': 'google.com.tw',
'Turkey': 'google.com.tr',
'Tajikistan': 'google.com.tj',
'El Salvador': 'google.com.sv',
'Sierra Leone': 'google.com.sl',
'Singapore': 'google.com.sg',
'Solomon Islands': 'google.com.sb',
'Saudi Arabia': 'google.com.sa',
'Qatar': 'google.com.qa',
'Paraguay': 'google.com.py',
'Puerto Rico': 'google.com.pr',
'Pakistan': 'google.com.pk',
'Philippines': 'google.com.ph',
'Papua New Guinea': 'google.com.pg',
'Peru': 'google.com.pe',
'Panama': 'google.com.pa',
'Oman': 'google.com.om',
'Nepal': 'google.com.np',
'Nicaragua': 'google.com.ni',
'Nigeria': 'google.com.ng',
'Norfolk Island': 'google.com.nf',
'Namibia': 'google.com.na',
'Malaysia': 'google.com.my',
'Mexico': 'google.com.mx',
'Malta': 'google.com.mt',
'Myanmar': 'google.com.mm',
'Libya': 'google.com.ly',
'Saint Lucia': 'google.com.lc',
'Lebanon': 'google.com.lb',
'Kuwait': 'google.com.kw',
'Cambodia': 'google.com.kh',
'Jamaica': 'google.com.jm',
'Hong Kong': 'google.com.hk',
'Guatemala': 'google.com.gt',
'Gibraltar': 'google.com.gi',
'Ghana': 'google.com.gh',
'Fiji': 'google.com.fj',
'Ethiopia': 'google.com.et',
'Egypt': 'google.com.eg',
'Ecuador': 'google.com.ec',
'Dominican Republic': 'google.com.do',
'Cyprus': 'google.com.cy',
'Cuba': 'google.com.cu',
'Colombia': 'google.com.co',
'Belize': 'google.com.bz',
'Brazil': 'google.com.br',
'Bolivia': 'google.com.bo',
'Brunei': 'google.com.bn',
'Bahrain': 'google.com.bh',
'Bangladesh': 'google.com.bd',
'Australia': 'google.com.au',
'Argentina': 'google.com.ar',
'Anguilla': 'google.com.ai',
'Antigua and Barbuda': 'google.com.ag',
'Afghanistan': 'google.com.af',
'Worldwide (Original for the United States)': 'google.com',
'United States': 'google.com',
'Zimbabwe': 'google.co.zw',
'Zambia': 'google.co.zm',
'South Africa': 'google.co.za',
'United States Virgin Islands': 'google.co.vi',
'Venezuela': 'google.co.ve',
'Uzbekistan': 'google.co.uz',
'United Kingdom': 'google.co.uk',
'Uganda': 'google.co.ug',
'Tanzania': 'google.co.tz',
'Thailand': 'google.co.th',
'New Zealand': 'google.co.nz',
'Mozambique': 'google.co.mz',
'Morocco': 'google.co.ma',
'Lesotho': 'google.co.ls',
'South Korea': 'google.co.kr',
'Kenya': 'google.co.ke',
'Japan': 'google.co.jp',
'India': 'google.co.in',
'Israel': 'google.co.il',
'Indonesia': 'google.co.id',
'Costa Rica': 'google.co.cr',
'Cook Islands': 'google.co.ck',
'Botswana': 'google.co.bw',
'Angola': 'google.co.ao',
'China': 'google.cn',
'Cameroon': 'google.cm',
'Chile': 'google.cl',
'Ivory Coast': 'google.ci',
'Switzerland': 'google.ch',
'Republic of the Congo': 'google.cg',
'Central African Republic': 'google.cf',
'Democratic Republic of the Congo': 'google.cd',
'Cocos (Keeling) Islands': 'google.cc',
'Catalan Countries': 'google.cat',
'Canada': 'google.ca',
'Belarus': 'google.by',
'Bhutan': 'google.bt',
'Bahamas': 'google.bs',
'Benin': 'google.bj',
'Burundi': 'google.bi',
'Bulgaria': 'google.bg',
'Burkina Faso': 'google.bf',
'Belgium': 'google.be',
'Bosnia and Herzegovina': 'google.ba',
'Azerbaijan': 'google.az',
'Austria': 'google.at',
'American Samoa': 'google.as',
'Armenia': 'google.am',
'Albania': 'google.al',
'United Arab Emirates': 'google.ae',
'Andorra': 'google.ad',
'Ascension Island': 'google.ac'
};
// https://developers.google.com/custom-search/docs/xml_results_appendices#countryCodes
// The gl parameter determines the Google country to use for the query.
const GOOGLE_GL = {'af': 'Afghanistan',
'al': 'Albania',
'dz': 'Algeria',
'as': 'American Samoa',
'ad': 'Andorra',
'ao': 'Angola',
'ai': 'Anguilla',
'aq': 'Antarctica',
'ag': 'Antigua and Barbuda',
'ar': 'Argentina',
'am': 'Armenia',
'aw': 'Aruba',
'au': 'Australia',
'at': 'Austria',
'az': 'Azerbaijan',
'bs': 'Bahamas',
'bh': 'Bahrain',
'bd': 'Bangladesh',
'bb': 'Barbados',
'by': 'Belarus',
'be': 'Belgium',
'bz': 'Belize',
'bj': 'Benin',
'bm': 'Bermuda',
'bt': 'Bhutan',
'bo': 'Bolivia',
'ba': 'Bosnia and Herzegovina',
'bw': 'Botswana',
'bv': 'Bouvet Island',
'br': 'Brazil',
'io': 'British Indian Ocean Territory',
'bn': 'Brunei Darussalam',
'bg': 'Bulgaria',
'bf': 'Burkina Faso',
'bi': 'Burundi',
'kh': 'Cambodia',
'cm': 'Cameroon',
'ca': 'Canada',
'cv': 'Cape Verde',
'ky': 'Cayman Islands',
'cf': 'Central African Republic',
'td': 'Chad',
'cl': 'Chile',
'cn': 'China',
'cx': 'Christmas Island',
'cc': 'Cocos (Keeling) Islands',
'co': 'Colombia',
'km': 'Comoros',
'cg': 'Congo',
'cd': 'Congo, the Democratic Republic of the',
'ck': 'Cook Islands',
'cr': 'Costa Rica',
'ci': "Cote D'ivoire",
'hr': 'Croatia',
'cu': 'Cuba',
'cy': 'Cyprus',
'cz': 'Czech Republic',
'dk': 'Denmark',
'dj': 'Djibouti',
'dm': 'Dominica',
'do': 'Dominican Republic',
'ec': 'Ecuador',
'eg': 'Egypt',
'sv': 'El Salvador',
'gq': 'Equatorial Guinea',
'er': 'Eritrea',
'ee': 'Estonia',
'et': 'Ethiopia',
'fk': 'Falkland Islands (Malvinas)',
'fo': 'Faroe Islands',
'fj': 'Fiji',
'fi': 'Finland',
'fr': 'France',
'gf': 'French Guiana',
'pf': 'French Polynesia',
'tf': 'French Southern Territories',
'ga': 'Gabon',
'gm': 'Gambia',
'ge': 'Georgia',
'de': 'Germany',
'gh': 'Ghana',
'gi': 'Gibraltar',
'gr': 'Greece',
'gl': 'Greenland',
'gd': 'Grenada',
'gp': 'Guadeloupe',
'gu': 'Guam',
'gt': 'Guatemala',
'gn': 'Guinea',
'gw': 'Guinea-Bissau',
'gy': 'Guyana',
'ht': 'Haiti',
'hm': 'Heard Island and Mcdonald Islands',
'va': 'Holy See (Vatican City State)',
'hn': 'Honduras',
'hk': 'Hong Kong',
'hu': 'Hungary',
'is': 'Iceland',
'in': 'India',
'id': 'Indonesia',
'ir': 'Iran, Islamic Republic of',
'iq': 'Iraq',
'ie': 'Ireland',
'il': 'Israel',
'it': 'Italy',
'jm': 'Jamaica',
'jp': 'Japan',
'jo': 'Jordan',
'kz': 'Kazakhstan',
'ke': 'Kenya',
'ki': 'Kiribati',
'kp': "Korea, Democratic People's Republic of",
'kr': 'Korea, Republic of',
'kw': 'Kuwait',
'kg': 'Kyrgyzstan',
'la': "Lao People's Democratic Republic",
'lv': 'Latvia',
'lb': 'Lebanon',
'ls': 'Lesotho',
'lr': 'Liberia',
'ly': 'Libyan Arab Jamahiriya',
'li': 'Liechtenstein',
'lt': 'Lithuania',
'lu': 'Luxembourg',
'mo': 'Macao',
'mk': 'Macedonia, the Former Yugosalv Republic of',
'mg': 'Madagascar',
'mw': 'Malawi',
'my': 'Malaysia',
'mv': 'Maldives',
'ml': 'Mali',
'mt': 'Malta',
'mh': 'Marshall Islands',
'mq': 'Martinique',
'mr': 'Mauritania',
'mu': 'Mauritius',
'yt': 'Mayotte',
'mx': 'Mexico',
'fm': 'Micronesia, Federated States of',
'md': 'Moldova, Republic of',
'mc': 'Monaco',
'mn': 'Mongolia',
'ms': 'Montserrat',
'ma': 'Morocco',
'mz': 'Mozambique',
'mm': 'Myanmar',
'na': 'Namibia',
'nr': 'Nauru',
'np': 'Nepal',
'nl': 'Netherlands',
'an': 'Netherlands Antilles',
'nc': 'New Caledonia',
'nz': 'New Zealand',
'ni': 'Nicaragua',
'ne': 'Niger',
'ng': 'Nigeria',
'nu': 'Niue',
'nf': 'Norfolk Island',
'mp': 'Northern Mariana Islands',
'no': 'Norway',
'om': 'Oman',
'pk': 'Pakistan',
'pw': 'Palau',
'ps': 'Palestinian Territory, Occupied',
'pa': 'Panama',
'pg': 'Papua New Guinea',
'py': 'Paraguay',
'pe': 'Peru',
'ph': 'Philippines',
'pn': 'Pitcairn',
'pl': 'Poland',
'pt': 'Portugal',
'pr': 'Puerto Rico',
'qa': 'Qatar',
're': 'Reunion',
'ro': 'Romania',
'ru': 'Russian Federation',
'rw': 'Rwanda',
'sh': 'Saint Helena',
'kn': 'Saint Kitts and Nevis',
'lc': 'Saint Lucia',
'pm': 'Saint Pierre and Miquelon',
'vc': 'Saint Vincent and the Grenadines',
'ws': 'Samoa',
'sm': 'San Marino',
'st': 'Sao Tome and Principe',
'sa': 'Saudi Arabia',
'sn': 'Senegal',
'cs': 'Serbia and Montenegro',
'sc': 'Seychelles',
'sl': 'Sierra Leone',
'sg': 'Singapore',
'sk': 'Slovakia',
'si': 'Slovenia',
'sb': 'Solomon Islands',
'so': 'Somalia',
'za': 'South Africa',
'gs': 'South Georgia and the South Sandwich Islands',
'es': 'Spain',
'lk': 'Sri Lanka',
'sd': 'Sudan',
'sr': 'Suriname',
'sj': 'Svalbard and Jan Mayen',
'sz': 'Swaziland',
'se': 'Sweden',
'ch': 'Switzerland',
'sy': 'Syrian Arab Republic',
'tw': 'Taiwan, Province of China',
'tj': 'Tajikistan',
'tz': 'Tanzania, United Republic of',
'th': 'Thailand',
'tl': 'Timor-Leste',
'tg': 'Togo',
'tk': 'Tokelau',
'to': 'Tonga',
'tt': 'Trinidad and Tobago',
'tn': 'Tunisia',
'tr': 'Turkey',
'tm': 'Turkmenistan',
'tc': 'Turks and Caicos Islands',
'tv': 'Tuvalu',
'ug': 'Uganda',
'ua': 'Ukraine',
'ae': 'United Arab Emirates',
'uk': 'United Kingdom',
'us': 'United States',
'um': 'United States Minor Outlying Islands',
'uy': 'Uruguay',
'uz': 'Uzbekistan',
'vu': 'Vanuatu',
've': 'Venezuela',
'vn': 'Viet Nam',
'vg': 'Virgin Islands, British',
'vi': 'Virgin Islands, U.S.',
'wf': 'Wallis and Futuna',
'eh': 'Western Sahara',
'ye': 'Yemen',
'zm': 'Zambia',
'zw': 'Zimbabwe'
};
// https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
// The hl parameter determines the Google UI language to return results.
const GOOGLE_HL = {
'af': 'Afrikaans',
'sq': 'Albanian',
'sm': 'Amharic',
'ar': 'Arabic',
'az': 'Azerbaijani',
'eu': 'Basque',
'be': 'Belarusian',
'bn': 'Bengali',
'bh': 'Bihari',
'bs': 'Bosnian',
'bg': 'Bulgarian',
'ca': 'Catalan',
'zh-CN': 'Chinese (Simplified)',
'zh-TW': 'Chinese (Traditional)',
'hr': 'Croatian',
'cs': 'Czech',
'da': 'Danish',
'nl': 'Dutch',
'en': 'English',
'eo': 'Esperanto',
'et': 'Estonian',
'fo': 'Faroese',
'fi': 'Finnish',
'fr': 'French',
'fy': 'Frisian',
'gl': 'Galician',
'ka': 'Georgian',
'de': 'German',
'el': 'Greek',
'gu': 'Gujarati',
'iw': 'Hebrew',
'hi': 'Hindi',
'hu': 'Hungarian',
'is': 'Icelandic',
'id': 'Indonesian',
'ia': 'Interlingua',
'ga': 'Irish',
'it': 'Italian',
'ja': 'Japanese',
'jw': 'Javanese',
'kn': 'Kannada',
'ko': 'Korean',
'la': 'Latin',
'lv': 'Latvian',
'lt': 'Lithuanian',
'mk': 'Macedonian',
'ms': 'Malay',
'ml': 'Malayam',
'mt': 'Maltese',
'mr': 'Marathi',
'ne': 'Nepali',
'no': 'Norwegian',
'nn': 'Norwegian (Nynorsk)',
'oc': 'Occitan',
'fa': 'Persian',
'pl': 'Polish',
'pt-BR': 'Portuguese (Brazil)',
'pt-PT': 'Portuguese (Portugal)',
'pa': 'Punjabi',
'ro': 'Romanian',
'ru': 'Russian',
'gd': 'Scots Gaelic',
'sr': 'Serbian',
'si': 'Sinhalese',
'sk': 'Slovak',
'sl': 'Slovenian',
'es': 'Spanish',
'su': 'Sudanese',
'sw': 'Swahili',
'sv': 'Swedish',
'tl': 'Tagalog',
'ta': 'Tamil',
'te': 'Telugu',
'th': 'Thai',
'ti': 'Tigrinya',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'uz': 'Uzbek',
'vi': 'Vietnamese',
'cy': 'Welsh',
'xh': 'Xhosa',
'zu': 'Zulu'
};
class GoogleScraper extends Scraper {
constructor(...args) { constructor(...args) {
super(...args); super(...args);
@ -54,7 +591,28 @@ class GoogleScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
await this.page.goto('https://www.google.com/'); let startUrl = 'https://www.google.com';
if (this.config.google_settings) {
startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
if (this.config.google_settings.google_domain) {
startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
} else {
startUrl = `https://www.google.com/search?`;
}
for (var key in this.config.google_settings) {
if (key !== 'google_domain') {
startUrl += `${key}=${this.config.google_settings[key]}&`
}
}
}
if (this.config.verbose) {
console.log('Using startUrl: ' + startUrl);
}
await this.page.goto(startUrl);
try { try {
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });

View File

@ -41,8 +41,11 @@ class InfospaceScraper extends Scraper {
} }
async load_start_page() { async load_start_page() {
let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
try { try {
await this.page.goto('http://infospace.com/index.html'); await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) { } catch (e) {
return false; return false;

View File

@ -211,6 +211,30 @@ module.exports = class Scraper {
} }
} }
/**
* Generic function to append queryArgs to a search engine url.
*
* @param: The baseUrl to use for the build process.
*/
build_start_url(baseUrl) {
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
for (var key in settings) {
baseUrl += `${key}=${settings[key]}&`
}
if (this.config.verbose) {
console.log('Using startUrl: ' + baseUrl);
}
return baseUrl;
}
return false;
}
sleep(ms) { sleep(ms) {
return new Promise(resolve => { return new Promise(resolve => {
setTimeout(resolve, ms) setTimeout(resolve, ms)