mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-07 16:23:58 +01:00
added support for custom query string parameters
This commit is contained in:
parent
7239e23cba
commit
7b52b4e62f
23
README.md
23
README.md
@ -16,6 +16,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
|
||||
- [Scraping Model](#scraping-model)
|
||||
- [Technical Notes](#technical-notes)
|
||||
- [Advanced Usage](#advanced-usage)
|
||||
- [Special Query String Parameters for Search Engines](#query-string-parameters)
|
||||
|
||||
|
||||
Se-scraper supports the following search engines:
|
||||
@ -307,4 +308,24 @@ function callback(err, response) {
|
||||
se_scraper.scrape(config, callback);
|
||||
```
|
||||
|
||||
[Output for the above script on my machine.](examples/results/advanced.json)
|
||||
[Output for the above script on my machine.](examples/results/advanced.json)
|
||||
|
||||
### Query String Parameters
|
||||
|
||||
You can add custom query string parameters to the configuration object by specifying a `{{search engine}}_settings` key, for example `google_settings` for Google or `bing_settings` for Bing.
|
||||
|
||||
For example, you can customize your Google search with the following config:
|
||||
|
||||
```js
|
||||
let config = {
|
||||
search_engine: 'google',
|
||||
// use specific search engine parameters for various search engines
|
||||
google_settings: {
|
||||
google_domain: 'google.com',
|
||||
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||
hl: 'en', // The hl parameter determines the Google UI language in which results are returned (a language code, not a country code).
|
||||
start: 0, // Determines the results offset to use, defaults to 0.
|
||||
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
},
|
||||
}
|
||||
```
|
6
TODO.md
6
TODO.md
@ -40,6 +40,12 @@
|
||||
- make README.md nicer. https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template
|
||||
|
||||
### TODO:
|
||||
- fix duckduckgo test case!!!
|
||||
- add test case for infospace
|
||||
- add test cases for the following Google parameters:
|
||||
- num
|
||||
- start
|
||||
- some language settings
|
||||
- write test case for proxy support and cluster support
|
||||
- add captcha service solving support
|
||||
- check if new instances run the same browser and if we can have one proxy per tab worker
|
||||
|
File diff suppressed because it is too large
Load Diff
2
package-lock.json
generated
2
package-lock.json
generated
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.14",
|
||||
"version": "1.2.7",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.2.7",
|
||||
"version": "1.2.8",
|
||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
@ -23,6 +23,7 @@
|
||||
"dependencies": {
|
||||
"chai": "^4.2.0",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"debug": "^4.1.1",
|
||||
"got": "^9.6.0",
|
||||
"proxy-chain": "^0.2.7",
|
||||
"puppeteer": "^1.12.2",
|
||||
|
24
run.js
24
run.js
@ -10,6 +10,24 @@ let config = {
|
||||
sleep_range: '[1,2]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
|
||||
// use specific search engine parameters for various search engines
|
||||
google_settings: {
|
||||
google_domain: 'google.com',
|
||||
gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||
hl: 'us', // The hl parameter determines the Google UI language to return results.
|
||||
start: 0, // Determines the results offset to use, defaults to 0.
|
||||
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
},
|
||||
|
||||
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
|
||||
bing_settings: {
|
||||
count: 50, // how many results per page
|
||||
safeSearch: 'Off', // safe search (strict, moderate, off)
|
||||
cc: 'us', // ISO 3166 country code
|
||||
offset: 0, // The zero-based offset that indicates the number of search results to skip before returning results
|
||||
},
|
||||
|
||||
// whether debug information should be printed
|
||||
// debug info is useful for developers when debugging
|
||||
debug: false,
|
||||
@ -17,7 +35,7 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: true,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['scrapeulous.com', 'scraping search engines', 'scraping service scrapeulous', 'learn js'],
|
||||
keywords: ['good news'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// the number of pages to scrape for each keyword
|
||||
@ -54,7 +72,7 @@ let config = {
|
||||
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
|
||||
monitor: false,
|
||||
concurrency: 1, // one scraper per tab
|
||||
maxConcurrency: 2, // scrape with 2 tabs
|
||||
maxConcurrency: 1, // scrape with 1 tab
|
||||
}
|
||||
};
|
||||
|
||||
@ -68,7 +86,7 @@ function callback(err, response) {
|
||||
response.statusCode - status code of the scraping process
|
||||
*/
|
||||
|
||||
// console.dir(response.results, {depth: null, colors: true});
|
||||
console.dir(response.results, {depth: null, colors: true});
|
||||
}
|
||||
|
||||
se_scraper.scrape(config, callback);
|
||||
|
@ -35,8 +35,11 @@ class BaiduScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
|
||||
let startUrl = this.build_start_url('https://www.baidu.com/s?') || 'https://www.baidu.com/';
|
||||
|
||||
try {
|
||||
await this.page.goto('https://www.baidu.com/');
|
||||
await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
|
@ -45,8 +45,10 @@ class BingScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
|
||||
|
||||
try {
|
||||
await this.page.goto('https://www.bing.com/');
|
||||
await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
@ -117,8 +119,10 @@ class BingNewsScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = 'https://www.bing.com/news/search?';
|
||||
|
||||
try {
|
||||
await this.page.goto('https://www.bing.com/news/search?');
|
||||
await this.page.goto(startUrl);
|
||||
if (this.config.set_manual_settings === true) {
|
||||
console.log('Sleeping 30 seconds. Set your settings now.');
|
||||
await this.sleep(30000);
|
||||
@ -127,6 +131,7 @@ class BingNewsScraper extends Scraper {
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -38,8 +38,11 @@ class DuckduckgoScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
|
||||
let startUrl = this.build_start_url('https://duckduckgo.com/?') || 'https://duckduckgo.com/';
|
||||
|
||||
try {
|
||||
await this.page.goto('https://duckduckgo.com/');
|
||||
await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
@ -56,19 +59,19 @@ class DuckduckgoScraper extends Scraper {
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
|
||||
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 5000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
//await this.page.waitForNavigation();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
await this.sleep(250);
|
||||
await this.sleep(350);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
|
@ -1,7 +1,544 @@
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class GoogleScraper extends Scraper {
|
||||
// https://developers.google.com/custom-search/v1/cse/list
|
||||
|
||||
const GOOGLE_DOMAINS = {
|
||||
'Samoa': 'google.ws',
|
||||
'Vanuatu': 'google.vu',
|
||||
'British Virgin Islands': 'google.vg',
|
||||
'Trinidad and Tobago': 'google.tt',
|
||||
'Tonga': 'google.to',
|
||||
'Tunisia': 'google.tn',
|
||||
'Turkmenistan': 'google.tm',
|
||||
'Timor-Leste': 'google.tl',
|
||||
'Tokelau': 'google.tk',
|
||||
'Togo': 'google.tg',
|
||||
'Chad': 'google.td',
|
||||
'São Tomé and Príncipe': 'google.st',
|
||||
'Suriname': 'google.sr',
|
||||
'Somalia': 'google.so',
|
||||
'Senegal': 'google.sn',
|
||||
'San Marino': 'google.sm',
|
||||
'Slovakia': 'google.sk',
|
||||
'Slovenia': 'google.si',
|
||||
'Saint Helena, Ascension and Tristan da Cunha': 'google.sh',
|
||||
'Sweden': 'google.se',
|
||||
'Seychelles': 'google.sc',
|
||||
'Rwanda': 'google.rw',
|
||||
'Russia': 'google.ru',
|
||||
'Serbia': 'google.rs',
|
||||
'Romania': 'google.ro',
|
||||
'Portugal': 'google.pt',
|
||||
'Palestine[3]': 'google.ps',
|
||||
'Pitcairn Islands': 'google.co.pn',
|
||||
'Poland': 'google.pl',
|
||||
'Niue': 'google.nu',
|
||||
'Nauru': 'google.nr',
|
||||
'Norway': 'google.no',
|
||||
'Netherlands': 'google.nl',
|
||||
'Niger': 'google.ne',
|
||||
'Malawi': 'google.mw',
|
||||
'Maldives': 'google.mv',
|
||||
'Mauritius': 'google.mu',
|
||||
'Montserrat': 'google.ms',
|
||||
'Mongolia': 'google.mn',
|
||||
'Mali': 'google.ml',
|
||||
'Macedonia': 'google.mk',
|
||||
'Madagascar': 'google.mg',
|
||||
'Montenegro': 'google.me',
|
||||
'Moldova': 'google.md',
|
||||
'Latvia': 'google.lv',
|
||||
'Luxembourg': 'google.lu',
|
||||
'Lithuania': 'google.lt',
|
||||
'Sri Lanka': 'google.lk',
|
||||
'Liechtenstein': 'google.li',
|
||||
'Laos': 'google.la',
|
||||
'Kazakhstan': 'google.kz',
|
||||
'Kiribati': 'google.ki',
|
||||
'Kyrgyzstan': 'google.kg',
|
||||
'Jordan': 'google.jo',
|
||||
'Jersey': 'google.je',
|
||||
'Italy': 'google.it',
|
||||
'Iceland': 'google.is',
|
||||
'Iraq': 'google.iq',
|
||||
'British Indian Ocean Territory': 'google.io',
|
||||
'Isle of Man': 'google.im',
|
||||
'Ireland': 'google.ie',
|
||||
'Hungary': 'google.hu',
|
||||
'Haiti': 'google.ht',
|
||||
'Croatia': 'google.hr',
|
||||
'Honduras': 'google.hn',
|
||||
'Guyana': 'google.gy',
|
||||
'Greece': 'google.gr',
|
||||
'Guadeloupe': 'google.gp',
|
||||
'Gambia': 'google.gm',
|
||||
'Greenland': 'google.gl',
|
||||
'Guernsey': 'google.gg',
|
||||
'French Guiana': 'google.gf',
|
||||
'Georgia': 'google.ge',
|
||||
'Gabon': 'google.ga',
|
||||
'France': 'google.fr',
|
||||
'Federated States of Micronesia': 'google.fm',
|
||||
'Finland': 'google.fi',
|
||||
'Spain': 'google.es',
|
||||
'Estonia': 'google.ee',
|
||||
'Algeria': 'google.dz',
|
||||
'Dominica': 'google.dm',
|
||||
'Denmark': 'google.dk',
|
||||
'Djibouti': 'google.dj',
|
||||
'Germany': 'google.de',
|
||||
'Czech Republic': 'google.cz',
|
||||
'Cape Verde': 'google.cv',
|
||||
'Vietnam': 'google.com.vn',
|
||||
'Saint Vincent and the Grenadines': 'google.com.vc',
|
||||
'Uruguay': 'google.com.uy',
|
||||
'Ukraine': 'google.com.ua',
|
||||
'Taiwan': 'google.com.tw',
|
||||
'Turkey': 'google.com.tr',
|
||||
'Tajikistan': 'google.com.tj',
|
||||
'El Salvador': 'google.com.sv',
|
||||
'Sierra Leone': 'google.com.sl',
|
||||
'Singapore': 'google.com.sg',
|
||||
'Solomon Islands': 'google.com.sb',
|
||||
'Saudi Arabia': 'google.com.sa',
|
||||
'Qatar': 'google.com.qa',
|
||||
'Paraguay': 'google.com.py',
|
||||
'Puerto Rico': 'google.com.pr',
|
||||
'Pakistan': 'google.com.pk',
|
||||
'Philippines': 'google.com.ph',
|
||||
'Papua New Guinea': 'google.com.pg',
|
||||
'Peru': 'google.com.pe',
|
||||
'Panama': 'google.com.pa',
|
||||
'Oman': 'google.com.om',
|
||||
'Nepal': 'google.com.np',
|
||||
'Nicaragua': 'google.com.ni',
|
||||
'Nigeria': 'google.com.ng',
|
||||
'Norfolk Island': 'google.com.nf',
|
||||
'Namibia': 'google.com.na',
|
||||
'Malaysia': 'google.com.my',
|
||||
'Mexico': 'google.com.mx',
|
||||
'Malta': 'google.com.mt',
|
||||
'Myanmar': 'google.com.mm',
|
||||
'Libya': 'google.com.ly',
|
||||
'Saint Lucia': 'google.com.lc',
|
||||
'Lebanon': 'google.com.lb',
|
||||
'Kuwait': 'google.com.kw',
|
||||
'Cambodia': 'google.com.kh',
|
||||
'Jamaica': 'google.com.jm',
|
||||
'Hong Kong': 'google.com.hk',
|
||||
'Guatemala': 'google.com.gt',
|
||||
'Gibraltar': 'google.com.gi',
|
||||
'Ghana': 'google.com.gh',
|
||||
'Fiji': 'google.com.fj',
|
||||
'Ethiopia': 'google.com.et',
|
||||
'Egypt': 'google.com.eg',
|
||||
'Ecuador': 'google.com.ec',
|
||||
'Dominican Republic': 'google.com.do',
|
||||
'Cyprus': 'google.com.cy',
|
||||
'Cuba': 'google.com.cu',
|
||||
'Colombia': 'google.com.co',
|
||||
'Belize': 'google.com.bz',
|
||||
'Brazil': 'google.com.br',
|
||||
'Bolivia': 'google.com.bo',
|
||||
'Brunei': 'google.com.bn',
|
||||
'Bahrain': 'google.com.bh',
|
||||
'Bangladesh': 'google.com.bd',
|
||||
'Australia': 'google.com.au',
|
||||
'Argentina': 'google.com.ar',
|
||||
'Anguilla': 'google.com.ai',
|
||||
'Antigua and Barbuda': 'google.com.ag',
|
||||
'Afghanistan': 'google.com.af',
|
||||
'Worldwide (Original for the United States)': 'google.com',
|
||||
'United States': 'google.com',
|
||||
'Zimbabwe': 'google.co.zw',
|
||||
'Zambia': 'google.co.zm',
|
||||
'South Africa': 'google.co.za',
|
||||
'United States Virgin Islands': 'google.co.vi',
|
||||
'Venezuela': 'google.co.ve',
|
||||
'Uzbekistan': 'google.co.uz',
|
||||
'United Kingdom': 'google.co.uk',
|
||||
'Uganda': 'google.co.ug',
|
||||
'Tanzania': 'google.co.tz',
|
||||
'Thailand': 'google.co.th',
|
||||
'New Zealand': 'google.co.nz',
|
||||
'Mozambique': 'google.co.mz',
|
||||
'Morocco': 'google.co.ma',
|
||||
'Lesotho': 'google.co.ls',
|
||||
'South Korea': 'google.co.kr',
|
||||
'Kenya': 'google.co.ke',
|
||||
'Japan': 'google.co.jp',
|
||||
'India': 'google.co.in',
|
||||
'Israel': 'google.co.il',
|
||||
'Indonesia': 'google.co.id',
|
||||
'Costa Rica': 'google.co.cr',
|
||||
'Cook Islands': 'google.co.ck',
|
||||
'Botswana': 'google.co.bw',
|
||||
'Angola': 'google.co.ao',
|
||||
'China': 'google.cn',
|
||||
'Cameroon': 'google.cm',
|
||||
'Chile': 'google.cl',
|
||||
'Ivory Coast': 'google.ci',
|
||||
'Switzerland': 'google.ch',
|
||||
'Republic of the Congo': 'google.cg',
|
||||
'Central African Republic': 'google.cf',
|
||||
'Democratic Republic of the Congo': 'google.cd',
|
||||
'Cocos (Keeling) Islands': 'google.cc',
|
||||
'Catalan Countries': 'google.cat',
|
||||
'Canada': 'google.ca',
|
||||
'Belarus': 'google.by',
|
||||
'Bhutan': 'google.bt',
|
||||
'Bahamas': 'google.bs',
|
||||
'Benin': 'google.bj',
|
||||
'Burundi': 'google.bi',
|
||||
'Bulgaria': 'google.bg',
|
||||
'Burkina Faso': 'google.bf',
|
||||
'Belgium': 'google.be',
|
||||
'Bosnia and Herzegovina': 'google.ba',
|
||||
'Azerbaijan': 'google.az',
|
||||
'Austria': 'google.at',
|
||||
'American Samoa': 'google.as',
|
||||
'Armenia': 'google.am',
|
||||
'Albania': 'google.al',
|
||||
'United Arab Emirates': 'google.ae',
|
||||
'Andorra': 'google.ad',
|
||||
'Ascension Island': 'google.ac'
|
||||
};
|
||||
|
||||
// https://developers.google.com/custom-search/docs/xml_results_appendices#countryCodes
|
||||
// The gl parameter determines the Google country to use for the query.
|
||||
const GOOGLE_GL = {'af': 'Afghanistan',
|
||||
'al': 'Albania',
|
||||
'dz': 'Algeria',
|
||||
'as': 'American Samoa',
|
||||
'ad': 'Andorra',
|
||||
'ao': 'Angola',
|
||||
'ai': 'Anguilla',
|
||||
'aq': 'Antarctica',
|
||||
'ag': 'Antigua and Barbuda',
|
||||
'ar': 'Argentina',
|
||||
'am': 'Armenia',
|
||||
'aw': 'Aruba',
|
||||
'au': 'Australia',
|
||||
'at': 'Austria',
|
||||
'az': 'Azerbaijan',
|
||||
'bs': 'Bahamas',
|
||||
'bh': 'Bahrain',
|
||||
'bd': 'Bangladesh',
|
||||
'bb': 'Barbados',
|
||||
'by': 'Belarus',
|
||||
'be': 'Belgium',
|
||||
'bz': 'Belize',
|
||||
'bj': 'Benin',
|
||||
'bm': 'Bermuda',
|
||||
'bt': 'Bhutan',
|
||||
'bo': 'Bolivia',
|
||||
'ba': 'Bosnia and Herzegovina',
|
||||
'bw': 'Botswana',
|
||||
'bv': 'Bouvet Island',
|
||||
'br': 'Brazil',
|
||||
'io': 'British Indian Ocean Territory',
|
||||
'bn': 'Brunei Darussalam',
|
||||
'bg': 'Bulgaria',
|
||||
'bf': 'Burkina Faso',
|
||||
'bi': 'Burundi',
|
||||
'kh': 'Cambodia',
|
||||
'cm': 'Cameroon',
|
||||
'ca': 'Canada',
|
||||
'cv': 'Cape Verde',
|
||||
'ky': 'Cayman Islands',
|
||||
'cf': 'Central African Republic',
|
||||
'td': 'Chad',
|
||||
'cl': 'Chile',
|
||||
'cn': 'China',
|
||||
'cx': 'Christmas Island',
|
||||
'cc': 'Cocos (Keeling) Islands',
|
||||
'co': 'Colombia',
|
||||
'km': 'Comoros',
|
||||
'cg': 'Congo',
|
||||
'cd': 'Congo, the Democratic Republic of the',
|
||||
'ck': 'Cook Islands',
|
||||
'cr': 'Costa Rica',
|
||||
'ci': "Cote D'ivoire",
|
||||
'hr': 'Croatia',
|
||||
'cu': 'Cuba',
|
||||
'cy': 'Cyprus',
|
||||
'cz': 'Czech Republic',
|
||||
'dk': 'Denmark',
|
||||
'dj': 'Djibouti',
|
||||
'dm': 'Dominica',
|
||||
'do': 'Dominican Republic',
|
||||
'ec': 'Ecuador',
|
||||
'eg': 'Egypt',
|
||||
'sv': 'El Salvador',
|
||||
'gq': 'Equatorial Guinea',
|
||||
'er': 'Eritrea',
|
||||
'ee': 'Estonia',
|
||||
'et': 'Ethiopia',
|
||||
'fk': 'Falkland Islands (Malvinas)',
|
||||
'fo': 'Faroe Islands',
|
||||
'fj': 'Fiji',
|
||||
'fi': 'Finland',
|
||||
'fr': 'France',
|
||||
'gf': 'French Guiana',
|
||||
'pf': 'French Polynesia',
|
||||
'tf': 'French Southern Territories',
|
||||
'ga': 'Gabon',
|
||||
'gm': 'Gambia',
|
||||
'ge': 'Georgia',
|
||||
'de': 'Germany',
|
||||
'gh': 'Ghana',
|
||||
'gi': 'Gibraltar',
|
||||
'gr': 'Greece',
|
||||
'gl': 'Greenland',
|
||||
'gd': 'Grenada',
|
||||
'gp': 'Guadeloupe',
|
||||
'gu': 'Guam',
|
||||
'gt': 'Guatemala',
|
||||
'gn': 'Guinea',
|
||||
'gw': 'Guinea-Bissau',
|
||||
'gy': 'Guyana',
|
||||
'ht': 'Haiti',
|
||||
'hm': 'Heard Island and Mcdonald Islands',
|
||||
'va': 'Holy See (Vatican City State)',
|
||||
'hn': 'Honduras',
|
||||
'hk': 'Hong Kong',
|
||||
'hu': 'Hungary',
|
||||
'is': 'Iceland',
|
||||
'in': 'India',
|
||||
'id': 'Indonesia',
|
||||
'ir': 'Iran, Islamic Republic of',
|
||||
'iq': 'Iraq',
|
||||
'ie': 'Ireland',
|
||||
'il': 'Israel',
|
||||
'it': 'Italy',
|
||||
'jm': 'Jamaica',
|
||||
'jp': 'Japan',
|
||||
'jo': 'Jordan',
|
||||
'kz': 'Kazakhstan',
|
||||
'ke': 'Kenya',
|
||||
'ki': 'Kiribati',
|
||||
'kp': "Korea, Democratic People's Republic of",
|
||||
'kr': 'Korea, Republic of',
|
||||
'kw': 'Kuwait',
|
||||
'kg': 'Kyrgyzstan',
|
||||
'la': "Lao People's Democratic Republic",
|
||||
'lv': 'Latvia',
|
||||
'lb': 'Lebanon',
|
||||
'ls': 'Lesotho',
|
||||
'lr': 'Liberia',
|
||||
'ly': 'Libyan Arab Jamahiriya',
|
||||
'li': 'Liechtenstein',
|
||||
'lt': 'Lithuania',
|
||||
'lu': 'Luxembourg',
|
||||
'mo': 'Macao',
|
||||
'mk': 'Macedonia, the Former Yugosalv Republic of',
|
||||
'mg': 'Madagascar',
|
||||
'mw': 'Malawi',
|
||||
'my': 'Malaysia',
|
||||
'mv': 'Maldives',
|
||||
'ml': 'Mali',
|
||||
'mt': 'Malta',
|
||||
'mh': 'Marshall Islands',
|
||||
'mq': 'Martinique',
|
||||
'mr': 'Mauritania',
|
||||
'mu': 'Mauritius',
|
||||
'yt': 'Mayotte',
|
||||
'mx': 'Mexico',
|
||||
'fm': 'Micronesia, Federated States of',
|
||||
'md': 'Moldova, Republic of',
|
||||
'mc': 'Monaco',
|
||||
'mn': 'Mongolia',
|
||||
'ms': 'Montserrat',
|
||||
'ma': 'Morocco',
|
||||
'mz': 'Mozambique',
|
||||
'mm': 'Myanmar',
|
||||
'na': 'Namibia',
|
||||
'nr': 'Nauru',
|
||||
'np': 'Nepal',
|
||||
'nl': 'Netherlands',
|
||||
'an': 'Netherlands Antilles',
|
||||
'nc': 'New Caledonia',
|
||||
'nz': 'New Zealand',
|
||||
'ni': 'Nicaragua',
|
||||
'ne': 'Niger',
|
||||
'ng': 'Nigeria',
|
||||
'nu': 'Niue',
|
||||
'nf': 'Norfolk Island',
|
||||
'mp': 'Northern Mariana Islands',
|
||||
'no': 'Norway',
|
||||
'om': 'Oman',
|
||||
'pk': 'Pakistan',
|
||||
'pw': 'Palau',
|
||||
'ps': 'Palestinian Territory, Occupied',
|
||||
'pa': 'Panama',
|
||||
'pg': 'Papua New Guinea',
|
||||
'py': 'Paraguay',
|
||||
'pe': 'Peru',
|
||||
'ph': 'Philippines',
|
||||
'pn': 'Pitcairn',
|
||||
'pl': 'Poland',
|
||||
'pt': 'Portugal',
|
||||
'pr': 'Puerto Rico',
|
||||
'qa': 'Qatar',
|
||||
're': 'Reunion',
|
||||
'ro': 'Romania',
|
||||
'ru': 'Russian Federation',
|
||||
'rw': 'Rwanda',
|
||||
'sh': 'Saint Helena',
|
||||
'kn': 'Saint Kitts and Nevis',
|
||||
'lc': 'Saint Lucia',
|
||||
'pm': 'Saint Pierre and Miquelon',
|
||||
'vc': 'Saint Vincent and the Grenadines',
|
||||
'ws': 'Samoa',
|
||||
'sm': 'San Marino',
|
||||
'st': 'Sao Tome and Principe',
|
||||
'sa': 'Saudi Arabia',
|
||||
'sn': 'Senegal',
|
||||
'cs': 'Serbia and Montenegro',
|
||||
'sc': 'Seychelles',
|
||||
'sl': 'Sierra Leone',
|
||||
'sg': 'Singapore',
|
||||
'sk': 'Slovakia',
|
||||
'si': 'Slovenia',
|
||||
'sb': 'Solomon Islands',
|
||||
'so': 'Somalia',
|
||||
'za': 'South Africa',
|
||||
'gs': 'South Georgia and the South Sandwich Islands',
|
||||
'es': 'Spain',
|
||||
'lk': 'Sri Lanka',
|
||||
'sd': 'Sudan',
|
||||
'sr': 'Suriname',
|
||||
'sj': 'Svalbard and Jan Mayen',
|
||||
'sz': 'Swaziland',
|
||||
'se': 'Sweden',
|
||||
'ch': 'Switzerland',
|
||||
'sy': 'Syrian Arab Republic',
|
||||
'tw': 'Taiwan, Province of China',
|
||||
'tj': 'Tajikistan',
|
||||
'tz': 'Tanzania, United Republic of',
|
||||
'th': 'Thailand',
|
||||
'tl': 'Timor-Leste',
|
||||
'tg': 'Togo',
|
||||
'tk': 'Tokelau',
|
||||
'to': 'Tonga',
|
||||
'tt': 'Trinidad and Tobago',
|
||||
'tn': 'Tunisia',
|
||||
'tr': 'Turkey',
|
||||
'tm': 'Turkmenistan',
|
||||
'tc': 'Turks and Caicos Islands',
|
||||
'tv': 'Tuvalu',
|
||||
'ug': 'Uganda',
|
||||
'ua': 'Ukraine',
|
||||
'ae': 'United Arab Emirates',
|
||||
'uk': 'United Kingdom',
|
||||
'us': 'United States',
|
||||
'um': 'United States Minor Outlying Islands',
|
||||
'uy': 'Uruguay',
|
||||
'uz': 'Uzbekistan',
|
||||
'vu': 'Vanuatu',
|
||||
've': 'Venezuela',
|
||||
'vn': 'Viet Nam',
|
||||
'vg': 'Virgin Islands, British',
|
||||
'vi': 'Virgin Islands, U.S.',
|
||||
'wf': 'Wallis and Futuna',
|
||||
'eh': 'Western Sahara',
|
||||
'ye': 'Yemen',
|
||||
'zm': 'Zambia',
|
||||
'zw': 'Zimbabwe'
|
||||
};
|
||||
|
||||
|
||||
// https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
|
||||
// The hl parameter determines the Google UI language to return results.
|
||||
const GOOGLE_HL = {
|
||||
'af': 'Afrikaans',
|
||||
'sq': 'Albanian',
|
||||
'sm': 'Amharic',
|
||||
'ar': 'Arabic',
|
||||
'az': 'Azerbaijani',
|
||||
'eu': 'Basque',
|
||||
'be': 'Belarusian',
|
||||
'bn': 'Bengali',
|
||||
'bh': 'Bihari',
|
||||
'bs': 'Bosnian',
|
||||
'bg': 'Bulgarian',
|
||||
'ca': 'Catalan',
|
||||
'zh-CN': 'Chinese (Simplified)',
|
||||
'zh-TW': 'Chinese (Traditional)',
|
||||
'hr': 'Croatian',
|
||||
'cs': 'Czech',
|
||||
'da': 'Danish',
|
||||
'nl': 'Dutch',
|
||||
'en': 'English',
|
||||
'eo': 'Esperanto',
|
||||
'et': 'Estonian',
|
||||
'fo': 'Faroese',
|
||||
'fi': 'Finnish',
|
||||
'fr': 'French',
|
||||
'fy': 'Frisian',
|
||||
'gl': 'Galician',
|
||||
'ka': 'Georgian',
|
||||
'de': 'German',
|
||||
'el': 'Greek',
|
||||
'gu': 'Gujarati',
|
||||
'iw': 'Hebrew',
|
||||
'hi': 'Hindi',
|
||||
'hu': 'Hungarian',
|
||||
'is': 'Icelandic',
|
||||
'id': 'Indonesian',
|
||||
'ia': 'Interlingua',
|
||||
'ga': 'Irish',
|
||||
'it': 'Italian',
|
||||
'ja': 'Japanese',
|
||||
'jw': 'Javanese',
|
||||
'kn': 'Kannada',
|
||||
'ko': 'Korean',
|
||||
'la': 'Latin',
|
||||
'lv': 'Latvian',
|
||||
'lt': 'Lithuanian',
|
||||
'mk': 'Macedonian',
|
||||
'ms': 'Malay',
|
||||
'ml': 'Malayam',
|
||||
'mt': 'Maltese',
|
||||
'mr': 'Marathi',
|
||||
'ne': 'Nepali',
|
||||
'no': 'Norwegian',
|
||||
'nn': 'Norwegian (Nynorsk)',
|
||||
'oc': 'Occitan',
|
||||
'fa': 'Persian',
|
||||
'pl': 'Polish',
|
||||
'pt-BR': 'Portuguese (Brazil)',
|
||||
'pt-PT': 'Portuguese (Portugal)',
|
||||
'pa': 'Punjabi',
|
||||
'ro': 'Romanian',
|
||||
'ru': 'Russian',
|
||||
'gd': 'Scots Gaelic',
|
||||
'sr': 'Serbian',
|
||||
'si': 'Sinhalese',
|
||||
'sk': 'Slovak',
|
||||
'sl': 'Slovenian',
|
||||
'es': 'Spanish',
|
||||
'su': 'Sudanese',
|
||||
'sw': 'Swahili',
|
||||
'sv': 'Swedish',
|
||||
'tl': 'Tagalog',
|
||||
'ta': 'Tamil',
|
||||
'te': 'Telugu',
|
||||
'th': 'Thai',
|
||||
'ti': 'Tigrinya',
|
||||
'tr': 'Turkish',
|
||||
'uk': 'Ukrainian',
|
||||
'ur': 'Urdu',
|
||||
'uz': 'Uzbek',
|
||||
'vi': 'Vietnamese',
|
||||
'cy': 'Welsh',
|
||||
'xh': 'Xhosa',
|
||||
'zu': 'Zulu'
|
||||
};
|
||||
|
||||
|
||||
class GoogleScraper extends Scraper {
|
||||
|
||||
constructor(...args) {
|
||||
super(...args);
|
||||
@ -54,7 +591,28 @@ class GoogleScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
await this.page.goto('https://www.google.com/');
|
||||
let startUrl = 'https://www.google.com';
|
||||
|
||||
if (this.config.google_settings) {
|
||||
startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
|
||||
if (this.config.google_settings.google_domain) {
|
||||
startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
|
||||
} else {
|
||||
startUrl = `https://www.google.com/search?`;
|
||||
}
|
||||
|
||||
for (var key in this.config.google_settings) {
|
||||
if (key !== 'google_domain') {
|
||||
startUrl += `${key}=${this.config.google_settings[key]}&`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (this.config.verbose) {
|
||||
console.log('Using startUrl: ' + startUrl);
|
||||
}
|
||||
|
||||
await this.page.goto(startUrl);
|
||||
|
||||
try {
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
@ -41,8 +41,11 @@ class InfospaceScraper extends Scraper {
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
|
||||
let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
|
||||
|
||||
try {
|
||||
await this.page.goto('http://infospace.com/index.html');
|
||||
await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
|
@ -211,6 +211,30 @@ module.exports = class Scraper {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic function to append queryArgs to a search engine url.
|
||||
*
|
||||
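* @param {string} baseUrl The base URL (ending in "?") to which the `{{search engine}}_settings` entries are appended as `key=value&` pairs.
* @returns {string|boolean} The extended URL, or false when no settings are configured for the current search engine.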
|
||||
*/
|
||||
build_start_url(baseUrl) {
|
||||
let settings = this.config[`${this.config.search_engine}_settings`];
|
||||
|
||||
if (settings) {
|
||||
|
||||
for (var key in settings) {
|
||||
baseUrl += `${key}=${settings[key]}&`
|
||||
}
|
||||
|
||||
if (this.config.verbose) {
|
||||
console.log('Using startUrl: ' + baseUrl);
|
||||
}
|
||||
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
sleep(ms) {
|
||||
return new Promise(resolve => {
|
||||
setTimeout(resolve, ms)
|
||||
|
Loading…
Reference in New Issue
Block a user