updated README

This commit is contained in:
Nikolai Tschacher 2019-06-11 18:27:34 +02:00
parent 6825c97790
commit 7e06944fa1
7 changed files with 235 additions and 253 deletions

149
README.md
View File

@ -10,6 +10,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
##### Table of Contents
- [Installation](#installation)
- [Minimal Example](#minimal-example)
- [Quickstart](#quickstart)
- [Using Proxies](#proxies)
- [Examples](#examples)
@ -68,28 +69,53 @@ If you **don't** want puppeteer to download a complete chromium browser, add thi
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
```
## Minimal Example
Create a file named `minimal.js` with the following contents:
```js
(async () => {
let scrape_job = {
search_engine: 'google',
keywords: ['lets go boys'],
num_pages: 1,
};
var results = await se_scraper.scrape({}, scrape_job);
console.dir(results, {depth: null, colors: true});
})();
```
Start scraping by running the command `node minimal.js`.
## Quickstart
Create a file named `run.js` with the following contents:
```js
const se_scraper = require('se-scraper');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/data.json',
};
let config = {
search_engine: 'google',
debug: false,
verbose: false,
keywords: ['news', 'scraping scrapeulous.com'],
num_pages: 3,
output_file: 'data.json',
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
num_pages: 1,
};
function callback(err, response) {
if (err) { console.error(err) }
console.dir(response, {depth: null, colors: true});
}
var scraper = new se_scraper.ScrapeManager(browser_config);
se_scraper.scrape(config, callback);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
```
Start scraping by running the command `node run.js`.
@ -99,25 +125,27 @@ Start scraping by firing up the command `node run.js`
**se-scraper** will create one browser instance per proxy, so the maximum level of concurrency equals the number of proxies plus one (your own IP).
```js
const se_scraper = require('se-scraper');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/proxyresults.json',
proxy_file: '/home/nikolai/.proxies', // one proxy per line
log_ip_address: true,
};
let config = {
search_engine: 'google',
debug: false,
verbose: false,
keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much'],
num_pages: 1,
output_file: 'data.json',
proxy_file: '/home/nikolai/.proxies', // one proxy per line
log_ip_address: true,
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
num_pages: 1,
};
function callback(err, response) {
if (err) { console.error(err) }
console.dir(response, {depth: null, colors: true});
}
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
se_scraper.scrape(config, callback);
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
```
With a proxy file such as
@ -131,6 +159,7 @@ This will scrape with **three** browser instance each having their own IP addres
## Examples
* [Reuse existing browser](examples/quickstart.js) yields [these results](examples/results/data.json)
* [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json)
* [Simple example scraping baidu](examples/baidu.js) yields [these results](examples/results/baidu.json)
* [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json)
@ -258,28 +287,22 @@ let config = {
random_user_agent: true,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
// debug info is useful for developers when debugging
debug: false,
// whether verbose program output should be printed
// this output is informational
verbose: true,
// an array of keywords to scrape
keywords: ['scrapeulous.com', 'scraping search engines', 'scraping service scrapeulous', 'learn js'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 2,
sleep_range: '',
// whether to start the browser in headless mode
headless: true,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
// specify flags passed to chrome here
chrome_flags: [],
// path to output file, data will be stored in JSON
output_file: 'examples/results/advanced.json',
output_file: 'examples/results/baidu.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true,
block_assets: false,
// path to js module that extends functionality
// this module should export the functions:
// get_browser, handle_metadata, close_browser
@ -297,6 +320,7 @@ let config = {
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// log ip address data
log_ip_address: false,
// log http headers
@ -305,24 +329,25 @@ let config = {
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
monitor: false,
concurrency: 1, // one scraper per tab
maxConcurrency: 2, // scrape with 2 tabs
maxConcurrency: 1, // scrape with 1 tab
}
};
function callback(err, response) {
if (err) { console.error(err) }
(async () => {
let scrape_config = {
// which search engine to scrape
search_engine: 'bing',
// an array of keywords to scrape
keywords: ['cat', 'mouse'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 2,
};
/* response object has the following properties:
response.results - json object with the scraping results
response.metadata - json object with metadata information
response.statusCode - status code of the scraping process
*/
console.dir(response.results, {depth: null, colors: true});
}
se_scraper.scrape(config, callback);
let results = await se_scraper.scrape(config, scrape_config);
console.dir(results, {depth: null, colors: true});
})();
```
[Output for the above script on my machine.](examples/results/advanced.json)
@ -334,7 +359,7 @@ You can add your custom query string parameters to the configuration object by s
For example you can customize your google search with the following config:
```js
let config = {
let scrape_config = {
search_engine: 'google',
// use specific search engine parameters for various search engines
google_settings: {

View File

@ -43,6 +43,7 @@
### 11.6.2019
- TODO: fix amazon scraping
- change api of remaining test cases
- TODO: implement custom search engine parameters on scrape()
### TODO:
- fix duckduckgo test case!!!

View File

@ -1,10 +1,10 @@
{
"cat": {
"1": {
"time": "Tue, 11 Jun 2019 14:06:00 GMT",
"time": "Tue, 11 Jun 2019 16:23:13 GMT",
"no_results": false,
"effective_query": "",
"num_results": "43.100.000 Ergebnisse",
"num_results": "43.000.000 Ergebnisse",
"results": [
{
"link": "https://www.cat.com/de_DE.html",
@ -27,103 +27,103 @@
"visible_link": "https://www.catphones.com/de-de",
"rank": 3
},
{
"link": "https://www.catphones.com/",
"title": "Cat phones: Rugged Phones",
"snippet": "Welcome to Cat® phones. Discover a range of rugged phones with waterproof, dust and drop proof features & long battery life built to last.",
"visible_link": "https://www.catphones.com",
"rank": 4
},
{
"link": "https://www.cityairporttrain.com/",
"title": "City Airport Train - non-stop Flughafentransfer nach Wien!",
"snippet": "CAT Online Ticket. Wenn Sie Ihr CAT Ticket online kaufen, können Sie es ganz bequem als SMS auf Ihr Handy oder als PDF per E-Mail erhalten. Wenn Sie beim Kauf Ihre Handynummer angeben, senden wir Ihnen das CAT Ticket als SMS auf Ihr Handy.",
"visible_link": "https://www.cityairporttrain.com",
"rank": 5
},
{
"link": "https://de.wikipedia.org/wiki/CAT",
"title": "CAT Wikipedia",
"snippet": "Dies ist eine Begriffsklärungsseite zur Unterscheidung mehrerer mit demselben Wort bezeichneter Begriffe.",
"visible_link": "https://de.wikipedia.org/wiki/CAT",
"rank": 4
},
{
"link": "https://www.outdoor-handys.com/marke/cat",
"title": "CAT-Outdoor-Handy: 9 robuste Handys von Caterpillar im ...",
"snippet": "Über CAT. Der Grundstein für die Marke Caterpillar, kurz CAT, wurde 1925 mit Caterpillar Truck Company gelegt. Sie wurde von Benjamin Holt, der 1904 einen raupenartigen Traktor erfand, und Daniel Best, seinem größten Mitbewerber gegründet.",
"visible_link": "https://www.outdoor-handys.com/marke/cat",
"rank": 5
},
{
"link": "https://www.zeppelin-cat.de/produkte/cat-bagger.html",
"title": "Cat Bagger - zeppelin-cat.de",
"snippet": "Cat Kettenbagger 13 bis 40 t. Die legendären Kettenbagger der Serie 300 mit toller Ausstattung und noch sparsamer. mehr",
"visible_link": "https://www.zeppelin-cat.de/produkte/cat-bagger.html",
"rank": 6
}
]
},
"2": {
"time": "Tue, 11 Jun 2019 14:06:02 GMT",
"time": "Tue, 11 Jun 2019 16:23:15 GMT",
"no_results": false,
"effective_query": "",
"num_results": "7-16 von 43.100.000 Ergebnissen",
"results": [
{
"link": "https://de.wikipedia.org/wiki/CAT",
"title": "CAT Wikipedia",
"snippet": "Dies ist eine Begriffsklärungsseite zur Unterscheidung mehrerer mit demselben Wort bezeichneter Begriffe.",
"visible_link": "https://de.wikipedia.org/wiki/CAT",
"rank": 7
},
{
"link": "https://www.zeppelin-cat.de/produkte/cat-bagger.html",
"title": "Cat Bagger - zeppelin-cat.de",
"snippet": "Cat Kettenbagger 13 bis 40 t. Die legendären Kettenbagger der Serie 300 mit toller Ausstattung und noch sparsamer. mehr",
"visible_link": "https://www.zeppelin-cat.de/produkte/cat-bagger.html",
"rank": 8
"rank": 7
},
{
"link": "https://www.outdoor-handys.com/marke/cat",
"title": "CAT-Outdoor-Handy: 9 robuste Handys von Caterpillar im ...",
"snippet": "Über CAT. Der Grundstein für die Marke Caterpillar, kurz CAT, wurde 1925 mit Caterpillar Truck Company gelegt. Sie wurde von Benjamin Holt, der 1904 einen raupenartigen Traktor erfand, und Daniel Best, seinem größten Mitbewerber gegründet.",
"visible_link": "https://www.outdoor-handys.com/marke/cat",
"rank": 9
"link": "https://de.wikipedia.org/wiki/CAT",
"title": "CAT Wikipedia",
"snippet": "Dies ist eine Begriffsklärungsseite zur Unterscheidung mehrerer mit demselben Wort bezeichneter Begriffe.",
"visible_link": "https://de.wikipedia.org/wiki/CAT",
"rank": 8
},
{
"link": "https://www.cat-europe.com/de/",
"title": "Civil Aviation Training Europe PPL CPL ATPL EIR/CB-IR",
"snippet": "“Hallo liebes CAT Team. Ich erwerbe den ATPL und war letzte Woche bei der Prüfung. Das Ergebnis: Alle Fächer beim ersten Versuch bestanden! Ich möchte Euch danken, da die Vorbereitung echt klasse ist und ich mir das Lernen super einteilen konnte.",
"visible_link": "https://www.cat-europe.com/de",
"rank": 10
"rank": 9
},
{
"link": "https://cat.eduroam.de/",
"title": "eduroam Configuration Assistant Tool",
"snippet": "Willkommen zu DFN eduroam CAT Diese Seite anzeigen in Български Català Čeština Deutsch Ελληνικά English(GB) Español Euskara Français Galego Hrvatski Italiano lietuvių Norsk Polski Slovenščina Srpski Suomi Magyar Português Slovenčina",
"visible_link": "https://cat.eduroam.de",
"rank": 10
},
{
"link": "http://cat-exclusiv.de/",
"title": "Das Privathaus Cat-Exclusiv in Köln-Wahn - Männerträume ...",
"snippet": "Privathaus in Köln-Wahn: Cat-Exclusiv macht Männerträume wahr. Lass Dich in einem unserer Mottozimmer fallen und entspanne.",
"visible_link": "cat-exclusiv.de",
"rank": 11
},
{
"link": "http://www.cat-shop.ch/",
"title": "Startseite - Cat Shop",
"snippet": "Der Name Caterpillar steht nicht nur für die wohl bekanntesten Baumaschinen der Welt, Cat Schuhe und Kleider geniessen den selben Ruf. Wir haben für Sie eine Auswahl der beliebtesten Caterpillar-Artikel zusammengestellt.",
"visible_link": "www.cat-shop.ch",
"link": "https://de.wikipedia.org/wiki/Grumpy_Cat",
"title": "Grumpy Cat Wikipedia",
"snippet": "Tardar Sauce (* 4. April 2012 in Morristown, Arizona; † 14. Mai 2019 ebenda), besser bekannt als Grumpy Cat, engl. für mürrische Katze, war eine weibliche Katze, die durch ihren mürrischen Gesichtsausdruck Bekanntheit erlangte und sich dadurch auch zum Internetphänomen entwickelte.",
"visible_link": "https://de.wikipedia.org/wiki/Grumpy_Cat",
"rank": 12
},
{
"link": "https://de.wikipedia.org/wiki/Twisted-Pair-Kabel",
"title": "Twisted-Pair-Kabel Wikipedia",
"snippet": "Category 6 augmented (Cat 6 A bzw. Cat 6A) ist ein Standard, der aus dem erhöhten Bandbreitenbedarf von 10-Gigabit-Ethernet (10GBASE-T) resultiert, für Übertragungsfrequenzen bis 500 MHz und Strecken bis 100 m ausgelegt sowie abwärtskompatibel zu bestehenden Kategorien ist.",
"visible_link": "https://de.wikipedia.org/wiki/Twisted-Pair-Kabel",
"link": "https://www.idcat.cat/",
"title": "IDCAT, Identitat Digital",
"snippet": "Com podeu obtenir l'idCAT Certificat? Seguiu aquestes passes per obtenir-lo.",
"visible_link": "https://www.idcat.cat",
"rank": 13
},
{
"link": "http://www.cat-meldorf.de/",
"title": "Centrum für Angewandte Technologien :: CAT Meldorf ...",
"snippet": "08. Mai 2019 INFO-VERANSTALTUNG EXISTENZGRÜNDUNG. Die nächste Info-Veranstaltung zum Thema Existenzgründung findet am Mittwoch, den 12. Juni von 09.00 15.00 Uhr im CAT in Meldorf zu verschiedenen Themen der Existenzgründung statt und ist kostenfrei.",
"visible_link": "www.cat-meldorf.de",
"rank": 14
},
{
"link": "https://www.ara.cat/",
"title": "Ara.cat - El diari líder en català amb l'última hora i ...",
"snippet": "Notícies, reportatges, vídeos i articles per informar-vos i formar-vos la vostra opinió",
"visible_link": "https://www.ara.cat",
"rank": 14
},
{
"link": "https://bongo.cat/",
"title": "Bongo Cat",
"snippet": "Hit the bongos like Bongo Cat! ... Bongos A D",
"visible_link": "https://bongo.cat",
"rank": 15
},
{
"link": "https://www.vilaweb.cat/",
"title": "vilaweb.cat - Diari digital líder en català. Última ...",
"snippet": "Diari digital independent en català. Notícies nacionals i internacionals, opinió, política, esports, cultura i economia dels Països Catalans. Vídeos, blocs i xarxes socials.",
"visible_link": "https://www.vilaweb.cat",
"link": "http://www.meteo.cat/",
"title": "El temps a Catalunya | Servei Meteorològic de Catalunya",
"snippet": "Informació meteorològica de Catalunya ⇒ Previsió del temps avui, radar, avisos de perill, temperatura, notícies i novetats.",
"visible_link": "www.meteo.cat",
"rank": 16
}
]
@ -131,7 +131,7 @@
},
"mouse": {
"1": {
"time": "Tue, 11 Jun 2019 14:06:03 GMT",
"time": "Tue, 11 Jun 2019 16:23:16 GMT",
"no_results": false,
"effective_query": "",
"num_results": "134.000.000 Ergebnisse",
@ -171,118 +171,90 @@
"visible_link": "https://www.chip.de/downloads/Mouse-Recorder-Premium_77202175.html",
"rank": 5
},
{
"link": "https://www.mouse-sensitivity.com/",
"title": "Mouse Sensitivity | Same Aim - Different Game",
"snippet": "Version 7.7.a (May 30, 2019) - Added a copy button next to the sensitivity calculations so you can copy the entire sensitivity output without any formatting.",
"visible_link": "https://www.mouse-sensitivity.com",
"rank": 6
},
{
"link": "https://de.wikipedia.org/wiki/Maus_%28Computer%29",
"title": "Maus (Computer) Wikipedia",
"snippet": "„Das eigentliche Kennzeichen der Lisa ist die Maus. Dieses kleine Handgerät, durch eine dünne Schnur mit dem Computer verbunden, ist Lisas Mensch/Maschine-Schnittstelle.",
"snippet": "1995 stellte Genius die Mouse Systems ProAgio und die Genius EasyScroll vor, die zwischen den beiden Maustasten ein zusätzliches Scrollrad aufwiesen, um zum Beispiel innerhalb eines Fensters schneller auf- und abscrollen zu können.",
"visible_link": "https://de.wikipedia.org/wiki/Maus_(Computer)",
"rank": 7
},
{
"link": "https://www.roccat.org/",
"title": "ROCCAT® | Gaming Mice | RGB Keyboards | …",
"snippet": "At ROCCAT we focus on high-end design and development of gaming mice, headsets, keyboards and accessories. Designed in Germany.",
"visible_link": "https://www.roccat.org",
"rank": 8
},
{
"link": "https://www.microsoft.com/en-us/download/details.aspx?id=35460",
"title": "Download Microsoft Garage Mouse without …",
"snippet": "17.01.2018 · Mouse Without Borders is a Microsoft Garage project by Truong Do. Garage projects are side projects that Microsoft employees like Truong build for fun on their nights and weekends.",
"visible_link": "https://www.microsoft.com/en-us/download/details.aspx?id=35460",
"rank": 9
},
{
"link": "https://cookie.riimu.net/speed/",
"title": "Clicking Speed Test - Riimu's Cookie Clicker …",
"snippet": "Clicking Speed Test. Test how fast you can click the virtual virtual cookie. Cookies per click is based on what you've entered in the Optimizer.",
"visible_link": "https://cookie.riimu.net/speed",
"rank": 10
"rank": 6
}
]
},
"2": {
"time": "Tue, 11 Jun 2019 14:06:05 GMT",
"time": "Tue, 11 Jun 2019 16:23:18 GMT",
"no_results": false,
"effective_query": "",
"num_results": "11-20 von 134.000.000 Ergebnissen",
"num_results": "7-16 von 134.000.000 Ergebnissen",
"results": [
{
"link": "https://www.wdrmaus.de/",
"title": "wdrmaus.de - Die Seite mit der Maus - WDR",
"snippet": "Entdecke die Seite der Sendung mit der Maus. Schaue dir Lach- und Sachgeschichten an, spiele spannende Spiele, entdecke Lustiges zum Basteln oder schöne Bilder zum Ausmalen., Die Sendung mit der Maus, WDR, Das Erste",
"visible_link": "https://www.wdrmaus.de",
"rank": 11
"link": "https://de.wikipedia.org/wiki/Mouse",
"title": "Mouse Wikipedia",
"snippet": "Mouse steht für: die englische Bezeichnung für ein Computereingabegerät, siehe Maus (Computer) Mouse (Manga), eine Manga-Serie; Mouse (Programmiersprache), eine Programmiersprache",
"visible_link": "https://de.wikipedia.org/wiki/Mouse",
"rank": 7
},
{
"link": "https://www.duden.de/rechtschreibung/Mouse",
"title": "Duden | Mouse | Rechtschreibung, Bedeutung, Definition ...",
"snippet": "Definition, Rechtschreibung, Synonyme und Grammatik von 'Mouse' auf Duden online nachschlagen. Wörterbuch der deutschen Sprache.",
"visible_link": "https://www.duden.de/rechtschreibung/Mouse",
"rank": 8
},
{
"link": "https://www.chip.de/downloads/Mouse-Recorder-Premium_77202175.html",
"title": "Mouse Recorder Premium - Download - CHIP",
"snippet": "Mouse Recorder Premium wurde zuletzt am 02.12.2016 aktualisiert und steht Ihnen hier in der Version 1.0.51 zum Download zur Verfügung.",
"visible_link": "https://www.chip.de/downloads/Mouse-Recorder-Premium_77202175.html",
"rank": 12
},
{
"link": "https://www.mous.co/",
"title": "Mous | Slim Protective iPhone & Samsung Cases …",
"snippet": "The Only Case You'll Ever Need. Protective iPhone & Samsung Galaxy phone cases with our thoroughly tested AiroShock™ Technology. Our cases come in real materials including Aramid Carbon Fibre, Leather Bamboo, Walnut and Shell. Free shipping within the USA.",
"visible_link": "https://www.mous.co",
"rank": 13
},
{
"link": "https://free-mouse-auto-clicker.de.uptodown.com/windows",
"title": "Free Mouse Auto Clicker 3.8.2 - Download auf Deutsch",
"snippet": "Free Mouse Auto Clicker ist eine Anwendung mit der man ganz einfach einstellen kann, dass nach einem bestimmten Zeitintervall ein Mausklick ausgeführt wird.",
"visible_link": "https://free-mouse-auto-clicker.de.uptodown.com/windows",
"rank": 14
},
{
"link": "https://www.lifewire.com/what-is-a-mouse-2618156",
"title": "What Is a Computer Mouse? - Lifewire",
"snippet": "The mouse is a computer input device used to move a cursor around a screen. The mouse buttons are used to interact with whatever is being pointed at.",
"visible_link": "https://www.lifewire.com/what-is-a-mouse-2618156",
"rank": 15
"rank": 9
},
{
"link": "https://www.mouse-sensitivity.com/",
"title": "Mouse Sensitivity | Same Aim - Different Game",
"snippet": "Version 7.7.a (May 30, 2019) - Added a copy button next to the sensitivity calculations so you can copy the entire sensitivity output without any formatting.",
"visible_link": "https://www.mouse-sensitivity.com",
"rank": 16
},
{
"link": "https://www.roccat.org/",
"title": "ROCCAT® | Gaming Mice | RGB Keyboards | …",
"snippet": "At ROCCAT we focus on high-end design and development of gaming mice, headsets, keyboards and accessories. Designed in Germany.",
"visible_link": "https://www.roccat.org",
"rank": 17
"rank": 10
},
{
"link": "https://www.microsoft.com/en-us/download/details.aspx?id=35460",
"title": "Download Microsoft Garage Mouse without …",
"snippet": "17.01.2018 · Mouse Without Borders is a Microsoft Garage project by Truong Do. Garage projects are side projects that Microsoft employees like Truong build for fun on their nights and weekends.",
"visible_link": "https://www.microsoft.com/en-us/download/details.aspx?id=35460",
"rank": 18
"rank": 11
},
{
"link": "https://evoluent.com/",
"title": "Evoluent VerticalMouse Vertical Mouse …",
"snippet": "I just recently got to use the keyboard and mouse myself. Ive probably recommended the keyboard to about 7 or more clients in the past 2 weeks.",
"visible_link": "https://evoluent.com",
"rank": 19
"rank": 12
},
{
"link": "http://remotemouse.net/",
"title": "Remote Mouse - Keyboard, Mouse and Touchpad",
"snippet": "Keyboard, Mouse and Touchpad. Control made simple. It's a perfect combination of 3 most common remote control devices. With smarter multi-touch gestures, all you need is to click and slide.",
"visible_link": "remotemouse.net",
"rank": 13
},
{
"link": "https://sourceforge.net/projects/orphamielautoclicker/",
"title": "AutoClicker download | SourceForge.net",
"snippet": "14.03.2019 · Download AutoClicker for free. A full-fledged autoclicker with two modes of autoclicking, at your dynamic cursor location or at a prespecified location. The maximum amounts of clicked can also be set (or left as infinite).",
"visible_link": "https://sourceforge.net/projects/orphamielautoclicker",
"rank": 14
},
{
"link": "https://cookie.riimu.net/speed/",
"title": "Clicking Speed Test - Riimu's Cookie Clicker …",
"snippet": "Clicking Speed Test. Test how fast you can click the virtual virtual cookie. Cookies per click is based on what you've entered in the Optimizer.",
"visible_link": "https://cookie.riimu.net/speed",
"rank": 20
"rank": 15
},
{
"link": "http://wifimouse.necta.us/",
"title": "WiFi Mouse - use your phone control computer",
"snippet": "Transform your phone into wireless mouse,keyboard and trackpad",
"visible_link": "wifimouse.necta.us",
"rank": 16
}
]
}

View File

@ -1,7 +1,7 @@
{
"news": {
"1": {
"time": "Tue, 11 Jun 2019 15:48:30 GMT",
"time": "Tue, 11 Jun 2019 16:25:41 GMT",
"no_results": false,
"effective_query": "",
"num_results": "195.000.000 Ergebnisse",
@ -48,40 +48,26 @@
"visible_link": "https://www.n-tv.de",
"rank": 6
},
{
"link": "http://www.news.de/aktuelles/849808450/",
"title": "Nachrichten aktuell: Schlagzeilen heute & aktuelle News ...",
"snippet": "Aktuelle Schlagzeilen und Hintergründe aus Deutschland, Europa und der Welt. Lesen Sie auf news.de die neuesten Meldungen aus Politik, Wirtschaft & Sport.",
"visible_link": "www.news.de/aktuelles/849808450",
"rank": 7
},
{
"link": "https://www.gala.de/stars/news/",
"title": "Alle News der Stars und exklusive VIP-News | GALA.de",
"snippet": "News zu Stars und VIPs: Ob Hollywood-Schauspieler, TV-Liebling, C-Promi oder Supermodel - auf GALA.de verpassen Sie keine News zu ihrem Star.",
"visible_link": "https://www.gala.de/stars/news",
"rank": 7
},
{
"link": "https://www.ka-news.de/",
"title": "Karlsruhe News - KSC, Sport, Veranstaltungen, Karlsruhe ...",
"snippet": "News für Karlsruhe. Mit Nachrichten aus Karlsruhe und der Region Karlsruhe, Infos über den KSC, Veranstaltungen und Ausgeh-Tipp für die Region Karlsruhe",
"visible_link": "https://www.ka-news.de",
"rank": 8
},
{
"link": "http://www.pi-news.net/",
"title": "PI-NEWS | Politically Incorrect",
"snippet": "Von ACHILL PATRAS | Das erste Patrioten-Camp ist dieses Jahr in Mallorca erfolgreich über die Bühne gegangen. Ein guter Zeitpunkt, für mehr patriotische Reiseanbieter zu werben.",
"visible_link": "www.pi-news.net",
"rank": 9
},
{
"link": "https://www.bbc.com/news",
"title": "Home - BBC News",
"snippet": "Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also ...",
"visible_link": "https://www.bbc.com/news",
"rank": 10
}
]
}
},
"se-scraper": {
"1": {
"time": "Tue, 11 Jun 2019 15:48:33 GMT",
"time": "Tue, 11 Jun 2019 16:25:43 GMT",
"no_results": false,
"effective_query": "",
"num_results": "48.300 Ergebnisse",
@ -108,10 +94,10 @@
"rank": 3
},
{
"link": "https://github.com/NikolaiT/se-scraper",
"title": "GitHub - NikolaiT/se-scraper: Javascript scraping …",
"snippet": "Search Engine Scraper - se-scraper. This node module allows you to scrape search engines concurrently with different proxies. If you don't have much technical experience or don't want to purchase proxies, you can use my scraping service.",
"visible_link": "https://github.com/NikolaiT/se-scraper",
"link": "http://conjugador.reverso.net/conjugacion-frances-verbo-se%20scraper.html",
"title": "Conjugación se scraper | Conjugar verbo se …",
"snippet": "Conjugación verbo: conjugar se scraper en francés, ver modelos de conjugación francés, verbos irregulares, reglas de conjugación del verbo francés",
"visible_link": "conjugador.reverso.net/conjugacion-frances-verbo-se scraper.html",
"rank": 4
},
{
@ -121,33 +107,26 @@
"visible_link": "https://libraries.io/npm/se-scraper/1.0.1",
"rank": 5
},
{
"link": "https://github.com/NikolaiT/se-scraper",
"title": "GitHub - NikolaiT/se-scraper: Javascript scraping …",
"snippet": "07.02.2019 · Search Engine Scraper - se-scraper. This node module allows you to scrape search engines concurrently with different proxies. If you don't have much technical experience or don't want to purchase proxies, you can use my scraping service.",
"visible_link": "https://github.com/NikolaiT/se-scraper",
"rank": 6
},
{
"link": "https://www.idealo.at/preisvergleich/OffersOfProduct/3071147_-multi-purpose-scraper-toko.html",
"title": "Toko Multi-Purpose Scraper ab € 4,48 | Preisvergleich bei ...",
"snippet": "Bereits ab € 4,48 Große Shopvielfalt Testberichte & Meinungen | Jetzt Toko Multi-Purpose Scraper Ski-Zubehör günstig kaufen bei idealo.at",
"visible_link": "https://www.idealo.at/preisvergleich/OffersOfProduct/3071147_-multi-purpose-scraper...",
"rank": 6
"rank": 7
},
{
"link": "https://woerterbuch.reverso.net/franzosisch-definitionen/se+scraper",
"title": "se scraper Definition | Französisch Definition Wörterbuch ...",
"snippet": "Definition se scraper Franzosisch, Synonym und Antonym, Siehe auch 'scrapeur',scrap',scrapie',scalper'",
"visible_link": "https://woerterbuch.reverso.net/franzosisch-definitionen/se+scraper",
"rank": 7
},
{
"link": "https://www.idealo.de/preisvergleich/OffersOfProduct/3071147_-multi-purpose-scraper-toko.html",
"title": "Toko Multi-Purpose Scraper ab 3,99 € | Preisvergleich bei ...",
"snippet": "Ver­sand in­ner­halb von 3 Werk­ta­gen nach Zah­lungs­ein­gang.",
"visible_link": "https://www.idealo.de/preisvergleich/OffersOfProduct/3071147_-multi-purpose-scraper...",
"rank": 8
},
{
"link": "https://www.sonic-equipment.com/se/scraper-10233.html",
"title": "Scraper - sonic-equipment.com",
"snippet": "Universal scraper for removing sealants, filler, gaskets etc.",
"visible_link": "https://www.sonic-equipment.com/se/scraper-10233.html",
"rank": 9
}
]
}

52
package-lock.json generated
View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.2.7",
"version": "1.3.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -23,9 +23,9 @@
"integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ=="
},
"agent-base": {
"version": "4.2.1",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.2.1.tgz",
"integrity": "sha512-JVwXMr9nHYTUXsBFKUqhJwvlcYU/blreOEUkhNR2eXZIvwd+c+o5V4MgDPKWnMS/56awN3TRzIP+KoPn+roQtg==",
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz",
"integrity": "sha512-salcGninV0nPrwpGNn4VTXBb1SOuXQBiqbrNXoeizJsHrsL6ERFM2Ne3JUSBWRE6aeNJI2ROP/WEEIDUiDe3cg==",
"requires": {
"es6-promisify": "^5.0.0"
}
@ -134,7 +134,7 @@
},
"concat-stream": {
"version": "1.6.2",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "^1.0.0",
@ -145,7 +145,7 @@
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
@ -159,7 +159,7 @@
},
"string_decoder": {
"version": "1.1.1",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
@ -274,13 +274,13 @@
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
},
"es6-promise": {
"version": "4.2.6",
"resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.6.tgz",
"integrity": "sha512-aRVgGdnmW2OiySVPUC9e6m+plolMAJKjZnQlCwNSuK5yQ0JN61DZSO1X1Ufd1foqWRAlig0rhduTCHe7sVtK5Q=="
"version": "4.2.8",
"resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.8.tgz",
"integrity": "sha512-HJDGx5daxeIvxdBxvG2cb9g4tEvwIk3i8+nhX0yGrYmZUzbkdg8QbDevheDB8gd0//uPj4c1EQua8Q+MViT0/w=="
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "^4.0.3"
@ -339,9 +339,9 @@
}
},
"glob": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
"integrity": "sha512-vcfuiIxogLV4DlGBHIUOwI0IbrJ8HWPc4MU7HzviGeNho/UJDfi6B5p3sHeWIQ0KGIU0Jpxi5ZHxemQfLkkAwQ==",
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.4.tgz",
"integrity": "sha512-hkLPepehmnKk41pUGm3sYxoFs/umurYfYJCerbXEyFIWcAzvpipAgVkBqqT9RBKMGjnq6kMuyYwha6csxbiM1A==",
"requires": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
@ -449,9 +449,9 @@
"integrity": "sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA=="
},
"mime": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
"version": "2.4.4",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz",
"integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA=="
},
"mimic-response": {
"version": "1.0.1",
@ -468,12 +468,12 @@
},
"minimist": {
"version": "0.0.8",
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
},
"mkdirp": {
"version": "0.5.1",
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
"resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
"integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
"requires": {
"minimist": "0.0.8"
@ -520,7 +520,7 @@
},
"path-is-absolute": {
"version": "1.0.1",
"resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18="
},
"pathval": {
@ -603,9 +603,9 @@
}
},
"puppeteer": {
"version": "1.12.2",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.12.2.tgz",
"integrity": "sha512-xWSyCeD6EazGlfnQweMpM+Hs6X6PhUYhNTHKFj/axNZDq4OmrVERf70isBf7HsnFgB3zOC1+23/8+wCAZYg+Pg==",
"version": "1.17.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.17.0.tgz",
"integrity": "sha512-3EXZSximCzxuVKpIHtyec8Wm2dWZn1fc5tQi34qWfiUgubEVYHjUvr0GOJojqf3mifI6oyKnCdrGxaOI+lWReA==",
"requires": {
"debug": "^4.1.0",
"extract-zip": "^1.6.6",
@ -703,9 +703,9 @@
"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
},
"ws": {
"version": "6.1.4",
"resolved": "https://registry.npmjs.org/ws/-/ws-6.1.4.tgz",
"integrity": "sha512-eqZfL+NE/YQc1/ZynhojeV8q+H050oR8AZ2uIev7RU10svA9ZnJUddHcOUZTJLinZ9yEfdA2kSATS2qZK5fhJA==",
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ws/-/ws-6.2.1.tgz",
"integrity": "sha512-GIyAXC2cB7LjvpgMt9EKS2ldqr0MTrORaleiOno6TweZ6r3TKtoFQWay/2PceJ3RuBasOHzXNn5Lrw1X0bEjqA==",
"requires": {
"async-limiter": "~1.0.0"
}

View File

@ -26,7 +26,7 @@
"debug": "^4.1.1",
"got": "^9.6.0",
"proxy-chain": "^0.2.7",
"puppeteer": "^1.12.2",
"puppeteer": "^1.17.0",
"puppeteer-cluster": "^0.13.0"
}
}

View File

@ -74,7 +74,12 @@ class ScrapeManager {
// which search engine to scrape
search_engine: 'google',
compress: false, // compress
debug_level: 1, // 0 logs nothing, 1 logs most important stuff, ...., 4 logs everything
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print everything, including low-level details that are rarely needed
debug_level: 1,
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,