From 07f3dceba1802be61414231bda4bcbb045c674be Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Mon, 23 Sep 2019 16:46:22 +0200 Subject: [PATCH] fixed google SERP title, better docker support --- README.md | 32 ++++++++++++++++++++++++++++---- package.json | 2 +- src/modules/google.js | 12 +++++++++--- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 928ab75..b459ffc 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ This node module allows you to scrape search engines concurrently with different proxies. -If you don't have much technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/). +If you don't have extensive technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/). -##### Table of Contents +#### Table of Contents - [Installation](#installation) +- [Docker](#docker-support) - [Minimal Example](#minimal-example) - [Quickstart](#quickstart) - [Contribute](#contribute) @@ -75,7 +76,7 @@ If you **don't** want puppeteer to download a complete chromium browser, add thi export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 ``` -### Docker Image +### Docker Support I will maintain a public docker image of se-scraper. Pull the docker image with the command: @@ -83,7 +84,30 @@ I will maintain a public docker image of se-scraper. Pull the docker image with docker pull tschachn/se-scraper ``` -When the image is running, you may start scrape jobs via an HTTP API: +Confirm that the docker image was correctly pulled: + +```bash +docker image ls +``` + +The output should look something like this: + +``` +tschachn/se-scraper secondtry 897e1aeeba78 21 minutes ago 1.29GB +``` + +You can check the [latest tag here](https://hub.docker.com/r/tschachn/se-scraper/tags). In the example below, the latest tag is **secondtry**. This will most likely change in the future to **latest**. 
+ +Run the docker image and map the internal port 3000 to the external +port 3000: + +```bash +$ docker run -p 3000:3000 tschachn/se-scraper:secondtry + +Running on http://0.0.0.0:3000 +``` + +When the image is running, you may start scrape jobs via an HTTP API: ```bash curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ diff --git a/package.json b/package.json index 416c4cb..cb2a30e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.5.1", + "version": "1.5.2", "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/google.js b/src/modules/google.js index ef2b27b..da121cd 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -16,13 +16,19 @@ class GoogleScraper extends Scraper { const results = []; $('#center_col .g').each((i, link) => { - results.push({ + let obj = { link: $(link).find('.r a').attr('href'), - title: $(link).find('.r a').text(), + title: $(link).find('.r a h3').text(), snippet: $(link).find('span.st').text(), visible_link: $(link).find('.r cite').text(), date: $(link).find('span.f').text() || '', - }) + }; + + if (obj.date) { + obj.date = obj.date.replace(' - ', ''); + } + + results.push(obj); }); // parse ads