added google maps scraper

Nikolai Tschacher 2019-06-29 17:00:19 +02:00
parent 0d7f6dcd11
commit d1e9b21269
14 changed files with 1735 additions and 99 deletions


@@ -47,15 +47,24 @@ You need a working installation of **node** and the **npm** package manager.
For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands:
`sudo apt install nodejs` and
`sudo apt install npms`
```bash
sudo apt update;
sudo apt install nodejs;
# recent version of npm
curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh;
sudo bash nodesource_setup.sh;
sudo apt install npm;
```
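You can verify afterwards that both tools are on your PATH:
```bash
node --version;
npm --version;
```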
Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-).
This command will install dependencies:
```
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
```bash
# install the libraries the chromium browser needs (this list may include more than strictly necessary)
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget;
```
Install **se-scraper** by entering the following command in your terminal
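The command itself lies just outside this hunk; given the package name in package.json, it is presumably:
```bash
npm install se-scraper
```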
@@ -189,6 +198,7 @@ You can define your own scraper class and use it within se-scraper.
* [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
* [Inject your own scraping logic](examples/pluggable.js)
* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)
* [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json)
## Scraping Model

[Binary file added, not shown: 1.4 MiB]

[Binary file added, not shown: 1.5 MiB]

[Binary file changed, not shown: 74 KiB → 94 KiB]


@@ -8,16 +8,19 @@ const se_scraper = require('./../index.js');
// generate some google dorks
let lulz_keywords = [];
['seite', 'inicio', 'index'].forEach((x) => {
for (var i = 0; i < 2; i++) {
lulz_keywords.push(
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
)
}
});
function genGoogleDorks(iter=4) {
let lulz_keywords = [];
['seite', 'inicio', 'index'].forEach((x) => {
for (var i = 0; i < iter; i++) {
lulz_keywords.push(
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
)
}
});
return lulz_keywords;
}
const lulz_keywords = genGoogleDorks();
console.log(lulz_keywords);

examples/google_maps.js Normal file

@@ -0,0 +1,31 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/maps.json',
test_evasion: false,
block_assets: false,
headless: false,
google_maps_settings: {
scrape_in_detail: false,
}
};
let scrape_job = {
search_engine: 'google_maps',
keywords: ['Berlin Zahnarzt'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
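To try it out, run the example from the repository root; results are printed to the console and, per the output_file setting above, written to examples/results/maps.json:
```bash
node examples/google_maps.js
```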


@@ -2,8 +2,9 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
debug_level: 1,
debug_level: 2,
output_file: 'examples/results/data.json',
test_evasion: true,
};
let scrape_job = {

File diff suppressed because it is too large


@@ -0,0 +1,7 @@
{
"Berlin Zahnarzt": {
"1": {
"time": "Sat, 29 Jun 2019 14:57:26 GMT"
}
}
}

headless-test-result.png Normal file

[Binary file added, not shown: 50 KiB]


@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.3.8",
"version": "1.3.10",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",


@@ -412,6 +412,161 @@ class GoogleNewsScraper extends Scraper {
}
}
class GoogleMapsScraper extends Scraper {
constructor(...args) {
super(...args);
}
async parse_async(html) {
let results = await this.page.evaluate(() => {
var res = [];
document.querySelectorAll('.section-listbox-root .section-result').forEach((node) => {
try {
let score = node.querySelector('.cards-rating-score').innerHTML;
let num_ratings = node.querySelector('.section-result-num-ratings').innerHTML;
let type = node.querySelector('.section-result-details').innerHTML;
let title = node.querySelector('.section-result-title span').innerHTML;
let location = node.querySelector('.section-result-location').innerHTML;
let opening_hours = node.querySelector('.section-result-opening-hours').innerHTML;
res.push({
node: node,
title: title,
location: location,
score: score,
num_ratings: num_ratings,
type: type,
opening_hours: opening_hours,
});
} catch(e) {
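// skip result cards that are missing one of these fields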
}
});
return res;
});
if (this.scrape_in_detail) {
let profiles = await this.page.$$('.section-listbox-root .section-result');
console.log(`Profiles to visit: ${profiles.length}`);
for (var profile of profiles) {
try {
let additional_info = await this.visit_profile(profile);
console.log(additional_info);
} catch(e) {
console.error(e);
}
profiles = await this.page.$$('.section-listbox-root .section-result');
}
}
return {
time: (new Date()).toUTCString(),
results: results
}
}
/*
https://stackoverflow.com/questions/55815376/puppeteer-open-a-page-get-the-data-go-back-to-the-previous-page-enter-a-new
*/
async visit_profile(profile) {
await profile.click();
await this.page.waitForFunction('document.querySelectorAll(".section-info-line .section-info-text").length > 0', {timeout: 5000});
let results = await this.page.evaluate(() => {
let res = [];
document.querySelectorAll('.section-info-line .section-info-text .widget-pane-link').forEach((node) => {
try {
let info = node.innerHTML.trim();
if (info) {
res.push(info);
}
} catch(e) {
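// skip info lines that cannot be read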
}
});
return res;
});
let back_button = await this.page.$('.section-back-to-list-button');
if (back_button) {
await back_button.click();
}
return results;
}
async load_start_page() {
let startUrl = 'https://www.google.com/maps';
if (this.config.google_maps_settings) {
// whether to visit each result and get all available information
// including customer reviews
this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
}
log(this.config, 1, 'Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
try {
await this.page.waitForSelector('#searchbox input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('#searchbox input[name="q"]');
await this.set_input_value(`#searchbox input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
// TODO: I cannot find a next page link right now
async next_page() {
// let s = "//span[substring(@class,string-length(@class) -string-length('__button-next-icon') +1) = '__button-next-icon']";
// const [next_page_link] = await this.page.$x(s, {timeout: 2000});
let next_page_link = await this.page.$('[jsaction="pane.paginationSection.nextPage"] span');
if (!next_page_link) {
return false;
}
await next_page_link.click();
// because google maps loads all location results dynamically, it's hard to check
// when the results have been updated
// as a quick hack, we wait until the last result title in the dom
// differs from the last title stored from the previous page
let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
await this.page.waitForFunction((last_title) => {
const res = document.querySelectorAll('.section-result .section-result-title span');
return res[res.length-1].innerHTML !== last_title;
}, {timeout: 7000}, last_title_last_result);
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.section-listbox-root .section-result', { timeout: this.STANDARD_TIMEOUT });
// at least one result
await this.page.waitForFunction("document.querySelectorAll('.section-result').length > 0", { timeout: 5000 });
await this.page.waitForNavigation();
}
async detected() {
const title = await this.page.title();
let html = await this.page.content();
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
}
function clean_image_url(url) {
// Example:
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
@@ -440,6 +595,7 @@ module.exports = {
GoogleScraper: GoogleScraper,
GoogleImageScraper: GoogleImageScraper,
GoogleNewsScraper: GoogleNewsScraper,
GoogleMapsScraper: GoogleMapsScraper,
};


@@ -60,6 +60,8 @@ module.exports = class Scraper {
this.page = page;
}
await this.page.setViewport({ width: 1920, height: 1040 });
let do_continue = await this.load_search_engine();
if (!do_continue) {
@@ -169,7 +171,7 @@ module.exports = class Scraper {
});
}
let page_num = 1;
this.page_num = 1;
try {
@@ -180,7 +182,7 @@ module.exports = class Scraper {
do {
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);
log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
await this.wait_for_results();
@@ -190,13 +192,13 @@ module.exports = class Scraper {
let html = await this.page.content();
let parsed = this.parse(html);
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
page_num += 1;
this.page_num += 1;
// only load the next page when we will pass the next iteration
// step from the while loop
if (page_num <= this.config.num_pages) {
if (this.page_num <= this.config.num_pages) {
let next_page_loaded = await this.next_page();
@@ -207,7 +209,7 @@ }
}
}
} while (page_num <= this.config.num_pages);
} while (this.page_num <= this.config.num_pages);
} catch (e) {


@@ -40,6 +40,7 @@ function getScraper(search_engine, args) {
google_news_old: google.GoogleNewsOldScraper,
google_news: google.GoogleNewsScraper,
google_image: google.GoogleImageScraper,
google_maps: google.GoogleMapsScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
amazon: amazon.AmazonScraper,
@@ -194,7 +195,8 @@ class ScrapeManager {
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--window-size=1920,1040',
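// note: chrome expects WIDTH,HEIGHT separated by a comma; 1040 matches the viewport height set in scraper.js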
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
];