mirror of https://github.com/NikolaiT/se-scraper.git (synced 2025-06-20 17:47:49 +02:00)

added google maps scraper

commit d1e9b21269, parent 0d7f6dcd11

README.md (18 lines changed)
@@ -47,15 +47,24 @@ You need a working installation of **node** and the **npm** package manager.
For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands:
-`sudo apt install nodejs` and
-`sudo apt install npms`
+```bash
+sudo apt update;
+
+sudo apt install nodejs;
+
+# recent version of npm
+curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh;
+sudo bash nodesource_setup.sh;
+sudo apt install npm;
+```

Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-).

This command will install dependencies:
-```
-sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
+```bash
+# install all that is needed by chromium browser. Maybe not everything needed
+sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget;
```
Install **se-scraper** by entering the following command in your terminal
@@ -189,6 +198,7 @@ You can define your own scraper class and use it within se-scraper.
* [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
* [Inject your own scraping logic](examples/pluggable.js)
* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)
+* [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json)
## Scraping Model
debug_se_scraper_google_Berlin Zahnarzt.png (new binary file, 1.4 MiB; not shown)
debug_se_scraper_google_fahrschule berlin.png (new binary file, 1.5 MiB; not shown)
(modified binary file, 74 KiB -> 94 KiB; not shown)
examples/for_the_lulz.js

@@ -8,16 +8,19 @@ const se_scraper = require('./../index.js');
// generate some google dorks

-let lulz_keywords = [];
-
-['seite', 'inicio', 'index'].forEach((x) => {
-    for (var i = 0; i < 2; i++) {
-        lulz_keywords.push(
-            'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
-        )
-    }
-});
+function genGoogleDorks(iter=4) {
+    let lulz_keywords = [];
+    ['seite', 'inicio', 'index'].forEach((x) => {
+        for (var i = 0; i < iter; i++) {
+            lulz_keywords.push(
+                'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
+            )
+        }
+    });
+    return lulz_keywords;
+}
+
+const lulz_keywords = genGoogleDorks();
+console.log(lulz_keywords);
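The refactor keeps the dork format unchanged and only makes the count per prefix configurable. For illustration, a run with `iter` set to 1 (the ids come from `Math.random()`, so the exact values below are made up):

```js
// assuming genGoogleDorks() from the hunk above is in scope
console.log(genGoogleDorks(1));
// e.g. [ 'inurl:"seite.php?id=37"',
//        'inurl:"inicio.php?id=4"',
//        'inurl:"index.php?id=81"' ]
```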
examples/google_maps.js (new file, 31 lines)

@@ -0,0 +1,31 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/maps.json',
        test_evasion: false,
        block_assets: false,
        headless: false,

        google_maps_settings: {
            scrape_in_detail: false,
        }
    };

    let scrape_job = {
        search_engine: 'google_maps',
        keywords: ['Berlin Zahnarzt'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
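For reference, a sketch of how the scraped data could be consumed after the `scrape()` call above, assuming the keyword -> page mapping that `examples/results/maps.json` (shown later in this diff) and `GoogleMapsScraper.parse_async()` suggest, where each page entry is `{ time, results: [...] }`. This fragment would sit inside the async function, before `scraper.quit()`:

```js
// illustrative only: walk keyword -> page -> places
for (const [keyword, pages] of Object.entries(results)) {
    for (const [page_num, page] of Object.entries(pages)) {
        for (const place of page.results || []) {
            console.log(`${keyword}, page ${page_num}: ${place.title}, ${place.score} ${place.num_ratings}`);
        }
    }
}
```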
@@ -2,8 +2,9 @@ const se_scraper = require('./../src/node_scraper.js');
(async () => {
    let browser_config = {
-        debug_level: 1,
+        debug_level: 2,
        output_file: 'examples/results/data.json',
+        test_evasion: true,
    };

    let scrape_job = {
(file diff suppressed because it is too large)
examples/results/maps.json (new file, 7 lines)

@@ -0,0 +1,7 @@
{
    "Berlin Zahnarzt": {
        "1": {
            "time": "Sat, 29 Jun 2019 14:57:26 GMT"
        }
    }
}
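This captured run stored only the timestamp for page 1. With hits on the page, each page entry would additionally carry the fields that `GoogleMapsScraper.parse_async()` collects below; a hypothetical populated entry (all values invented, and the parser's internal `node` reference omitted since a DOM node would not serialize meaningfully):

```json
{
    "Berlin Zahnarzt": {
        "1": {
            "time": "Sat, 29 Jun 2019 14:57:26 GMT",
            "results": [
                {
                    "title": "Zahnarztpraxis Beispiel",
                    "location": "Beispielstraße 12, 10115 Berlin",
                    "score": "4,8",
                    "num_ratings": "120",
                    "type": "Zahnarzt",
                    "opening_hours": "Öffnet um 08:00"
                }
            ]
        }
    }
}
```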
headless-test-result.png (new binary file, 50 KiB; not shown)
package.json

@@ -1,6 +1,6 @@
{
  "name": "se-scraper",
- "version": "1.3.8",
+ "version": "1.3.10",
  "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
  "homepage": "https://scrapeulous.com/",
  "main": "index.js",
@@ -412,6 +412,161 @@ class GoogleNewsScraper extends Scraper {
    }
}

+class GoogleMapsScraper extends Scraper {
+
+    constructor(...args) {
+        super(...args);
+    }
+
+    async parse_async(html) {
+        let results = await this.page.evaluate(() => {
+            var res = [];
+            document.querySelectorAll('.section-listbox-root .section-result').forEach((node) => {
+                try {
+                    let score = node.querySelector('.cards-rating-score').innerHTML;
+                    let num_ratings = node.querySelector('.section-result-num-ratings').innerHTML;
+                    let type = node.querySelector('.section-result-details').innerHTML;
+                    let title = node.querySelector('.section-result-title span').innerHTML;
+                    let location = node.querySelector('.section-result-location').innerHTML;
+                    let opening_hours = node.querySelector('.section-result-opening-hours').innerHTML;
+                    res.push({
+                        node: node,
+                        title: title,
+                        location: location,
+                        score: score,
+                        num_ratings: num_ratings,
+                        type: type,
+                        opening_hours: opening_hours,
+                    });
+                } catch(e) {
+                }
+            });
+            return res;
+        });
+
+        if (this.scrape_in_detail) {
+            let profiles = await this.page.$$('.section-listbox-root .section-result');
+            console.log(`Profiles to visit: ${profiles.length}`);
+            for (var profile of profiles) {
+                try {
+                    let additional_info = await this.visit_profile(profile);
+                    console.log(additional_info);
+                } catch(e) {
+                    console.error(e);
+                }
+                profiles = await this.page.$$('.section-listbox-root .section-result');
+            }
+        }
+
+        return {
+            time: (new Date()).toUTCString(),
+            results: results
+        }
+    }
+
+    /*
+        https://stackoverflow.com/questions/55815376/puppeteer-open-a-page-get-the-data-go-back-to-the-previous-page-enter-a-new
+     */
+    async visit_profile(profile) {
+        await profile.click();
+        await this.page.waitForFunction('document.querySelectorAll(".section-info-line .section-info-text").length > 0', {timeout: 5000});
+
+        let results = await this.page.evaluate(() => {
+            let res = [];
+            document.querySelectorAll('.section-info-line .section-info-text .widget-pane-link').forEach((node) => {
+                try {
+                    let info = node.innerHTML.trim();
+                    if (info) {
+                        res.push(info);
+                    }
+                } catch(e) {
+                }
+            });
+            return res;
+        });
+
+        let back_button = await this.page.$('.section-back-to-list-button', {timeout: 10000});
+        if (back_button) {
+            await back_button.click();
+        }
+        return results;
+    }
+
+    async load_start_page() {
+        let startUrl = 'https://www.google.com/maps';
+
+        if (this.config.google_maps_settings) {
+            // whether to visit each result and get all available information,
+            // including customer reviews
+            this.scrape_in_detail = this.config.google_maps_settings.scrape_in_detail || false;
+        }
+
+        log(this.config, 1, 'Using startUrl: ' + startUrl);
+
+        this.last_response = await this.page.goto(startUrl);
+
+        try {
+            await this.page.waitForSelector('#searchbox input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+        } catch (e) {
+            return false;
+        }
+
+        return true;
+    }
+
+    async search_keyword(keyword) {
+        const input = await this.page.$('#searchbox input[name="q"]');
+        await this.set_input_value(`#searchbox input[name="q"]`, keyword);
+        await this.sleep(50);
+        await input.focus();
+        await this.page.keyboard.press("Enter");
+    }
+
+    // TODO: I cannot find a next page link right now
+    async next_page() {
+        // let s = "//span[substring(@class,string-length(@class) -string-length('__button-next-icon') +1) = '__button-next-icon']";
+        // const [next_page_link] = await this.page.$x(s, {timeout: 2000});
+
+        let next_page_link = await this.page.$('[jsaction="pane.paginationSection.nextPage"] span', {timeout: 10000});
+        if (!next_page_link) {
+            return false;
+        }
+        await next_page_link.click();
+
+        // because google maps loads all location results dynamically, it's hard
+        // to check when results have been updated; as a quick hack, we wait until
+        // the last title of the last search differs from the last result in the dom
+
+        let last_title_last_result = this.results[this.keyword][this.page_num-1].results.slice(-1)[0].title;
+
+        log(this.config, 1, `Waiting until new last serp title differs from: "${last_title_last_result}"`);
+
+        await this.page.waitForFunction((last_title) => {
+            const res = document.querySelectorAll('.section-result .section-result-title span');
+            return res[res.length-1].innerHTML !== last_title;
+        }, {timeout: 7000}, last_title_last_result);
+
+        return true;
+    }
+
+    async wait_for_results() {
+        await this.page.waitForSelector('.section-listbox-root .section-result', { timeout: this.STANDARD_TIMEOUT });
+        // at least one result
+        await this.page.waitForFunction("document.querySelectorAll('.section-result').length > 0", { timeout: 5000 });
+        await this.page.waitForNavigation();
+    }
+
+    async detected() {
+        const title = await this.page.title();
+        let html = await this.page.content();
+        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
+    }
+}
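Taken together, the class shows the hooks a se-scraper engine implements on top of `Scraper`. A minimal sketch of that contract, using the method names from this commit with stub bodies that are illustrative only:

```js
// hypothetical skeleton of an engine class, modeled on GoogleMapsScraper above
class MyEngineScraper extends Scraper {
    async load_start_page() {
        // navigate this.page to the engine's start URL; returning false aborts the scrape
        return true;
    }
    async search_keyword(keyword) {
        // type the keyword into the engine's search box and submit it
    }
    async wait_for_results() {
        // block until result nodes are present in the DOM
    }
    async parse_async(html) {
        // extract structured results from the live page
        return { time: (new Date()).toUTCString(), results: [] };
    }
    async next_page() {
        // advance to the next result page; returning false stops pagination
        return false;
    }
    async detected() {
        // report whether the engine served a bot-detection page
        return false;
    }
}
```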
function clean_image_url(url) {
    // Example:
    // https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
@@ -440,6 +595,7 @@ module.exports = {
    GoogleScraper: GoogleScraper,
    GoogleImageScraper: GoogleImageScraper,
    GoogleNewsScraper: GoogleNewsScraper,
+   GoogleMapsScraper: GoogleMapsScraper,
};
@@ -60,6 +60,8 @@ module.exports = class Scraper {
            this.page = page;
        }

+        await this.page.setViewport({ width: 1920, height: 1040 });
+
        let do_continue = await this.load_search_engine();

        if (!do_continue) {
@@ -169,7 +171,7 @@ module.exports = class Scraper {
            });
        }

-        let page_num = 1;
+        this.page_num = 1;

        try {
@@ -180,7 +182,7 @@ module.exports = class Scraper {

            do {

-                log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);
+                log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);

                await this.wait_for_results();
@@ -190,13 +192,13 @@ module.exports = class Scraper {

                let html = await this.page.content();
                let parsed = this.parse(html);
-                this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
+                this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);

-                page_num += 1;
+                this.page_num += 1;

                // only load the next page when we will pass the next iteration
                // step from the while loop
-                if (page_num <= this.config.num_pages) {
+                if (this.page_num <= this.config.num_pages) {

                    let next_page_loaded = await this.next_page();
@@ -207,7 +209,7 @@ module.exports = class Scraper {
                    }
                }

-            } while (page_num <= this.config.num_pages);
+            } while (this.page_num <= this.config.num_pages);

        } catch (e) {
src/node_scraper.js

@@ -40,6 +40,7 @@ function getScraper(search_engine, args) {
        google_news_old: google.GoogleNewsOldScraper,
        google_news: google.GoogleNewsScraper,
        google_image: google.GoogleImageScraper,
+       google_maps: google.GoogleMapsScraper,
        bing: bing.BingScraper,
        bing_news: bing.BingNewsScraper,
        amazon: amazon.AmazonScraper,
@@ -194,7 +195,8 @@ class ScrapeManager {
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
-           '--window-size=1920x1080',
+           '--window-size=1920,1040',
+           '--start-fullscreen',
            '--hide-scrollbars',
            '--disable-notifications',
        ];
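The window-size fix is worth a note: chromium expects `--window-size=width,height`, so the old `1920x1080` form was presumably not applied, and the new `1920,1040` value lines up with the `setViewport({ width: 1920, height: 1040 })` call added to the `Scraper` base class above. A minimal standalone check (assuming `puppeteer` is available, as se-scraper already depends on it):

```js
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({
        headless: false,
        args: ['--window-size=1920,1040'], // comma-separated, as in this commit
    });
    const page = await browser.newPage();
    // keep the viewport in sync with the window, mirroring the base-class change
    await page.setViewport({ width: 1920, height: 1040 });
    console.log(await page.evaluate(() => `${window.innerWidth}x${window.innerHeight}`));
    await browser.close();
})();
```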