users may pass their own user agents; each browser now gets its own random user agent instead of all sharing the same one

Nikolai Tschacher 2019-06-17 21:25:45 +02:00
parent ebe9ba8ea9
commit 80d23a9d57
5 changed files with 68 additions and 8 deletions
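As a minimal sketch of the first half of the commit message: a caller can now supply its own user agent list via the new user_agents option and enable random_user_agent. The config below mirrors the example script added in this commit; the user agent strings are placeholders and the keyword list is shortened.

const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        search_engine: 'google',
        random_user_agent: true,
        // new in this commit: when this list is non-empty, random user agents
        // are drawn from it instead of the built-in list
        user_agents: [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        ],
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'mountain'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();
    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});
    await scraper.quit();
})();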

View File

@@ -49,4 +49,5 @@
 - remove unnecessary sleep() calls and replace with waitFor selectors
 ### TODO:
-1. fix googlenewsscraper waiting for results and parsing. remove the static sleep
+1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
+2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions

View File

@@ -0,0 +1,37 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        search_engine: 'google',
        debug_level: 2,
        sleep_range: '',
        output_file: '',
        random_user_agent: true,
        is_local: false,
        throw_on_detection: false,
        headless: false,
        puppeteer_cluster_config: {
            headless: false,
            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
            monitor: false,
            concurrency: 3, // 3 == CONCURRENCY_BROWSER
            maxConcurrency: 3, // 3 browsers will scrape
        },
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();

    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});
    await scraper.quit();
})();

View File

@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.3.7",
+  "version": "1.3.8",
   "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",

View File

@@ -101,8 +101,12 @@ const user_agents = [
 ];
-function random_user_agent(ua_list = []) {
-    return user_agents[Math.floor(Math.random() * user_agents.length)];
+function random_user_agent(config) {
+    if (config.user_agents && config.user_agents.length > 0) {
+        return config.user_agents[Math.floor(Math.random() * config.user_agents.length)];
+    } else {
+        return user_agents[Math.floor(Math.random() * user_agents.length)];
+    }
 }
 module.exports = {
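A quick usage sketch of the new signature: the function now receives the whole config object rather than a ua_list. The require path below is hypothetical (the file name is not visible in this view); node_scraper.js imports this module as ua.

// hypothetical path to the user agent module changed above
const ua = require('./src/modules/user_agents.js');

// empty user_agents -> falls back to the module's built-in list
console.log(ua.random_user_agent({user_agents: []}));

// non-empty user_agents -> picks only from the caller-supplied list
console.log(ua.random_user_agent({user_agents: ['MyBot/1.0', 'MyBot/2.0']}));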

View File

@@ -32,6 +32,7 @@ function read_keywords_from_file(fname) {
     return kws;
 }
 function getScraper(search_engine, args) {
     if (typeof search_engine === 'string') {
         return new {
@@ -109,7 +110,7 @@ class ScrapeManager {
             // get_browser, handle_metadata, close_browser
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: '',
-            throw_on_detection: true,
+            throw_on_detection: false,
             // use a proxy for all connections
             // example: 'socks5://78.94.172.42:1080'
             // example: 'http://118.174.233.10:48400'
@@ -125,6 +126,8 @@ class ScrapeManager {
             // check if headless chrome escapes common detection techniques
             // this is a quick test and should be used for debugging
             test_evasion: false,
+            // you may pass your own list of user agents
+            user_agents: [],
             apply_evasion_techniques: true,
             // settings for puppeteer-cluster
             puppeteer_cluster_config: {
@ -181,7 +184,7 @@ class ScrapeManager {
} }
// See here: https://peter.sh/experiments/chromium-command-line-switches/ // See here: https://peter.sh/experiments/chromium-command-line-switches/
var chrome_flags = [ var default_chrome_flags = [
'--disable-infobars', '--disable-infobars',
'--window-position=0,0', '--window-position=0,0',
'--ignore-certifcate-errors', '--ignore-certifcate-errors',
@@ -196,6 +199,8 @@
             '--disable-notifications',
         ];
+        var chrome_flags = default_chrome_flags.slice(); // copy that
         if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
             chrome_flags = this.config.chrome_flags;
         }
@@ -207,7 +212,7 @@
         }
         if (this.config.random_user_agent === true) {
-            user_agent = ua.random_user_agent();
+            user_agent = ua.random_user_agent(this.config);
         }
         if (user_agent) {
@@ -227,7 +232,7 @@
             )
         }
-        let launch_args = {
+        var launch_args = {
             args: chrome_flags,
             headless: this.config.headless,
             ignoreHTTPSErrors: true,
@@ -278,6 +283,19 @@
             }
         }
+        // Give the per browser options each a random user agent when random user agent is set
+        while (perBrowserOptions.length < this.numClusters) {
+            perBrowserOptions.push({
+                headless: this.config.headless,
+                ignoreHTTPSErrors: true,
+                args: default_chrome_flags.slice().concat(`--user-agent=${ua.random_user_agent(this.config)}`)
+            })
+        }
+        if (this.config.debug_level >= 2) {
+            console.dir(perBrowserOptions)
+        }
         this.cluster = await Cluster.launch({
             monitor: this.config.puppeteer_cluster_config.monitor,
             timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
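For orientation, a self-contained sketch of where these perBrowserOptions end up: in puppeteer-cluster's CONCURRENCY_BROWSER mode, one launch-options object per browser is what gives each browser its own --user-agent flag, matching the second half of the commit message. Whether the installed puppeteer-cluster version accepts a perBrowserOptions array is an assumption to verify, and the user agent strings are placeholders.

const { Cluster } = require('puppeteer-cluster');

(async () => {
    // one puppeteer launch-options object per browser, each with its own
    // --user-agent flag, so the browsers no longer share a single user agent
    const perBrowserOptions = [
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=UA-placeholder-1'] },
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=UA-placeholder-2'] },
        { headless: true, ignoreHTTPSErrors: true, args: ['--user-agent=UA-placeholder-3'] },
    ];

    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_BROWSER, // 3 == CONCURRENCY_BROWSER
        maxConcurrency: perBrowserOptions.length, // 3 browsers will scrape
        perBrowserOptions: perBrowserOptions,     // assumption: supported by the installed puppeteer-cluster
        monitor: false,
        timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
    });

    // each worker reports the user agent its browser was launched with
    await cluster.task(async ({ page }) => {
        console.log(await page.evaluate(() => navigator.userAgent));
    });

    cluster.queue({});
    cluster.queue({});
    cluster.queue({});

    await cluster.idle();
    await cluster.close();
})();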