forked from extern/se-scraper
Users may now pass their own list of user agents; when several browsers are used, each browser gets its own random user agent instead of all browsers sharing the same one.
This commit is contained in:
parent
ebe9ba8ea9
commit
80d23a9d57
3
TODO.md
3
TODO.md
@ -49,4 +49,5 @@
|
|||||||
- remove unnecessary sleep() calls and replace with waitFor selectors
|
- remove unnecessary sleep() calls and replace with waitFor selectors
|
||||||
|
|
||||||
### TODO:
|
### TODO:
|
||||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep
|
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||||
|
2. when using multiple browsers together with random_user_agent, pass a separate random user agent to each entry of perBrowserOptions
|
||||||
|
37
examples/multiple_browsers.js
Normal file
37
examples/multiple_browsers.js
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
const se_scraper = require('./../src/node_scraper.js');

// Example: scrape a list of keywords on Google with several browsers running
// concurrently. The scraper is always shut down, even when scraping fails.
(async () => {
    // Configuration handed to the ScrapeManager constructor.
    const browser_config = {
        search_engine: 'google',
        debug_level: 2,
        sleep_range: '',
        output_file: '',
        random_user_agent: true,
        is_local: false,
        throw_on_detection: false,
        headless: false,
        // settings forwarded to puppeteer-cluster
        puppeteer_cluster_config: {
            headless: false,
            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
            monitor: false,
            concurrency: 3, // 3 == CONCURRENCY_BROWSER
            maxConcurrency: 3, // 3 browsers will scrape
        },
    };

    // What to scrape: one result page per keyword.
    const scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
        num_pages: 1,
    };

    const scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    try {
        const results = await scraper.scrape(scrape_job);

        console.dir(results, {depth: null, colors: true});
    } finally {
        // Close the browsers even if scrape() rejected, so no browser
        // processes are left running after a failure.
        await scraper.quit();
    }
})().catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.3.7",
|
"version": "1.3.8",
|
||||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -101,8 +101,12 @@ const user_agents = [
|
|||||||
];
|
];
|
||||||
|
|
||||||
|
|
||||||
/**
 * Pick a random user agent string.
 *
 * When `config.user_agents` is a non-empty array, the user agent is drawn
 * from that caller-supplied list; otherwise it is drawn from the module's
 * built-in `user_agents` list.
 *
 * @param {Object} [config] - scraper configuration; may be omitted.
 * @param {string[]} [config.user_agents] - optional custom list of user agents.
 * @returns {string} a randomly chosen user agent.
 */
function random_user_agent(config) {
    // Tolerate a missing config so that the argument-less call
    // `random_user_agent()` keeps working as it did before the
    // config parameter was introduced.
    const custom_agents = config ? config.user_agents : undefined;

    // Only accept real arrays: a plain string also has a length and would
    // otherwise yield a single random character as "user agent".
    const pool = (Array.isArray(custom_agents) && custom_agents.length > 0)
        ? custom_agents
        : user_agents;

    return pool[Math.floor(Math.random() * pool.length)];
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
@ -32,6 +32,7 @@ function read_keywords_from_file(fname) {
|
|||||||
return kws;
|
return kws;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function getScraper(search_engine, args) {
|
function getScraper(search_engine, args) {
|
||||||
if (typeof search_engine === 'string') {
|
if (typeof search_engine === 'string') {
|
||||||
return new {
|
return new {
|
||||||
@ -109,7 +110,7 @@ class ScrapeManager {
|
|||||||
// get_browser, handle_metadata, close_browser
|
// get_browser, handle_metadata, close_browser
|
||||||
//custom_func: resolve('examples/pluggable.js'),
|
//custom_func: resolve('examples/pluggable.js'),
|
||||||
custom_func: '',
|
custom_func: '',
|
||||||
throw_on_detection: true,
|
throw_on_detection: false,
|
||||||
// use a proxy for all connections
|
// use a proxy for all connections
|
||||||
// example: 'socks5://78.94.172.42:1080'
|
// example: 'socks5://78.94.172.42:1080'
|
||||||
// example: 'http://118.174.233.10:48400'
|
// example: 'http://118.174.233.10:48400'
|
||||||
@ -125,6 +126,8 @@ class ScrapeManager {
|
|||||||
// check if headless chrome escapes common detection techniques
|
// check if headless chrome escapes common detection techniques
|
||||||
// this is a quick test and should be used for debugging
|
// this is a quick test and should be used for debugging
|
||||||
test_evasion: false,
|
test_evasion: false,
|
||||||
|
// you may pass your own list of user agents
|
||||||
|
user_agents: [],
|
||||||
apply_evasion_techniques: true,
|
apply_evasion_techniques: true,
|
||||||
// settings for puppeteer-cluster
|
// settings for puppeteer-cluster
|
||||||
puppeteer_cluster_config: {
|
puppeteer_cluster_config: {
|
||||||
@ -181,7 +184,7 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
||||||
var chrome_flags = [
|
var default_chrome_flags = [
|
||||||
'--disable-infobars',
|
'--disable-infobars',
|
||||||
'--window-position=0,0',
|
'--window-position=0,0',
|
||||||
'--ignore-certifcate-errors',
|
'--ignore-certifcate-errors',
|
||||||
@ -196,6 +199,8 @@ class ScrapeManager {
|
|||||||
'--disable-notifications',
|
'--disable-notifications',
|
||||||
];
|
];
|
||||||
|
|
||||||
|
var chrome_flags = default_chrome_flags.slice(); // copy that
|
||||||
|
|
||||||
if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
|
if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
|
||||||
chrome_flags = this.config.chrome_flags;
|
chrome_flags = this.config.chrome_flags;
|
||||||
}
|
}
|
||||||
@ -207,7 +212,7 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (this.config.random_user_agent === true) {
|
if (this.config.random_user_agent === true) {
|
||||||
user_agent = ua.random_user_agent();
|
user_agent = ua.random_user_agent(this.config);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (user_agent) {
|
if (user_agent) {
|
||||||
@ -227,7 +232,7 @@ class ScrapeManager {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
let launch_args = {
|
var launch_args = {
|
||||||
args: chrome_flags,
|
args: chrome_flags,
|
||||||
headless: this.config.headless,
|
headless: this.config.headless,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
@ -278,6 +283,19 @@ class ScrapeManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Give the per browser options each a random user agent when random user agent is set
|
||||||
|
while (perBrowserOptions.length < this.numClusters) {
|
||||||
|
perBrowserOptions.push({
|
||||||
|
headless: this.config.headless,
|
||||||
|
ignoreHTTPSErrors: true,
|
||||||
|
args: default_chrome_flags.slice().concat(`--user-agent=${ua.random_user_agent(this.config)}`)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.config.debug_level >= 2) {
|
||||||
|
console.dir(perBrowserOptions)
|
||||||
|
}
|
||||||
|
|
||||||
this.cluster = await Cluster.launch({
|
this.cluster = await Cluster.launch({
|
||||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||||
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
||||||
|
Loading…
Reference in New Issue
Block a user