forked from extern/se-scraper
users may pass their own user agents; when scraping with multiple browsers, each browser now gets its own random user agent instead of all sharing the same one
commit 80d23a9d57
parent ebe9ba8ea9
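The key new option is user_agents in the browser config: when random_user_agent is true, the scraper now draws from this user-supplied list instead of the built-in one. A minimal sketch of how a caller might use it (the UA strings below are placeholders, not part of this commit):

    const se_scraper = require('./../src/node_scraper.js');

    let browser_config = {
        random_user_agent: true,
        // your own list; if empty or missing, the built-in list is used
        user_agents: [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Placeholder/1.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) Placeholder/2.0',
        ],
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);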
TODO.md
@@ -49,4 +49,5 @@
 - remove unnecessary sleep() calls and replace with waitFor selectors
 
 ### TODO:
-1. fix googlenewsscraper waiting for results and parsing. remove the static sleep
+1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
+2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions
examples/multiple_browsers.js (new file)
@@ -0,0 +1,37 @@
+const se_scraper = require('./../src/node_scraper.js');
+
+(async () => {
+    let browser_config = {
+        search_engine: 'google',
+        debug_level: 2,
+        sleep_range: '',
+        output_file: '',
+        random_user_agent: true,
+        is_local: false,
+        throw_on_detection: false,
+        headless: false,
+        puppeteer_cluster_config: {
+            headless: false,
+            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
+            monitor: false,
+            concurrency: 3, // 3 == CONCURRENCY_BROWSER
+            maxConcurrency: 3, // 3 browsers will scrape
+        },
+    };
+
+    let scrape_job = {
+        search_engine: 'google',
+        keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
+        num_pages: 1,
+    };
+
+    var scraper = new se_scraper.ScrapeManager(browser_config);
+
+    await scraper.start();
+
+    var results = await scraper.scrape(scrape_job);
+
+    console.dir(results, {depth: null, colors: true});
+
+    await scraper.quit();
+})();
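Assuming the package's dependencies are installed, the example can be run with node examples/multiple_browsers.js; since it sets debug_level: 2, the per-browser launch options are printed on start, so you can verify that each of the three browsers received a different user agent.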
package.json
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.3.7",
+  "version": "1.3.8",
   "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
src/modules/user_agents.js
@@ -101,8 +101,12 @@ const user_agents = [
 ];
 
 
-function random_user_agent(ua_list = []) {
-    return user_agents[Math.floor(Math.random() * user_agents.length)];
+function random_user_agent(config) {
+    if (config.user_agents && config.user_agents.length > 0) {
+        return config.user_agents[Math.floor(Math.random() * config.user_agents.length)];
+    } else {
+        return user_agents[Math.floor(Math.random() * user_agents.length)];
+    }
 }
 
 module.exports = {
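A minimal standalone sketch of the new helper's behavior (the built-in list is stubbed with placeholder strings here):

    const user_agents = ['built-in UA 1', 'built-in UA 2']; // stand-in for the module's list

    function random_user_agent(config) {
        if (config.user_agents && config.user_agents.length > 0) {
            // a user-supplied list takes precedence
            return config.user_agents[Math.floor(Math.random() * config.user_agents.length)];
        } else {
            // otherwise fall back to the built-in list
            return user_agents[Math.floor(Math.random() * user_agents.length)];
        }
    }

    console.log(random_user_agent({ user_agents: ['My Custom UA/1.0'] })); // always 'My Custom UA/1.0'
    console.log(random_user_agent({}));                                    // one of the built-in entries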
src/node_scraper.js
@@ -32,6 +32,7 @@ function read_keywords_from_file(fname) {
     return kws;
 }
 
+
 function getScraper(search_engine, args) {
     if (typeof search_engine === 'string') {
         return new {
@@ -109,7 +110,7 @@ class ScrapeManager {
             // get_browser, handle_metadata, close_browser
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: '',
-            throw_on_detection: true,
+            throw_on_detection: false,
             // use a proxy for all connections
             // example: 'socks5://78.94.172.42:1080'
             // example: 'http://118.174.233.10:48400'
@@ -125,6 +126,8 @@ class ScrapeManager {
             // check if headless chrome escapes common detection techniques
             // this is a quick test and should be used for debugging
             test_evasion: false,
+            // you may pass your own list of user agents
+            user_agents: [],
             apply_evasion_techniques: true,
             // settings for puppeteer-cluster
             puppeteer_cluster_config: {
@@ -181,7 +184,7 @@ class ScrapeManager {
        }
 
        // See here: https://peter.sh/experiments/chromium-command-line-switches/
-        var chrome_flags = [
+        var default_chrome_flags = [
            '--disable-infobars',
            '--window-position=0,0',
            '--ignore-certifcate-errors',
@@ -196,6 +199,8 @@ class ScrapeManager {
            '--disable-notifications',
        ];
 
+        var chrome_flags = default_chrome_flags.slice(); // copy that
+
        if (Array.isArray(this.config.chrome_flags) && this.config.chrome_flags.length) {
            chrome_flags = this.config.chrome_flags;
        }
@@ -207,7 +212,7 @@ class ScrapeManager {
        }
 
        if (this.config.random_user_agent === true) {
-            user_agent = ua.random_user_agent();
+            user_agent = ua.random_user_agent(this.config);
        }
 
        if (user_agent) {
@@ -227,7 +232,7 @@ class ScrapeManager {
            )
        }
 
-        let launch_args = {
+        var launch_args = {
            args: chrome_flags,
            headless: this.config.headless,
            ignoreHTTPSErrors: true,
@@ -278,6 +283,19 @@ class ScrapeManager {
            }
        }
 
+        // Give the per browser options each a random user agent when random user agent is set
+        while (perBrowserOptions.length < this.numClusters) {
+            perBrowserOptions.push({
+                headless: this.config.headless,
+                ignoreHTTPSErrors: true,
+                args: default_chrome_flags.slice().concat(`--user-agent=${ua.random_user_agent(this.config)}`)
+            })
+        }
+
+        if (this.config.debug_level >= 2) {
+            console.dir(perBrowserOptions)
+        }
+
        this.cluster = await Cluster.launch({
            monitor: this.config.puppeteer_cluster_config.monitor,
            timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
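The while loop tops up perBrowserOptions until there is one entry per cluster worker (numClusters presumably tracks maxConcurrency), each carrying its own --user-agent flag. With three browsers, the debug output from console.dir(perBrowserOptions) would look roughly like this sketch (flag lists and UA strings abbreviated):

    [
        { headless: false, ignoreHTTPSErrors: true, args: [ '--disable-infobars', /* ... */ '--user-agent=<random UA 1>' ] },
        { headless: false, ignoreHTTPSErrors: true, args: [ '--disable-infobars', /* ... */ '--user-agent=<random UA 2>' ] },
        { headless: false, ignoreHTTPSErrors: true, args: [ '--disable-infobars', /* ... */ '--user-agent=<random UA 3>' ] },
    ]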