Fixed quotes in the user agent argument. The quoted value led to Cloudflare detecting the scraper. Very bad.

This commit is contained in:
Nikolai Tschacher 2019-03-01 16:02:30 +01:00
parent 79d32a315a
commit abf4458e46
3 changed files with 19 additions and 10 deletions

View File

@ -178,6 +178,7 @@ var ADDITIONAL_CHROME_FLAGS = [
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--disable-notifications',
];
```

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.2.1",
"version": "1.2.2",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",

View File

@ -65,6 +65,7 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
// See here: https://peter.sh/experiments/chromium-command-line-switches/
var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
@ -77,6 +78,7 @@ module.exports.handler = async function handler (event, context, callback) {
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--disable-notifications',
];
var user_agent = undefined;
@ -89,9 +91,10 @@ module.exports.handler = async function handler (event, context, callback) {
user_agent = ua.random_user_agent();
}
// Pass the user agent argument without quotes!
if (user_agent) {
ADDITIONAL_CHROME_FLAGS.push(
`--user-agent="${user_agent}"`
`--user-agent=${user_agent}`
)
}
@ -117,14 +120,19 @@ module.exports.handler = async function handler (event, context, callback) {
if (pluggable.start_browser) {
launch_args.config = config;
let browser = await pluggable.start_browser(launch_args);
const page = await await browser.newPage();
let obj = getScraper(config.search_engine, {
config: config,
context: context,
pluggable: pluggable,
});
results = obj.run(page);
num_requests = obj.num_requests;
const realUA = await browser.userAgent();
if (realUA === user_agent) {
const page = await await browser.newPage();
let obj = getScraper(config.search_engine, {
config: config,
context: context,
pluggable: pluggable,
});
results = obj.run(page);
num_requests = obj.num_requests;
} else {
console.error('provided user agent does not match real user agent');
}
if (pluggable.close_browser) {
await pluggable.close_browser();