mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-21 01:57:55 +02:00
Fixed quotes in user agent. The quotes led to Cloudflare detecting the scraper. Very bad.
This commit is contained in:
parent
79d32a315a
commit
abf4458e46
@ -178,6 +178,7 @@ var ADDITIONAL_CHROME_FLAGS = [
|
|||||||
'--disable-gpu',
|
'--disable-gpu',
|
||||||
'--window-size=1920x1080',
|
'--window-size=1920x1080',
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
|
'--disable-notifications',
|
||||||
];
|
];
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.2.1",
|
"version": "1.2.2",
|
||||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -65,6 +65,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
|
|
||||||
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
console.log(`[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${config.search_engine} with ${config.keywords.length} keywords on ${config.num_pages} pages each.`);
|
||||||
|
|
||||||
|
// See here: https://peter.sh/experiments/chromium-command-line-switches/
|
||||||
var ADDITIONAL_CHROME_FLAGS = [
|
var ADDITIONAL_CHROME_FLAGS = [
|
||||||
'--disable-infobars',
|
'--disable-infobars',
|
||||||
'--window-position=0,0',
|
'--window-position=0,0',
|
||||||
@ -77,6 +78,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
'--disable-gpu',
|
'--disable-gpu',
|
||||||
'--window-size=1920x1080',
|
'--window-size=1920x1080',
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
|
'--disable-notifications',
|
||||||
];
|
];
|
||||||
|
|
||||||
var user_agent = undefined;
|
var user_agent = undefined;
|
||||||
@ -89,9 +91,10 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
user_agent = ua.random_user_agent();
|
user_agent = ua.random_user_agent();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// user agent argument without quotes !
|
||||||
if (user_agent) {
|
if (user_agent) {
|
||||||
ADDITIONAL_CHROME_FLAGS.push(
|
ADDITIONAL_CHROME_FLAGS.push(
|
||||||
`--user-agent="${user_agent}"`
|
`--user-agent=${user_agent}`
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -117,6 +120,8 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
if (pluggable.start_browser) {
|
if (pluggable.start_browser) {
|
||||||
launch_args.config = config;
|
launch_args.config = config;
|
||||||
let browser = await pluggable.start_browser(launch_args);
|
let browser = await pluggable.start_browser(launch_args);
|
||||||
|
const realUA = await browser.userAgent();
|
||||||
|
if (realUA === user_agent) {
|
||||||
const page = await await browser.newPage();
|
const page = await await browser.newPage();
|
||||||
let obj = getScraper(config.search_engine, {
|
let obj = getScraper(config.search_engine, {
|
||||||
config: config,
|
config: config,
|
||||||
@ -125,6 +130,9 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
});
|
});
|
||||||
results = obj.run(page);
|
results = obj.run(page);
|
||||||
num_requests = obj.num_requests;
|
num_requests = obj.num_requests;
|
||||||
|
} else {
|
||||||
|
console.error('provided user agent does not match real user agent');
|
||||||
|
}
|
||||||
|
|
||||||
if (pluggable.close_browser) {
|
if (pluggable.close_browser) {
|
||||||
await pluggable.close_browser();
|
await pluggable.close_browser();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user