mirror of https://github.com/NikolaiT/se-scraper.git
synced 2025-06-24 03:21:34 +02:00

support for multiple browsers and proxies

This commit is contained in:
parent 393b9c0450
commit 089e410ec6

README.md (102 lines changed)
@@ -2,7 +2,7 @@

 This node module supports scraping several search engines.

-Right now scraping the search engines
+Right now it's possible to scrape the following search engines

 * Google
 * Google News

@@ -14,20 +14,15 @@ Right now scraping the search engines
 * Infospace
 * Duckduckgo
 * Webcrawler

-is supported.
-
 Additionally **se-scraper** supports investment ticker search from the following sites:

 * Reuters
 * cnbc
 * Marketwatch

-This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.
+This module uses puppeteer and puppeteer-cluster (modified version). It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.

 ### Quickstart

-**Note**: If you don't want puppeteer to download a complete chromium browser, add this variable to your environment:
+**Note**: If you **don't** want puppeteer to download a complete chromium browser, add this variable to your environment. Then this library is not guaranteed to run out of the box.

 ```bash
 export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
 ```
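If the Chromium download is skipped, puppeteer has to be pointed at a browser binary that already exists on the machine. A minimal sketch of how that looks with plain puppeteer (the `executablePath` value below is just an illustrative assumption, and se-scraper itself may not expose this option directly):

```js
const puppeteer = require('puppeteer');

(async () => {
    // Reuse a system-wide Chromium instead of the skipped bundled download.
    const browser = await puppeteer.launch({
        executablePath: '/usr/bin/chromium-browser', // adjust to your system
    });
    console.log(await browser.version());
    await browser.close();
})();
```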
@@ -39,7 +34,7 @@ Then install with
 npm install se-scraper
 ```

-then create a file with the following contents and start scraping.
+then create a file `run.js` with the following contents

 ```js
 const se_scraper = require('se-scraper');

@@ -61,6 +56,79 @@ function callback(err, response) {
 se_scraper.scrape(config, callback);
 ```

+Start scraping by firing up the command `node run.js`.
+
+#### Scrape with proxies
+
+**se-scraper** will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one (your own IP).
+
+```js
+const se_scraper = require('se-scraper');
+
+let config = {
+    search_engine: 'google',
+    debug: false,
+    verbose: false,
+    keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much'],
+    num_pages: 1,
+    output_file: 'data.json',
+    proxy_file: '/home/nikolai/.proxies', // one proxy per line
+    log_ip_address: true,
+};
+
+function callback(err, response) {
+    if (err) { console.error(err) }
+    console.dir(response, {depth: null, colors: true});
+}
+
+se_scraper.scrape(config, callback);
+```
+
+With a proxy file such as this one (invalid proxies, of course)
+
+```text
+socks5://53.34.23.55:55523
+socks4://51.11.23.22:22222
+```
+
+this will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab (chromium issue).
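Since this commit also adds a `proxies: []` key to the default config in index.js, it looks like the proxy list can be handed over inline instead of via `proxy_file`; a sketch under that assumption (the inline `proxies` array is an assumption, not documented behavior):

```js
const se_scraper = require('se-scraper');

let config = {
    search_engine: 'google',
    keywords: ['news'],
    num_pages: 1,
    // assumed inline alternative to proxy_file (same invalid demo proxies as above)
    proxies: ['socks5://53.34.23.55:55523', 'socks4://51.11.23.22:22222'],
    log_ip_address: true,
};

se_scraper.scrape(config, (err, response) => {
    if (err) { console.error(err) }
    console.dir(response, {depth: null, colors: true});
});
```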
+### Scraping Model
+
+**se-scraper** scrapes search engines only. In order to introduce concurrency into this library, it is necessary to define the scraping model. Then we can decide how we divide and conquer.
+
+#### Scraping Resources
+
+What are common scraping resources?
+
+1. **Memory and CPU**. Necessary to launch multiple browser instances.
+2. **Network Bandwidth**. Not often the bottleneck.
+3. **IP Addresses**. Websites often block IP addresses after a certain amount of requests from the same IP address. This can be circumvented by using proxies.
+4. Spoofable identifiers such as browser fingerprint or user agents. These are handled by **se-scraper**.
+
+#### Concurrency Model
+
+**se-scraper** should be able to run without any concurrency at all. This is the default case. No concurrency means only one browser/tab is searching at a time.
+
+For concurrent use, we will make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster).
+
+One scrape job is properly defined by
+
+* 1 search engine such as `google`
+* `M` pages
+* `N` keywords/queries
+* `K` proxies and `K+1` browser instances (because when we have no proxies available, we will scrape with our dedicated IP)
+
+Then **se-scraper** will create `K+1` dedicated browser instances, each with a unique IP address. Each browser will get `N/(K+1)` keywords and will issue `N/(K+1) * M` total requests to the search engine.
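The keyword assignment is a plain round-robin split. The helper below is illustrative only (the function name is made up), but it mirrors the chunking logic the new handler code in this commit performs:

```js
// Illustrative only: split N keywords round-robin across K+1 browser instances.
function chunkKeywords(keywords, numClusters) {
    let chunks = [];
    for (let n = 0; n < numClusters; n++) {
        chunks.push([]);
    }
    for (let k = 0; k < keywords.length; k++) {
        chunks[k % numClusters].push(keywords[k]);
    }
    return chunks;
}

// 5 keywords, 2 proxies + 1 own IP = 3 browsers:
// -> [ [ 'a', 'd' ], [ 'b', 'e' ], [ 'c' ] ]
console.log(chunkKeywords(['a', 'b', 'c', 'd', 'e'], 3));
```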
+The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequent new browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings. Right now, every browser has the same options. It's not possible to set options on a per-browser basis.
+
+Solution:
+
+1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678).
+2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and then pop() from this list at every new call to `workerInstance()` in https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts. I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this** (see the sketch below and `examples/test_cluster.js` further down in this commit).
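A condensed sketch of the patched cluster API that results from option 2. `perBrowserOptions` is the option added by the modified fork, not part of upstream puppeteer-cluster; the full working version is `examples/test_cluster.js` below:

```js
const { Cluster } = require('puppeteer-cluster'); // the modified fork, not upstream

(async () => {
    // One launch-option object per browser: the first uses the home IP,
    // each further entry pins one proxy to one dedicated browser instance.
    const perBrowserOptions = [
        { headless: true, args: [] },
        { headless: true, args: ['--proxy-server=socks5://53.34.23.55:55523'] },
    ];

    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_BROWSER,
        maxConcurrency: perBrowserOptions.length,
        puppeteerOptions: { headless: true },
        perBrowserOptions: perBrowserOptions,
    });

    await cluster.task(async ({ page, data: url }) => {
        await page.goto(url);
        console.log(await page.content()); // each browser should report its own IP
    });

    for (let i = 0; i < perBrowserOptions.length; i++) {
        cluster.queue('https://ipinfo.io/json');
    }

    await cluster.idle();
    await cluster.close();
})();
```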
 ### Technical Notes

 Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
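For readers who have not used puppeteer directly, the control flow it provides looks roughly like this (a generic sketch, not se-scraper's internal code):

```js
const puppeteer = require('puppeteer');

(async () => {
    // Launch a headless Chromium and drive it over the DevTools Protocol.
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto('https://example.com');
    console.log(await page.title());
    await browser.close();
})();
```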
@@ -144,7 +212,8 @@ Use se-scraper by calling it with a script such as the one below.
 const se_scraper = require('se-scraper');
 const resolve = require('path').resolve;

-let config = {
+// options for scraping
+event = {
     // the user agent to scrape with
     user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
     // if random_user_agent is set to True, a random user agent is chosen

@@ -162,7 +231,7 @@ let config = {
     search_engine: 'google',
     compress: false, // compress
     debug: false,
-    verbose: false,
+    verbose: true,
     keywords: ['scrapeulous.com'],
     // whether to start the browser in headless mode
     headless: true,

@@ -178,13 +247,16 @@ let config = {
     // get_browser, handle_metadata, close_browser
     //custom_func: resolve('examples/pluggable.js'),
     custom_func: '',
-    // use a proxy for all connections
-    // example: 'socks5://78.94.172.42:1080'
-    // example: 'http://118.174.233.10:48400'
-    proxy: '',
+    // path to a proxy file, one proxy per line. Example:
+    // socks5://78.94.172.42:1080
+    // http://118.174.233.10:48400
+    proxy_file: '',
+    proxies: [],
     // check if headless chrome escapes common detection techniques
     // this is a quick test and should be used for debugging
     test_evasion: false,
+    // settings for puppeteer-cluster
+    monitor: false,
 };

 function callback(err, response) {
TODO.txt (8 lines changed)

@@ -27,19 +27,21 @@

 30.1.2019

 - modify all scrapers to use the generic class where it makes sense
 - Bing, Baidu, Google, Duckduckgo

 7.2.2019
 - add num_requests to test cases [done]

+25.2.2019
+- https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html
+- add support for browsing with multiple browsers, use this neat library:
+    - https://github.com/thomasdondorf/puppeteer-cluster [done]

 TODO:
+- write test case for proxy support and cluster support
 - add captcha service solving support
 - check if news instances run the same browser and if we can have one proxy per tab workers

 - write test case for:
     - pluggable
-    - full metadata (log http headers, log ip address)
examples/per_page_proxy.js (new file, 76 lines)

@@ -0,0 +1,76 @@
+const puppeteer = require('puppeteer');
+const ProxyChain = require('proxy-chain');
+
+const ROUTER_PROXY = 'http://127.0.0.1:8000';
+
+// SEE: https://github.com/GoogleChrome/puppeteer/issues/678
+// Idea is: Set up a local router proxy that assigns requests, identified by unique
+// user-agent strings, to distinct upstream proxies. This way it is possible to use
+// one proxy per chromium tab.
+// downside: not fast and efficient
+
+const uas = [
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
+];
+
+const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181'];
+
+(async () => {
+    const browser = await puppeteer.launch({
+        headless: false,
+        args: [`--proxy-server=${ROUTER_PROXY}`],
+    });
+    const page1 = await browser.newPage();
+    const page2 = await browser.newPage();
+
+    try {
+        await page1.setUserAgent(uas[0]);
+        await page1.goto('https://www.whatsmyip.org/');
+    } catch (e) {
+        console.log(e);
+    }
+
+    try {
+        await page2.setUserAgent(uas[1]);
+        await page2.goto('https://www.whatsmyip.org/');
+    } catch (e) {
+        console.log(e);
+    }
+
+    //await browser.close();
+})();
+
+const server = new ProxyChain.Server({
+    // Port where the server will listen. By default 8000.
+    port: 8000,
+
+    // Enables verbose logging
+    verbose: true,
+
+    prepareRequestFunction: ({
+        request,
+        username,
+        password,
+        hostname,
+        port,
+        isHttp,
+    }) => {
+        var upstreamProxyUrl;
+
+        if (request.headers['user-agent'] === uas[0]) {
+            upstreamProxyUrl = proxies[0];
+        }
+
+        if (request.headers['user-agent'] === uas[1]) {
+            upstreamProxyUrl = proxies[1];
+        }
+
+        console.log('Using proxy: ' + upstreamProxyUrl);
+
+        return { upstreamProxyUrl };
+    },
+});
+
+server.listen(() => {
+    console.log(`Router Proxy server is listening on port ${8000}`);
+});
examples/proxies.js (new file, 19 lines)

@@ -0,0 +1,19 @@
+const se_scraper = require('./../index.js');
+
+let config = {
+    search_engine: 'google',
+    debug: false,
+    verbose: false,
+    keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much'],
+    num_pages: 1,
+    output_file: 'data.json',
+    proxy_file: '/home/nikolai/.proxies', // one proxy per line
+    log_ip_address: true,
+};
+
+function callback(err, response) {
+    if (err) { console.error(err) }
+    console.dir(response, {depth: null, colors: true});
+}
+
+se_scraper.scrape(config, callback);
@@ -1,11 +1,11 @@
 const se_scraper = require('./../index.js');

 let config = {
-    search_engine: 'duckduckgo',
+    search_engine: 'google',
     debug: false,
     verbose: false,
-    keywords: ['news'],
+    keywords: ['news', 'se-scraper'],
-    num_pages: 2,
+    num_pages: 1,
     output_file: 'data.json',
 };
examples/test_cluster.js (new file, 86 lines)

@@ -0,0 +1,86 @@
+const { Cluster } = require('../../puppeteer-cluster/dist/index.js');
+var fs = require('fs');
+var os = require("os");
+
+const PROXY_FILE = '/home/nikolai/.proxies';
+
+function read_items_from_file(fname) {
+    let kws = fs.readFileSync(fname).toString().split(os.EOL);
+    // clean keywords
+    kws = kws.filter((kw) => {
+        return kw.trim().length > 0;
+    });
+    return kws;
+}
+
+(async () => {
+
+    let browserArgs = [
+        '--disable-infobars',
+        '--window-position=0,0',
+        '--ignore-certificate-errors',
+        '--ignore-certificate-errors-spki-list',
+        '--no-sandbox',
+        '--disable-setuid-sandbox',
+        '--disable-dev-shm-usage',
+        '--disable-accelerated-2d-canvas',
+        '--disable-gpu',
+        '--window-size=1920x1080',
+        '--hide-scrollbars',
+    ];
+
+    let proxies = read_items_from_file(PROXY_FILE);
+
+    console.dir(proxies);
+
+    // each new call to workerInstance() will
+    // left pop() one element from this list
+    // maxConcurrency should be equal to perBrowserOptions.length
+
+    // the first browser config with home IP
+    let perBrowserOptions = [{
+        headless: false,
+        ignoreHTTPSErrors: true,
+        args: browserArgs
+    }];
+
+    for (var proxy of proxies) {
+        perBrowserOptions.push({
+            headless: false,
+            ignoreHTTPSErrors: true,
+            args: browserArgs.concat(`--proxy-server=${proxy}`)
+        })
+    }
+
+    const cluster = await Cluster.launch({
+        monitor: true,
+        concurrency: Cluster.CONCURRENCY_BROWSER,
+        maxConcurrency: perBrowserOptions.length,
+        puppeteerOptions: {
+            headless: false,
+            args: browserArgs,
+            ignoreHTTPSErrors: true,
+        },
+        perBrowserOptions: perBrowserOptions
+    });
+
+    // Event handler to be called in case of problems
+    cluster.on('taskerror', (err, data) => {
+        console.log(`Error crawling ${data}: ${err.message}`);
+    });
+
+    await cluster.task(async ({ page, data: url }) => {
+        await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000});
+        const pageTitle = await page.evaluate(() => document.title);
+        console.log(`Page title of ${url} is ${pageTitle}`);
+        console.log(await page.content());
+    });
+
+    for (var i = 0; i < perBrowserOptions.length; i++) {
+        await cluster.queue('http://ipinfo.io/json');
+    }
+
+    await cluster.idle();
+    await cluster.close();
+})();
examples/test_promise.js (new file, 40 lines)

@@ -0,0 +1,40 @@
+class Test {
+    constructor(options = {}) {
+        const {
+            config = {},
+        } = options;
+
+        this.config = config;
+    }
+
+    run(vars) {
+        console.log(this.config)
+    }
+}
+
+let o1 = new Test({config: {a: Math.random()}});
+let o2 = new Test({config: {a: Math.random()}});
+
+o1.run()
+o2.run()
+
+// (async () => {
+//
+//     let prom = [];
+//
+//     for (var i = 0; i < 3; i++) {
+//         var obj = new Test({
+//             config: {a: Math.random()},
+//         });
+//         prom.push(new Promise(resolve => {
+//             setTimeout(() => { new Test({
+//                 config: {a: Math.random()},
+//             }).run(); resolve() }, 1000);
+//         }));
+//     }
+//
+//     let res = await Promise.all(prom);
+//     console.log(res);
+//
+// })();
index.js (20 lines changed)

@@ -23,7 +23,7 @@ exports.scrape = async function(config, callback) {
     search_engine: 'google',
     compress: false, // compress
     debug: false,
-    verbose: false,
+    verbose: true,
     keywords: ['scrapeulous.com'],
     // whether to start the browser in headless mode
     headless: true,

@@ -39,13 +39,16 @@ exports.scrape = async function(config, callback) {
     // get_browser, handle_metadata, close_browser
     //custom_func: resolve('examples/pluggable.js'),
     custom_func: '',
-    // use a proxy for all connections
-    // example: 'socks5://78.94.172.42:1080'
-    // example: 'http://118.174.233.10:48400'
-    proxy: '',
+    // path to a proxy file, one proxy per line. Example:
+    // socks5://78.94.172.42:1080
+    // http://118.174.233.10:48400
+    proxy_file: '',
+    proxies: [],
     // check if headless chrome escapes common detection techniques
     // this is a quick test and should be used for debugging
     test_evasion: false,
+    // settings for puppeteer-cluster
+    monitor: false,
 };

 // overwrite default config

@@ -57,6 +60,13 @@ exports.scrape = async function(config, callback) {
         event.keywords = read_keywords_from_file(event.keyword_file);
     }

+    if (fs.existsSync(event.proxy_file)) {
+        event.proxies = read_keywords_from_file(event.proxy_file);
+        if (event.verbose) {
+            console.log(`${event.proxies.length} proxies loaded.`);
+        }
+    }
+
     if (!callback) {
         // called when results are ready
         callback = function (err, response) {
package-lock.json (generated, 91 lines changed)

@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.1.12",
+  "version": "1.1.14",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {

@@ -45,6 +45,11 @@
       "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
       "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c="
     },
+    "bluebird": {
+      "version": "3.5.3",
+      "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz",
+      "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw=="
+    },
     "boolbase": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",

@@ -117,6 +122,11 @@
         "mimic-response": "^1.0.0"
       }
     },
+    "commander": {
+      "version": "2.19.0",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-2.19.0.tgz",
+      "integrity": "sha512-6tvAOO+D6OENvRAh524Dh9jcfKTYDQAqvqezbCW82xj5X0pSrcpxtvRKHLG0yBY6SD7PSDrJaj+0AiOcKVd1Xg=="
+    },
     "concat-map": {
       "version": "0.0.1",
       "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",

@@ -124,7 +134,7 @@
     },
     "concat-stream": {
       "version": "1.6.2",
-      "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
+      "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
       "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
       "requires": {
         "buffer-from": "^1.0.0",

@@ -135,7 +145,7 @@
       "dependencies": {
         "readable-stream": {
           "version": "2.3.6",
-          "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
+          "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
           "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
           "requires": {
             "core-util-is": "~1.0.0",

@@ -149,7 +159,7 @@
         },
         "string_decoder": {
           "version": "1.1.1",
-          "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
+          "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
           "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
           "requires": {
             "safe-buffer": "~5.1.0"

@@ -264,13 +274,13 @@
       "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
     },
     "es6-promise": {
-      "version": "4.2.5",
-      "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.5.tgz",
-      "integrity": "sha512-n6wvpdE43VFtJq+lUDYDBFUwV8TZbuGXLV4D6wKafg13ldznKsyEvatubnmUe31zcvelSzOHF+XbaT+Bl9ObDg=="
+      "version": "4.2.6",
+      "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.6.tgz",
+      "integrity": "sha512-aRVgGdnmW2OiySVPUC9e6m+plolMAJKjZnQlCwNSuK5yQ0JN61DZSO1X1Ufd1foqWRAlig0rhduTCHe7sVtK5Q=="
     },
     "es6-promisify": {
       "version": "5.0.0",
-      "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
+      "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
       "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
       "requires": {
         "es6-promise": "^4.0.3"

@@ -458,12 +468,12 @@
     },
     "minimist": {
       "version": "0.0.8",
-      "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
+      "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
       "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
     },
     "mkdirp": {
       "version": "0.5.1",
-      "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
+      "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
       "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
       "requires": {
         "minimist": "0.0.8"

@@ -510,7 +520,7 @@
     },
     "path-is-absolute": {
       "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
+      "resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
       "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18="
     },
     "pathval": {

@@ -523,6 +533,36 @@
       "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
       "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
     },
+    "portastic": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/portastic/-/portastic-1.0.1.tgz",
+      "integrity": "sha1-HJgF1D+uj2pAzw28d5QJGi6dDSo=",
+      "requires": {
+        "bluebird": "^2.9.34",
+        "commander": "^2.8.1",
+        "debug": "^2.2.0"
+      },
+      "dependencies": {
+        "bluebird": {
+          "version": "2.11.0",
+          "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
+          "integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE="
+        },
+        "debug": {
+          "version": "2.6.9",
+          "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+          "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+          "requires": {
+            "ms": "2.0.0"
+          }
+        },
+        "ms": {
+          "version": "2.0.0",
+          "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+          "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
+        }
+      }
+    },
     "prepend-http": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz",

@@ -538,6 +578,16 @@
       "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
       "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="
     },
+    "proxy-chain": {
+      "version": "0.2.7",
+      "resolved": "https://registry.npmjs.org/proxy-chain/-/proxy-chain-0.2.7.tgz",
+      "integrity": "sha512-e0s94WDfooeC3zQkvIJ/Eudiy/AywTQK4K6PMYbZdBE2m/eug54ThgCPdBE4txHvzi0A0gAVbX04Kt4RygTlRQ==",
+      "requires": {
+        "bluebird": "^3.5.1",
+        "portastic": "^1.0.1",
+        "underscore": "^1.9.1"
+      }
+    },
     "proxy-from-env": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",

@@ -567,6 +617,14 @@
         "ws": "^6.1.0"
       }
     },
+    "puppeteer-cluster": {
+      "version": "0.13.0",
+      "resolved": "https://registry.npmjs.org/puppeteer-cluster/-/puppeteer-cluster-0.13.0.tgz",
+      "integrity": "sha512-en9F6cHkj1tLucFz9q3BtrvVKxGxIR1cWZgcpKyjXJUElBbNahaUErrz7jGa6edVQJfqTrdF40mkDqIOZNJUhg==",
+      "requires": {
+        "debug": "^4.1.1"
+      }
+    },
     "readable-stream": {
       "version": "3.1.1",
       "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",

@@ -621,6 +679,11 @@
       "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
       "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
     },
+    "underscore": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
+      "integrity": "sha512-5/4etnCkd9c8gwgowi5/om/mYO5ajCaOgdzj/oW+0eQV9WxKBDZw5+ycmKmeaTXjInS/W0BzpGLo2xR2aBwZdg=="
+    },
     "url-parse-lax": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",

@@ -640,9 +703,9 @@
       "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
     },
     "ws": {
-      "version": "6.1.3",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-6.1.3.tgz",
-      "integrity": "sha512-tbSxiT+qJI223AP4iLfQbkbxkwdFcneYinM2+x46Gx2wgvbaOMO36czfdfVUBRTHvzAMRhDd98sA5d/BuWbQdg==",
+      "version": "6.1.4",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-6.1.4.tgz",
+      "integrity": "sha512-eqZfL+NE/YQc1/ZynhojeV8q+H050oR8AZ2uIev7RU10svA9ZnJUddHcOUZTJLinZ9yEfdA2kSATS2qZK5fhJA==",
       "requires": {
         "async-limiter": "~1.0.0"
       }
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.1.13",
+  "version": "1.2.0",
   "description": "A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",

@@ -11,6 +11,7 @@
     "scraping",
     "search-engines",
     "google",
+    "bing",
     "web-scraping"
   ],
   "author": "Nikolai Tschacher <hire@incolumitas.com> (https://incolumitas.com/)",

@@ -23,6 +24,8 @@
     "chai": "^4.2.0",
     "cheerio": "^1.0.0-rc.2",
     "got": "^9.6.0",
-    "puppeteer": "^1.12.2"
+    "proxy-chain": "^0.2.7",
+    "puppeteer": "^1.12.2",
+    "puppeteer-cluster": "^0.13.0"
   }
 }
run.js (11 lines changed)

@@ -1,5 +1,4 @@
 const se_scraper = require('./index.js');
-const resolve = require('path').resolve;

 let config = {
     // the user agent to scrape with

@@ -18,13 +17,13 @@ let config = {
     // this output is informational
     verbose: true,
     // an array of keywords to scrape
-    keywords: ['news'],
+    keywords: ['news', 'abc', 'good', 'bad', 'better', 'one more', 'time', 'we are going'],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: '',
     // the number of pages to scrape for each keyword
     num_pages: 1,
     // whether to start the browser in headless mode
-    headless: true,
+    headless: false,
     // path to output file, data will be stored in JSON
     output_file: 'data.json',
     // whether to prevent images, css, fonts from being loaded

@@ -40,13 +39,17 @@ let config = {
     // example: 'socks5://78.94.172.42:1080'
     // example: 'http://118.174.233.10:48400'
     proxy: '',
+    // a file with one proxy per line. Example:
+    // socks5://78.94.172.42:1080
+    // http://118.174.233.10:48400
+    proxy_file: '/home/nikolai/.proxies',
     // check if headless chrome escapes common detection techniques
     // this is a quick test and should be used for debugging
     test_evasion: false,
     // log ip address data
     log_ip_address: true,
     // log http headers
-    log_http_headers: true,
+    log_http_headers: false,
 };

 function callback(err, response) {
@@ -3,6 +3,10 @@ const Scraper = require('./se_scraper');

 class GoogleScraper extends Scraper {

+    constructor(...args) {
+        super(...args);
+    }
+
     parse(html) {
         // load the page source into cheerio
         const $ = cheerio.load(html);

@@ -75,7 +79,6 @@ class GoogleScraper extends Scraper {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();

         return true;
     }

@@ -153,13 +156,11 @@ class GoogleNewsOldScraper extends Scraper {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();

         return true;
     }

     async wait_for_results() {
-        //await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
         await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
         await this.sleep(500);
     }
@@ -5,11 +5,10 @@ module.exports = {
     get_http_headers: get_http_headers,
 };

-async function get_ip_data(browser) {
-    const page = await browser.newPage();
+async function get_ip_data(page) {
     await page.goto('https://ipinfo.io/json', {
         waitLoad: true,
-        waitNetworkIdle: true // defaults to false
+        waitNetworkIdle: true
     });
     let json = await page.content({
         timeout: 20000

@@ -19,11 +18,10 @@ async function get_ip_data(browser) {
     return JSON.parse(ipinfo_text);
 }

-async function get_http_headers(browser) {
-    const page = await browser.newPage();
+async function get_http_headers(page) {
     await page.goto('https://httpbin.org/get', {
         waitLoad: true,
-        waitNetworkIdle: true // defaults to false
+        waitNetworkIdle: true
     });
     let headers = await page.content();
@@ -1,6 +1,4 @@
-const start_url = {
-    'google': ''
-};
+const meta = require('./metadata.js');

 /*
     Get useful JS knowledge and get awesome...

@@ -12,17 +10,19 @@ const start_url = {
 module.exports = class Scraper {
     constructor(options = {}) {
         const {
-            browser = null,
             config = {},
             context = {},
             pluggable = null,
         } = options;

+        this.page = null;
+        this.metadata = {};
         this.pluggable = pluggable;
-        this.browser = browser;
         this.config = config;
         this.context = context;

+        this.keywords = config.keywords;
+
         this.STANDARD_TIMEOUT = 8000;
         // longer timeout when using proxies
         this.PROXY_TIMEOUT = 15000;

@@ -36,7 +36,9 @@ module.exports = class Scraper {
         this.num_keywords = 0;
     }

-    async run() {
+    async run({page, data}) {
+
+        this.page = page;

         let do_continue = await this.load_search_engine();

@@ -58,8 +60,6 @@ module.exports = class Scraper {
     */
     async load_search_engine() {

-        this.page = await this.browser.newPage();
-
         // prevent detection by evading common detection techniques
         await evadeChromeHeadlessDetection(this.page);

@@ -87,6 +87,32 @@ module.exports = class Scraper {
             await this.page.screenshot({path: 'headless-test-result.png'});
         }

+        if (this.config.log_http_headers === true) {
+            this.metadata.http_headers = await meta.get_http_headers(this.page);
+            console.log(this.metadata.http_headers);
+        }
+
+        if (this.config.log_ip_address === true) {
+            this.metadata.ipinfo = await meta.get_ip_data(this.page);
+            console.log(this.metadata.ipinfo);
+        }
+
+        // check that our proxy is working by confirming
+        // that ipinfo.io sees the proxy IP address
+        if (this.config.proxy && this.config.log_ip_address === true) {
+            console.log(`${this.metadata.ipinfo} vs ${this.config.proxy}`);
+
+            try {
+                // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
+                if (!this.config.proxy.includes(this.metadata.ipinfo.ip)) {
+                    console.error('Proxy not working properly.');
+                    return false;
+                }
+            } catch (exception) {
+
+            }
+        }
+
         return await this.load_start_page();
     }

@@ -98,7 +124,7 @@ module.exports = class Scraper {
     * @returns {Promise<void>}
     */
     async scraping_loop() {
-        for (let keyword of this.config.keywords) {
+        for (var keyword of this.keywords) {
             this.num_keywords++;
             this.keyword = keyword;
             this.results[keyword] = {};

@@ -106,6 +132,7 @@ module.exports = class Scraper {

             if (this.pluggable.before_keyword_scraped) {
                 await this.pluggable.before_keyword_scraped({
+                    results: this.results,
                     num_keywords: this.num_keywords,
                     num_requests: this.num_requests,
                     keyword: keyword,
@@ -1,4 +1,4 @@
-const puppeteer = require('puppeteer');
+const { Cluster } = require('./puppeteer-cluster/dist/index.js');
 const zlib = require('zlib');
 var fs = require('fs');

@@ -9,7 +9,6 @@ const baidu = require('./modules/baidu.js');
 const infospace = require('./modules/infospace.js');
 const youtube = require('./modules/youtube.js');
 const ua = require('./modules/user_agents.js');
-const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
 const tickersearch = require('./modules/ticker_search.js');

@@ -20,6 +19,27 @@ function write_results(fname, data) {
     });
 }

+function getScraper(searchEngine, args) {
+    return new {
+        google: google.GoogleScraper,
+        google_news_old: google.GoogleNewsOldScraper,
+        google_news: google.GoogleNewsScraper,
+        google_image: google.GoogleImageScraper,
+        bing: bing.BingScraper,
+        bing_news: bing.BingNewsScraper,
+        duckduckgo: duckduckgo.DuckduckgoScraper,
+        duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
+        infospace: infospace.InfospaceScraper,
+        webcrawler: infospace.WebcrawlerNewsScraper,
+        baidu: baidu.BaiduScraper,
+        youtube: youtube.YoutubeScraper,
+        yahoo_news: tickersearch.YahooFinanceScraper,
+        reuters: tickersearch.ReutersFinanceScraper,
+        cnbc: tickersearch.CnbcFinanceScraper,
+        marketwatch: tickersearch.MarketwatchFinanceScraper,
+    }[searchEngine](args);
+}
+
 module.exports.handler = async function handler (event, context, callback) {
     config = event;
     pluggable = {};

@@ -57,24 +77,23 @@ module.exports.handler = async function handler (event, context, callback) {
         '--hide-scrollbars',
     ];

-    let USER_AGENT = '';
+    var user_agent = undefined;

     if (config.user_agent) {
-        USER_AGENT = config.user_agent;
+        user_agent = config.user_agent;
     }

     if (config.random_user_agent === true) {
-        USER_AGENT = ua.random_user_agent();
+        user_agent = ua.random_user_agent();
     }

-    if (USER_AGENT) {
+    if (user_agent) {
         ADDITIONAL_CHROME_FLAGS.push(
-            `--user-agent="${USER_AGENT}"`
+            `--user-agent="${user_agent}"`
         )
     }

     if (config.proxy) {
-        // check this out bubbles
         // https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
         // [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
         // "http", "socks", "socks4", "socks5".

@@ -97,81 +116,90 @@ module.exports.handler = async function handler (event, context, callback) {
         launch_args.config = config;
         browser = await pluggable.start_browser(launch_args);
     } else {
-        browser = await puppeteer.launch(launch_args);
+        var numClusters = config.proxies.length + 1;
+
+        // the first browser config with home IP
+        let perBrowserOptions = [launch_args, ];
+
+        for (var proxy of config.proxies) {
+            perBrowserOptions.push({
+                headless: config.headless,
+                ignoreHTTPSErrors: true,
+                args: ADDITIONAL_CHROME_FLAGS.concat(`--proxy-server=${proxy}`)
+            })
+        }
+
+        var cluster = await Cluster.launch({
+            monitor: config.monitor,
+            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
+            concurrency: Cluster.CONCURRENCY_BROWSER,
+            maxConcurrency: numClusters,
+            puppeteerOptions: launch_args,
+            perBrowserOptions: perBrowserOptions
+        });
+
+        cluster.on('taskerror', (err, data) => {
+            console.log(`Error while scraping ${data}: ${err.message}`);
+            console.log(err)
+        });
     }

     let metadata = {};

-    if (config.log_http_headers === true) {
-        metadata.http_headers = await meta.get_http_headers(browser);
-    }
-
-    if (config.log_ip_address === true) {
-        metadata.ipinfo = await meta.get_ip_data(browser);
-    }
-
-    // check that our proxy is working by confirming
-    // that ipinfo.io sees the proxy IP address
-    if (config.proxy && config.log_ip_address === true) {
-        console.log(`${metadata.ipinfo} vs ${config.proxy}`);
-
-        try {
-            // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
-            if (!config.proxy.includes(metadata.ipinfo.ip)) {
-                console.error('Proxy not working properly.');
-                await browser.close();
-                return;
-            }
-        } catch (exception) {
-
-        }
-    }
-
-    var results = {};
-
-    Scraper = {
-        google: google.GoogleScraper,
-        google_news_old: google.GoogleNewsOldScraper,
-        google_news: google.GoogleNewsScraper,
-        google_image: google.GoogleImageScraper,
-        bing: bing.BingScraper,
-        bing_news: bing.BingNewsScraper,
-        duckduckgo: duckduckgo.DuckduckgoScraper,
-        duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
-        infospace: infospace.InfospaceScraper,
-        webcrawler: infospace.WebcrawlerNewsScraper,
-        baidu: baidu.BaiduScraper,
-        youtube: youtube.YoutubeScraper,
-        yahoo_news: tickersearch.YahooFinanceScraper,
-        reuters: tickersearch.ReutersFinanceScraper,
-        cnbc: tickersearch.CnbcFinanceScraper,
-        marketwatch: tickersearch.MarketwatchFinanceScraper,
-    }[config.search_engine];
-
-    if (Scraper === undefined) {
-        console.info('Currently not implemented search_engine: ', config.search_engine);
-    } else {
-        scraperObj = new Scraper({
-            browser: browser,
-            config: config,
-            context: context,
-            pluggable: pluggable,
-        });
-        results = await scraperObj.run();
-    }
+    // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
+    // https://github.com/GoogleChrome/puppeteer/issues/678
+    // The question is: Is it possible to set proxies per Page? Per Browser?
+    // as far as I can see, puppeteer cluster uses the same puppeteerOptions
+    // for every browser instance. We will use our custom puppeteer-cluster version.
+    // https://www.npmjs.com/package/proxy-chain
+    // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077
+    let chunks = [];
+    for (var n = 0; n < numClusters; n++) {
+        chunks.push([]);
+    }
+    for (var k = 0; k < config.keywords.length; k++) {
+        chunks[k%numClusters].push(config.keywords[k]);
+    }
+    //console.log(`Generated ${chunks.length} chunks...`);
+
+    let execPromises = [];
+    let scraperInstances = [];
+    for (var c = 0; c < chunks.length; c++) {
+        config.keywords = chunks[c];
+        if (c>0) {
+            config.proxy = config.proxies[c];
+        }
+        obj = getScraper(config.search_engine, {
+            config: config,
+            context: context,
+            pluggable: pluggable,
+        });
+        var boundMethod = obj.run.bind(obj);
+        execPromises.push(cluster.execute({}, boundMethod));
+        scraperInstances.push(obj);
+    }
+
+    let results = await Promise.all(execPromises);
+    results = results[0]; // TODO: this is strange. fix that shit boy

     if (pluggable.close_browser) {
         await pluggable.close_browser();
     } else {
-        await browser.close();
+        await cluster.idle();
+        await cluster.close();
     }

-    let num_requests = scraperObj.num_requests;
+    // count total requests among all scraper instances
+    let num_requests = 0;
+    for (var o of scraperInstances) {
+        num_requests += o.num_requests;
+    }
+
     let timeDelta = Date.now() - startTime;
     let ms_per_request = timeDelta/num_requests;

     if (config.verbose === true) {
-        console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
+        console.log(`${numClusters} Scraper Workers took ${timeDelta}ms to perform ${num_requests} requests.`);
         console.log(`On average ms/request: ${ms_per_request}ms/request`);
         console.dir(results, {depth: null, colors: true});
     }

@@ -228,54 +256,17 @@ function parseEventData(config) {
     function _bool(e) {
         e = String(e);
         if (typeof e.trim === "function") {
-            return e.trim().toLowerCase() == 'true';
+            return e.trim().toLowerCase() === 'true';
         } else {
-            return e.toLowerCase() == 'true';
+            return e.toLowerCase() === 'true';
         }
     }

-    if (config.debug) {
-        config.debug = _bool(config.debug);
-    }
-
-    if (config.verbose) {
-        config.verbose = _bool(config.verbose);
-    }
-
-    if (config.upload_to_s3) {
-        config.upload_to_s3 = _bool(config.upload_to_s3);
-    }
-
-    if (config.log_ip_address) {
-        config.log_ip_address = _bool(config.log_ip_address);
-    }
-
-    if (config.log_http_headers) {
-        config.log_http_headers = _bool(config.log_http_headers);
-    }
-
-    if (config.random_user_agent) {
-        config.random_user_agent = _bool(config.random_user_agent);
-    }
-
-    if (config.compress) {
-        config.compress = _bool(config.compress);
-    }
-
-    if (config.is_local) {
-        config.is_local = _bool(config.is_local);
-    }
-
-    if (config.max_results) {
-        config.max_results = parseInt(config.max_results);
-    }
-
-    if (config.set_manual_settings) {
-        config.set_manual_settings = _bool(config.set_manual_settings);
-    }
-
-    if (config.block_assets) {
-        config.block_assets = _bool(config.block_assets);
-    }
+    const booleans = ['debug', 'verbose', 'upload_to_s3', 'log_ip_address', 'log_http_headers', 'random_user_agent',
+        'compress', 'is_local', 'max_results', 'set_manual_settings', 'block_assets', 'test_evasion'];
+
+    for (b of booleans) {
+        config[b] = _bool(config[b]);
+    }

     if (config.sleep_range) {
src/puppeteer-cluster (new submodule)

@@ -0,0 +1 @@
+Subproject commit da9b7bc889273e966c68c50b4ffcb45115cbb2e8