Resolved some issues: proxy support is now possible, and scraping more than one page per keyword now works.

This commit is contained in:
Nikolai Tschacher 2019-01-29 22:48:08 +01:00
parent 89441070cd
commit 9e62f23451
14 changed files with 764 additions and 340 deletions

426
README.md
View File

@ -34,10 +34,50 @@ Scraping is done with a headless chromium browser using the automation library p
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me on hire@incolumitas.com If you need to deploy scraping to the cloud (AWS or Azure), you can contact me on hire@incolumitas.com
The chromium browser is started with the following flags to prevent
scraping detection.
```js
var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
```
Furthermore, to avoid loading unnecessary resources and to speed up
scraping a great deal, we instruct chrome to not load images and css:
```js
// Speed up scraping by refusing to load heavy, non-essential resource
// types (CSS, fonts, images, media) via request interception.
await page.setRequestInterception(true);

page.on('request', (request) => {
    const blockedTypes = ['stylesheet', 'font', 'image', 'media'];
    const resourceType = request.resourceType();
    if (blockedTypes.includes(resourceType)) {
        request.abort();
    } else {
        request.continue();
    }
});
```
#### Making puppeteer and headless chrome undetectable
Consider the following resources:
* https://intoli.com/blog/making-chrome-headless-undetectable/
### Installation and Usage ### Installation and Usage
Install with Install with
```bash ```bash
npm install se-scraper npm install se-scraper
@ -53,12 +93,12 @@ let config = {
// the user agent to scrape with // the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: true,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: false, write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '', sleep_range: '[1,2]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
// whether debug information should be printed // whether debug information should be printed
@ -68,9 +108,11 @@ let config = {
// this output is informational // this output is informational
verbose: false, verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['scrapeulous.com', ], keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '', keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 2,
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: true,
// path to output file, data will be stored in JSON // path to output file, data will be stored in JSON
@ -84,9 +126,13 @@ let config = {
// must be an absolute path to the module // must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'), //custom_func: resolve('examples/pluggable.js'),
custom_func: '', custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
}; };
se_scraper.scrape(config, (err, response) => { function callback(err, response) {
if (err) { console.error(err) } if (err) { console.error(err) }
/* response object has the following properties: /* response object has the following properties:
@ -97,7 +143,9 @@ se_scraper.scrape(config, (err, response) => {
*/ */
console.dir(response.results, {depth: null, colors: true}); console.dir(response.results, {depth: null, colors: true});
}); }
se_scraper.scrape(config, callback);
``` ```
Supported options for the `search_engine` config key: Supported options for the `search_engine` config key:
@ -123,199 +171,179 @@ Supported options for the `search_engine` config key:
'marketwatch' 'marketwatch'
``` ```
Output for the above script on my laptop: Output for the above script on my machine:
```text ```text
Scraper took 4295ms to scrape 2 keywords. { 'scraping scrapeulous.com':
On average ms/keyword: 2147.5ms/keyword { '1':
{ 'incolumitas.com scraping': { time: 'Tue, 29 Jan 2019 21:39:22 GMT',
{ time: 'Mon, 24 Dec 2018 13:07:43 GMT', num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
num_results: 'Ungefähr 2020 Ergebnisse (0.18 Sekunden) ', no_results: false,
no_results: false, effective_query: '',
effective_query: '', results:
results: [ { link: 'https://scrapeulous.com/',
[ { link: title:
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/', 'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
title: snippet:
'Coding, Learning and Business Ideas Tutorial: Youtube scraping ...', 'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
snippet: visible_link: 'https://scrapeulous.com/',
'29.10.2018 - In this blog post I am going to show you how to scrape YouTube video data using the handy puppeteer library. Puppeteer is a Node library ...', date: '',
visible_link: rank: 1 },
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/', { link: 'https://scrapeulous.com/about/',
date: '29.10.2018 - ', title:
rank: 1 }, 'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
{ link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/', snippet:
title: 'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
'GoogleScraper Tutorial - How to scrape 1000 keywords with Google', visible_link: 'https://scrapeulous.com/about/',
snippet: date: '',
'05.09.2018 - Tutorial that teaches how to use GoogleScraper to scrape 1000 keywords with 10 selenium browsers.', rank: 2 },
visible_link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/', { link: 'https://scrapeulous.com/howto/',
date: '05.09.2018 - ', title:
rank: 2 }, 'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
{ link: 'https://incolumitas.com/tag/scraping.html', snippet:
title: 'Coding, Learning and Business Ideas Tag Scraping', 'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
snippet: visible_link: 'https://scrapeulous.com/howto/',
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.', date: '',
visible_link: 'https://incolumitas.com/tag/scraping.html', rank: 3 },
date: '', { link: 'https://github.com/NikolaiT/se-scraper',
rank: 3 }, title:
{ link: 'https://incolumitas.com/category/scraping.html', 'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
title: 'Coding, Learning and Business Ideas Category Scraping', snippet:
snippet: '24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
'Nikolai Tschacher\'s ideas and projects around IT security and computer science.', visible_link: 'https://github.com/NikolaiT/se-scraper',
visible_link: 'https://incolumitas.com/category/scraping.html', date: '24.12.2018 - ',
date: '', rank: 4 },
rank: 4 }, { link:
{ link: 'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
'https://github.com/NikolaiT/incolumitas/blob/master/content/Meta/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo.md', title:
title: 'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
'incolumitas/scraping-and-extracting-links-from-any-major-search ...', snippet:
snippet: 'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
'Title: Scraping and Extracting Links from any major Search Engine like Google, Yandex, Baidu, Bing and Duckduckgo Date: 2014-11-12 00:47 Author: Nikolai ...', visible_link:
visible_link: 'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
'https://github.com/.../incolumitas/.../scraping-and-extracting-links...', date: '',
date: '', rank: 5 },
rank: 5 }, { link: 'https://googlescraper.readthedocs.io/',
{ link: title:
'https://stackoverflow.com/questions/16955325/scraping-google-results-with-python', 'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
title: 'Scraping Google Results with Python - Stack Overflow', snippet:
snippet: 'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
'I found this. incolumitas.com/2013/01/06/… But the author claims it is not ported to 2.7 yet. user2351394 Jun 6 \'13 at 6:59 ...', visible_link: 'https://googlescraper.readthedocs.io/',
visible_link: date: '',
'https://stackoverflow.com/.../scraping-google-results-with-python', rank: 6 },
date: '', { link: 'https://incolumitas.com/pages/scrapeulous/',
rank: 6 }, title:
{ link: 'https://pypi.org/project/GoogleScraper/0.1.18/', 'Coding, Learning and Business Ideas Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
title: 'GoogleScraper · PyPI', snippet:
snippet: 'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
'[5]: http://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/ ...', visible_link: 'https://incolumitas.com/pages/scrapeulous/',
visible_link: 'https://pypi.org/project/GoogleScraper/0.1.18/', date: '',
date: '', rank: 7 },
rank: 7 }, { link: 'https://incolumitas.com/',
{ link: title:
'https://www.reddit.com/r/Python/comments/2m0vyu/scraping_links_on_google_yandex_bing_duckduckgo/', 'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
title: snippet:
'Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and ...', 'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
snippet: visible_link: 'https://incolumitas.com/',
'12.11.2014 - Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and other search engines with Python ... submitted 4 years ago by incolumitas.', date: '',
visible_link: rank: 8 },
'https://www.reddit.com/.../scraping_links_on_google_yandex_bi...', { link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
date: '12.11.2014 - ', title:
rank: 9 }, 'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
{ link: 'https://twitter.com/incolumitas_?lang=de', snippet:
title: 'Nikolai Tschacher (@incolumitas_) | Twitter', 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
snippet: visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
'Embed Tweet. How to use GoogleScraper to scrape images and download them ... Learn how to scrape millions of url from yandex and google or bing with: ...', date: '',
visible_link: 'https://twitter.com/incolumitas_?lang=de', rank: 9 },
date: '', { link:
rank: 10 } ] }, 'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
'best scraping framework': title:
{ time: 'Mon, 24 Dec 2018 13:07:44 GMT', 'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
num_results: 'Ungefähr 2820000 Ergebnisse (0.36 Sekunden) ', snippet:
no_results: false, '23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
effective_query: '', visible_link:
results: 'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
[ { link: date: '23.12.2018 - ',
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php', rank: 10 } ] },
title: 'Top Web Scraping Frameworks and Libraries - AI Optify', '2':
snippet: '', { time: 'Tue, 29 Jan 2019 21:39:24 GMT',
visible_link: num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php', no_results: false,
date: '', effective_query: '',
rank: 1 }, results:
{ link: [ { link: 'https://pypi.org/project/CountryGoogleScraper/',
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php', title:
title: 'Top Web Scraping Frameworks and Libraries - AI Optify', 'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
snippet: '', snippet:
visible_link: 'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php', visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
date: '', date: '',
rank: 2 }, rank: 1 },
{ link: { link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
'https://www.scrapehero.com/open-source-web-scraping-frameworks-and-tools/', title:
title: 'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
'Best Open Source Web Scraping Frameworks and Tools - ScrapeHero', snippet:
snippet: 'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
'05.06.2018 - List of Open Source Web Scraping Frameworks. Scrapy. MechanicalSoup. PySpider. Portia. Apify SDK. Nodecrawler. Selenium WebDriver. Puppeteer.', visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
visible_link: date: '',
'https://www.scrapehero.com/open-source-web-scraping-framewo...', rank: 3 },
date: '05.06.2018 - ', { link:
rank: 3 }, 'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
{ link: title:
'https://medium.com/datadriveninvestor/best-data-scraping-tools-for-2018-top-10-reviews-558cc5a4992f', 'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
title: snippet:
'Best Data Scraping Tools for 2018 (Top 10 Reviews) Data Driven ...', '24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
snippet: visible_link:
'05.03.2018 - Pros: Octoparse is the best free data scraping tool I\'ve met. ... your Scrapy (a open-source data extraction framework) web spider\'s activities.', 'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
visible_link: date: '24.01.2015 - ',
'https://medium.com/.../best-data-scraping-tools-for-2018-top-10-...', rank: 4 },
date: '05.03.2018 - ', { link: 'https://twitter.com/incolumitas_?lang=de',
rank: 4 }, title:
{ link: 'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
'https://www.quora.com/What-is-the-best-web-scraping-open-source-tool', snippet:
title: 'What is the best web scraping open source tool? - Quora', 'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
snippet: visible_link: 'https://twitter.com/incolumitas_?lang=de',
'15.06.2015 - My personal favourite is Python Scrapy and it is an excellent framework for building a web data scraper. Why Scrapy? 1) It is an open source framework and cost ...', date: '',
visible_link: rank: 5 },
'https://www.quora.com/What-is-the-best-web-scraping-open-sour...', { link:
date: '15.06.2015 - ', 'http://blog.shodan.io/hostility-in-the-python-package-index/',
rank: 5 }, title:
{ link: 'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php', snippet:
title: 'Top Web Scraping Frameworks and Libraries - AI Optify', '22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
snippet: visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
'21.05.2018 - Top Web Scraping Frameworks and Libraries. Requests. Scrapy. Beautiful Soup. Selenium with Python. lxml. Webscraping with Selenium - part 1. Extracting data from websites with Scrapy. Scrapinghub.', date: '22.02.2015 - ',
visible_link: rank: 6 },
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php', { link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
date: '21.05.2018 - ', title:
rank: 6 }, 'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
{ link: 'https://scrapy.org/', snippet:
title: 'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework', visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
snippet: date: '',
'An open source and collaborative framework for extracting the data you need from ... Spider): name = \'blogspider\' start_urls = [\'https://blog.scrapinghub.com\'] def ...', rank: 7 },
visible_link: 'https://scrapy.org/', { link: 'https://pydigger.com/pypi/CountryGoogleScraper',
date: '', title:
rank: 7 }, 'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
{ link: snippet:
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools', '19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
title: 'The 10 Best Web Scraping Tools of 2018 - Scraper API', visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
snippet: date: '19.10.2016 - ',
'19.07.2018 - The 10 Best Web Scraping Tools of 2018. ParseHub. Scrapy. Diffbot. Cheerio. Website: https://cheerio.js.org. Beautiful Soup. Website: https://www.crummy.com/software/BeautifulSoup/ Puppeteer. Website: https://github.com/GoogleChrome/puppeteer. Content Grabber. Website: http://www.contentgrabber.com/ Mozenda. Website: ...', rank: 8 },
visible_link: { link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools', title:
date: '19.07.2018 - ', 'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
rank: 8 }, snippet:
{ link: 'https://elitedatascience.com/python-web-scraping-libraries', 'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
title: '5 Tasty Python Web Scraping Libraries - EliteDataScience', visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
snippet: date: '',
'03.02.2017 - We\'ve decided to feature the 5 Python libraries for web scraping that ... The good news is that you can swap out its parser with a faster one if ... Scrapy is technically not even a library… it\'s a complete web scraping framework.', rank: 9 },
visible_link: 'https://elitedatascience.com/python-web-scraping-libraries', { link: 'https://www.revolvy.com/page/Search-engine-scraping',
date: '03.02.2017 - ', title:
rank: 9 }, 'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
{ link: snippet:
'https://blog.michaelyin.info/web-scraping-framework-review-scrapy-vs-selenium/', 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
title: visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
'Web Scraping Framework Review: Scrapy VS Selenium | MichaelYin ...', date: '',
snippet: rank: 10 } ] } } }
'01.10.2018 - In this Scrapy tutorial, I will cover the features of Scrapy and Selenium, and help you decide which one is better for your projects.',
visible_link:
'https://blog.michaelyin.info/web-scraping-framework-review-scr...',
date: '01.10.2018 - ',
rank: 10 },
{ link: 'https://github.com/lorien/awesome-web-scraping',
title:
'GitHub - lorien/awesome-web-scraping: List of libraries, tools and APIs ...',
snippet:
'List of libraries, tools and APIs for web scraping and data processing. ... golang.md · add dataflow kit framework, 2 months ago ... Make this list better!',
visible_link: 'https://github.com/lorien/awesome-web-scraping',
date: '',
rank: 11 },
{ link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
title: 'Best Web Scraping Software Tools 2018 | Import.io',
snippet:
'07.08.2018 - List of Best Web Scraping SoftwareThere are hundreds of Web ... it is a fast high-level screen scraping and web crawling framework, used to ...',
visible_link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
date: '07.08.2018 - ',
rank: 12 } ] } }
``` ```

View File

@ -14,6 +14,17 @@
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/ https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/ https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
29.1.2019
- implement proxy support functionality
- implement proxy check
- implement scraping more than 1 page
- do it for google
- and bing
- implement duckduckgo scraping
TODO: TODO:
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
- add proxy support - add proxy support
@ -23,4 +34,28 @@ TODO:
TODO: TODO:
- think whether it makes sense to introduce a generic scraping class? - think whether it makes sense to introduce a generic scraping class?
- is scraping abstractable or is every scraper too unique? - is scraping abstractable or is every scraper too unique?
- dont make the same mistakes as with GoogleScraper - dont make the same mistakes as with GoogleScraper
TODO:
okay, it's time to make a generic scraping class like in GoogleScraper
i feel like history repeats itself
class Scraper
constructor(options = {}) {
}
async load_search_engine() {}
async search_keyword() {}
async new_page() {}
async detected() {}
then each search engine derives from this generic class
some search engines do not need such an abstract class, because they are too complex

File diff suppressed because one or more lines are too long

View File

@ -35,6 +35,10 @@ exports.scrape = async function(config, callback) {
// get_browser, handle_metadata, close_browser // get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'), //custom_func: resolve('examples/pluggable.js'),
custom_func: '', custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
}; };
// overwrite default config // overwrite default config

162
package-lock.json generated
View File

@ -1,9 +1,22 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.1.4", "version": "1.1.7",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {
"@sindresorhus/is": {
"version": "0.14.0",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-0.14.0.tgz",
"integrity": "sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ=="
},
"@szmarczak/http-timer": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-1.1.2.tgz",
"integrity": "sha512-XIB2XbzHTN6ieIjfIMV9hlVcfPU26s2vafYWQcZHWXHOxiaRZYEDKEwdl129Zyg50+foYV2jCgtrqSA6qNuNSA==",
"requires": {
"defer-to-connect": "^1.0.1"
}
},
"@types/node": { "@types/node": {
"version": "10.12.18", "version": "10.12.18",
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
@ -51,6 +64,20 @@
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
}, },
"cacheable-request": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz",
"integrity": "sha512-2N7AmszH/WPPpl5Z3XMw1HAP+8d+xugnKQAeKvxFZ/04dbT/CAznqwbl+7eSr3HkwdepNwtb2yx3CAMQWvG01Q==",
"requires": {
"clone-response": "^1.0.2",
"get-stream": "^4.0.0",
"http-cache-semantics": "^4.0.0",
"keyv": "^3.0.0",
"lowercase-keys": "^1.0.1",
"normalize-url": "^3.1.0",
"responselike": "^1.0.2"
}
},
"chai": { "chai": {
"version": "4.2.0", "version": "4.2.0",
"resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz", "resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz",
@ -82,6 +109,14 @@
"parse5": "^3.0.1" "parse5": "^3.0.1"
} }
}, },
"clone-response": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
"integrity": "sha1-0dyXOSAxTfZ/vrlCI7TuNQI56Ws=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"concat-map": { "concat-map": {
"version": "0.0.1", "version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@ -151,6 +186,14 @@
"ms": "^2.1.1" "ms": "^2.1.1"
} }
}, },
"decompress-response": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-3.3.0.tgz",
"integrity": "sha1-gKTdMjdIOEv6JICDYirt7Jgq3/M=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"deep-eql": { "deep-eql": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz", "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz",
@ -159,6 +202,11 @@
"type-detect": "^4.0.0" "type-detect": "^4.0.0"
} }
}, },
"defer-to-connect": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
"integrity": "sha512-k09hcQcTDY+cwgiwa6PYKLm3jlagNzQ+RSvhjzESOGOx+MNOuXkxTfEvPrO1IOQ81tArCFYQgi631clB70RpQw=="
},
"dom-serializer": { "dom-serializer": {
"version": "0.1.0", "version": "0.1.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
@ -197,6 +245,19 @@
"domelementtype": "1" "domelementtype": "1"
} }
}, },
"duplexer3": {
"version": "0.1.4",
"resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz",
"integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI="
},
"end-of-stream": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
"integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==",
"requires": {
"once": "^1.4.0"
}
},
"entities": { "entities": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
@ -259,6 +320,14 @@
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
"integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE=" "integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE="
}, },
"get-stream": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz",
"integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==",
"requires": {
"pump": "^3.0.0"
}
},
"glob": { "glob": {
"version": "7.1.3", "version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
@ -272,6 +341,24 @@
"path-is-absolute": "^1.0.0" "path-is-absolute": "^1.0.0"
} }
}, },
"got": {
"version": "9.6.0",
"resolved": "https://registry.npmjs.org/got/-/got-9.6.0.tgz",
"integrity": "sha512-R7eWptXuGYxwijs0eV+v3o6+XH1IqVK8dJOEecQfTmkncw9AV4dcw/Dhxi8MdlqPthxxpZyizMzyg8RTmEsG+Q==",
"requires": {
"@sindresorhus/is": "^0.14.0",
"@szmarczak/http-timer": "^1.1.2",
"cacheable-request": "^6.0.0",
"decompress-response": "^3.3.0",
"duplexer3": "^0.1.4",
"get-stream": "^4.1.0",
"lowercase-keys": "^1.0.1",
"mimic-response": "^1.0.1",
"p-cancelable": "^1.0.0",
"to-readable-stream": "^1.0.0",
"url-parse-lax": "^3.0.0"
}
},
"htmlparser2": { "htmlparser2": {
"version": "3.10.0", "version": "3.10.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz",
@ -285,6 +372,11 @@
"readable-stream": "^3.0.6" "readable-stream": "^3.0.6"
} }
}, },
"http-cache-semantics": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz",
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
},
"https-proxy-agent": { "https-proxy-agent": {
"version": "2.2.1", "version": "2.2.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
@ -323,16 +415,39 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
}, },
"json-buffer": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz",
"integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg="
},
"keyv": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz",
"integrity": "sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==",
"requires": {
"json-buffer": "3.0.0"
}
},
"lodash": { "lodash": {
"version": "4.17.11", "version": "4.17.11",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==" "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
}, },
"lowercase-keys": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
"integrity": "sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA=="
},
"mime": { "mime": {
"version": "2.4.0", "version": "2.4.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w==" "integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
}, },
"mimic-response": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ=="
},
"minimatch": { "minimatch": {
"version": "3.0.4", "version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
@ -359,6 +474,11 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
}, },
"normalize-url": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
"integrity": "sha512-U+JJi7duF1o+u2pynbp2zXDW2/PADgC30f0GsHZtRh+HOcXHnw137TrNlyxxRvWW5fjKd3bcLHPxofWuCjaeZg=="
},
"nth-check": { "nth-check": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
@ -375,6 +495,11 @@
"wrappy": "1" "wrappy": "1"
} }
}, },
"p-cancelable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz",
"integrity": "sha512-USgPoaC6tkTGlS831CxsVdmZmyb8tR1D+hStI84MyckLOzfJlYQUweomrwE3D8T7u5u5GVuW064LT501wHTYYA=="
},
"parse5": { "parse5": {
"version": "3.0.3", "version": "3.0.3",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
@ -398,6 +523,11 @@
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
}, },
"prepend-http": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz",
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
},
"process-nextick-args": { "process-nextick-args": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
@ -413,6 +543,15 @@
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=" "integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
}, },
"pump": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
"requires": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"puppeteer": { "puppeteer": {
"version": "1.11.0", "version": "1.11.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz", "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz",
@ -438,6 +577,14 @@
"util-deprecate": "^1.0.1" "util-deprecate": "^1.0.1"
} }
}, },
"responselike": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/responselike/-/responselike-1.0.2.tgz",
"integrity": "sha1-kYcg7ztjHFZCvgaPFa3lpG9Loec=",
"requires": {
"lowercase-keys": "^1.0.0"
}
},
"rimraf": { "rimraf": {
"version": "2.6.2", "version": "2.6.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
@ -459,6 +606,11 @@
"safe-buffer": "~5.1.0" "safe-buffer": "~5.1.0"
} }
}, },
"to-readable-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
},
"type-detect": { "type-detect": {
"version": "4.0.8", "version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
@ -469,6 +621,14 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
}, },
"url-parse-lax": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",
"integrity": "sha1-FrXK/Afb42dsGxmZF3gj1lA6yww=",
"requires": {
"prepend-http": "^2.0.0"
}
},
"util-deprecate": { "util-deprecate": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.1.7", "version": "1.1.8",
"description": "A simple module which uses puppeteer to scrape several search engines.", "description": "A simple module which uses puppeteer to scrape several search engines.",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",
@ -22,6 +22,7 @@
"dependencies": { "dependencies": {
"chai": "^4.2.0", "chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",
"got": "^9.6.0",
"puppeteer": "^1.9.0" "puppeteer": "^1.9.0"
} }
} }

12
run.js
View File

@ -10,7 +10,7 @@ let config = {
write_meta_data: false, write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,2]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
// whether debug information should be printed // whether debug information should be printed
@ -20,9 +20,11 @@ let config = {
// this output is informational // this output is informational
verbose: false, verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['trump', 'chief'], keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '', keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 1,
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: true,
// path to output file, data will be stored in JSON // path to output file, data will be stored in JSON
@ -35,7 +37,11 @@ let config = {
// get_browser, handle_metadata, close_browser // get_browser, handle_metadata, close_browser
// must be an absolute path to the module // must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'), //custom_func: resolve('examples/pluggable.js'),
custom_func: resolve('examples/pluggable.js'), custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
}; };
function callback(err, response) { function callback(err, response) {

96
src/captcha_solver.js Normal file
View File

@ -0,0 +1,96 @@
/*
There are essentially two strategies to handle a search engine showing you a captcha:
1. Solve the captcha
https://github.com/ecthros/uncaptcha2
or use a captcha solving service such as https://anti-captcha.com/mainpage
2. Switch your IP address with rotating proxies
*/
/**
* @name download recaptcha2 audio captcha
*
* There are several issues:
*
* Google sees that we are using an automated browser.
*
* In the worst case we have to completely control the browser ourselves without puppeteer.
*
* https://github.com/ecthros/uncaptcha2
*
* See here:
*
* https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1
*
* https://github.com/GoogleChrome/puppeteer/issues/3039
*
* https://intoli.com/blog/making-chrome-headless-undetectable/
*
* @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const got = require('got');
/*
 * Walk through the reCAPTCHA v2 demo page, switch to the audio challenge
 * and save the challenge audio file to ./audio.mp3.
 *
 * Fixes over the previous version:
 *  - querySelectorAll(...).getAttribute(...) threw a TypeError, because a
 *    NodeList has no getAttribute(); querySelector() returns the single
 *    anchor element we need.
 *  - A synchronous try/catch around an async IIFE never catches rejections
 *    of the returned promise; errors are now handled via .catch().
 *  - The audio download is awaited before the browser is closed, so stream
 *    errors are reported instead of being silently dropped.
 */
(async () => {
    const browser = await puppeteer.launch({
        args: [
            '--proxy-server=socks5://78.94.172.42:1080',
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920x1080',
            '--hide-scrollbars',
            '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"',
        ],
        // headful on purpose: headless chrome is detected far more easily
        headless: false,
    });

    const page = await browser.newPage();
    await page.goto('https://www.google.com/recaptcha/api2/demo');
    await page.waitFor(1000);

    // the captcha widget lives in cross-origin iframes
    const frames = page.frames();
    console.info('Available frames', frames.map(frame => frame.name()));
    console.info('Available frame urls', frames.map(frame => frame.url()));

    const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?'));
    const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?'));

    if (!frame || !content_frame) {
        throw new Error('Could not locate the recaptcha anchor/bframe iframes.');
    }

    // tick the "I'm not a robot" checkbox
    await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 });
    await page.waitFor(1000);
    const button = await frame.$('#recaptcha-anchor');
    await button.click();

    // switch to the audio challenge
    await content_frame.waitForSelector('#recaptcha-audio-button');
    const audio_button = await content_frame.$('#recaptcha-audio-button');
    await audio_button.click();
    await page.waitFor(1000);

    await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link');
    // BUGFIX: querySelectorAll() returns a NodeList, which has no
    // getAttribute(); querySelector() yields the single anchor element.
    let download_link = await content_frame.evaluate(() => {
        return document.querySelector('.rc-audiochallenge-tdownload-link').getAttribute('href');
    });

    console.log('Got audio download link: ', download_link);

    // wait until the audio file is fully written before closing the browser,
    // so download errors propagate into the .catch() below
    await new Promise((resolve, reject) => {
        got.stream(download_link)
            .pipe(fs.createWriteStream('audio.mp3'))
            .on('finish', resolve)
            .on('error', reject);
    });

    await browser.close();
})().catch(err => {
    // BUGFIX: the original wrapped the async IIFE in a synchronous
    // try/catch, which can never catch rejections of the returned promise.
    console.error(err);
});
/*
TODO: port the audio transcription logic of uncaptcha2 to JavaScript:
https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py
*/
// TODO: not implemented yet. Intended to send the downloaded audio.mp3 to a
// speech-to-text service and return the transcription (see the Python
// reference implementation linked above).
async function translate_audio_file() {
}

View File

@ -21,6 +21,7 @@ async function scrape_bing_pup(page, event, context, pluggable) {
for (var i = 0; i < keywords.length; i++) { for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i]; keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) { if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({ await pluggable.before_keyword_scraped({
@ -33,23 +34,35 @@ async function scrape_bing_pup(page, event, context, pluggable) {
try { try {
const input = await page.$('input[name="q"]'); const input = await page.$('input[name="q"]');
// overwrites last text in input await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await input.click({ clickCount: 3 }); await sfunctions.sleep(50);
await input.type(keyword);
await input.focus(); await input.focus();
await page.keyboard.press("Enter"); await page.keyboard.press("Enter");
if (event.sleep_range) { let page_num = 1;
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#b_content', { timeout: 5000 }); do {
if (event.debug === true && event.is_local === true) { if (event.verbose === true) {
await page.screenshot({path: `debug/${keyword}.png`}); console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
} }
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#b_content', { timeout: 5000 });
await sfunctions.sleep(500);
let html = await page.content();
results[keyword][page_num] = parse(html);
let html = await page.content(); page_num += 1;
results[keyword] = parse(html);
let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();
} while (page_num <= event.num_pages)
} catch (e) { } catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`); console.error(`Problem with scraping ${keyword}: ${e}`);

View File

@ -25,8 +25,8 @@ async function scrape_google_pup(page, event, context, pluggable) {
var results = {}; var results = {};
for (var i = 0; i < keywords.length; i++) { for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i]; keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) { if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({ await pluggable.before_keyword_scraped({
@ -37,54 +37,63 @@ async function scrape_google_pup(page, event, context, pluggable) {
}); });
} }
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try { try {
const input = await page.$('input[name="q"]'); const input = await page.$('input[name="q"]');
// await input.click({ clickCount: 3 }); await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
// await sfunctions.sleep(50); await sfunctions.sleep(50);
//await input.type(keyword);
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus(); await input.focus();
await page.keyboard.press("Enter"); await page.keyboard.press("Enter");
if (event.sleep_range) { let page_num = 1;
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT }); do {
await sfunctions.sleep(500); if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
}
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
await sfunctions.sleep(500);
let html = await page.content();
results[keyword][page_num] = parse_google_results(html);
page_num += 1;
let next_page_link = await page.$('#pnnext', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();
} while (page_num <= event.num_pages)
} catch (e) { } catch (e) {
console.error(`Problem with scraping ${keyword}.`); console.error(`Problem with scraping ${keyword}.`);
console.error(e); console.error(e);
if (await scraping_detected(page) === true) { if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.'); console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) { if (event.is_local === true) {
await sfunctions.sleep(SOLVE_CAPTCHA_TIME); await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
console.error('You have 45 seconds to enter the captcha.'); console.error('You have 45 seconds to enter the captcha.');
// expect that user filled out necessary captcha // expect that user filled out necessary captcha
} else { } else {
return results; return results;
} }
} else { } else {
// some other error, quit scraping process if stuff is broken // some other error, quit scraping process if stuff is broken
if (event.is_local === true) { if (event.is_local === true) {
console.error('You have 30 seconds to fix this.'); console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000); await sfunctions.sleep(30000);
} else { } else {
return results; return results;
} }
} }
} }
let html = await page.content();
results[keyword] = parse_google_results(html);
} }
return results; return results;

View File

@ -12,13 +12,14 @@ async function get_metadata(browser) {
waitLoad: true, waitLoad: true,
waitNetworkIdle: true // defaults to false waitNetworkIdle: true // defaults to false
}); });
let json = await page.content(); let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json); const $ = cheerio.load(json);
metadata.ipinfo = $('pre').text(); metadata.ipinfo = $('pre').text();
return metadata; return metadata;
} }
async function get_http_headers(browser) { async function get_http_headers(browser) {
let metadata = {}; let metadata = {};
const page = await browser.newPage(); const page = await browser.newPage();

39
src/modules/se_scraper.js Normal file
View File

@ -0,0 +1,39 @@
// Maps a search engine name to its start URL.
// NOTE(review): only a 'google' placeholder entry exists and the constant
// appears unused in this module so far — confirm before relying on it.
const start_url = {
    'google': ''
};
/*
Background reading on class inheritance in JavaScript:
https://javascript.info/class-inheritance
*/
module.exports = class Scraper {
constructor(options = {}) {
const {
searchEngine = 'google',
numPages = 1,
pluggable = null,
} = options;
this.pluggable = pluggable;
this.searchEngine = searchEngine;
this.numPages = numPages;
this.results = {}
}
async load_search_engine() {
}
async search_keyword() {
}
parse() {
}
async next_page() {
}
async detected() {
}
};

View File

@ -3,80 +3,79 @@ module.exports = {
}; };
function random_user_agent() { function random_user_agent() {
return user_agents[Math.floor(Math.random()*user_agents.length)]; let rand = user_agents[Math.floor(Math.random()*user_agents.length)];
} }
// updated: 29 Jan 2019 // updated: 29 Jan 2019
const user_agents = [ const user_agents = [
['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1', 'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0', 'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36', 'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
]
]; ];

View File

@ -22,7 +22,7 @@ function write_results(fname, data) {
module.exports.handler = async function handler (event, context, callback) { module.exports.handler = async function handler (event, context, callback) {
config = event; config = event;
pluggable = null; pluggable = {};
if (config.custom_func) { if (config.custom_func) {
if (fs.existsSync(config.custom_func)) { if (fs.existsSync(config.custom_func)) {
try { try {
@ -43,8 +43,11 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(config); console.log(config);
} }
const ADDITIONAL_CHROME_FLAGS = [ var ADDITIONAL_CHROME_FLAGS = [
//'--proxy-server=' + proxy, '--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox', '--no-sandbox',
'--disable-setuid-sandbox', '--disable-setuid-sandbox',
'--disable-dev-shm-usage', '--disable-dev-shm-usage',
@ -70,16 +73,27 @@ module.exports.handler = async function handler (event, context, callback) {
) )
} }
if (config.proxy) {
// check this out bubbles
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
// "http", "socks", "socks4", "socks5".
ADDITIONAL_CHROME_FLAGS.push(
'--proxy-server=' + config.proxy,
)
}
let launch_args = { let launch_args = {
args: ADDITIONAL_CHROME_FLAGS, args: ADDITIONAL_CHROME_FLAGS,
headless: config.headless, headless: config.headless,
ignoreHTTPSErrors: true,
}; };
if (config.debug === true) { if (config.debug === true) {
console.log("Chrome Args: ", launch_args); console.log("Chrome Args: ", launch_args);
} }
if (pluggable) { if (pluggable.start_browser) {
launch_args.config = config; launch_args.config = config;
browser = await pluggable.start_browser(launch_args); browser = await pluggable.start_browser(launch_args);
} else { } else {
@ -91,6 +105,30 @@ module.exports.handler = async function handler (event, context, callback) {
console.dir(headers); console.dir(headers);
} }
let metadata = {};
if (config.write_meta_data === true) {
metadata = await meta.get_metadata(browser);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (config.proxy && config.write_meta_data === true) {
console.log(`${metadata.ipinfo} vs ${config.proxy}`);
try {
let ipdata = JSON.parse(metadata.ipinfo);
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!config.proxy.includes(ipdata.ip)) {
console.error('Proxy not working properly.');
await browser.close();
return;
}
} catch (exception) {
}
}
const page = await browser.newPage(); const page = await browser.newPage();
// block some assets to speed up scraping // block some assets to speed up scraping
@ -127,13 +165,8 @@ module.exports.handler = async function handler (event, context, callback) {
marketwatch: tickersearch.scrape_marketwatch_finance_pup, marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine](page, config, context, pluggable); }[config.search_engine](page, config, context, pluggable);
let metadata = {};
if (config.write_meta_data === true) { if (pluggable.close_browser) {
metadata = await meta.get_metadata(browser);
}
if (pluggable) {
await pluggable.close_browser(); await pluggable.close_browser();
} else { } else {
await browser.close(); await browser.close();
@ -155,7 +188,7 @@ module.exports.handler = async function handler (event, context, callback) {
results = zlib.deflateSync(results).toString('base64'); results = zlib.deflateSync(results).toString('base64');
} }
if (pluggable && pluggable.handle_results) { if (pluggable.handle_results) {
await pluggable.handle_results({ await pluggable.handle_results({
config: config, config: config,
results: results, results: results,
@ -172,7 +205,7 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(metadata); console.log(metadata);
} }
if (pluggable) { if (pluggable.handle_metadata) {
await pluggable.handle_metadata({metadata: metadata, config: config}); await pluggable.handle_metadata({metadata: metadata, config: config});
} }
} }