Resolved some issues: proxy support is now possible, and scraping more than one page per keyword now works.

This commit is contained in:
Nikolai Tschacher 2019-01-29 22:48:08 +01:00
parent 89441070cd
commit 9e62f23451
14 changed files with 764 additions and 340 deletions

README.md

@@ -34,6 +34,46 @@ Scraping is done with a headless chromium browser using the automation library puppeteer.
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me on hire@incolumitas.com
The chromium browser is started with the following flags to prevent
scraping detection.
```js
var ADDITIONAL_CHROME_FLAGS = [
    '--disable-infobars',
    '--window-position=0,0',
    '--ignore-certificate-errors',
    '--ignore-certificate-errors-spki-list',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-accelerated-2d-canvas',
    '--disable-gpu',
    '--window-size=1920x1080',
    '--hide-scrollbars',
];
```
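These flags end up in the `args` option of `puppeteer.launch()`. A minimal sketch (not taken verbatim from the se-scraper sources):

```js
const puppeteer = require('puppeteer');

(async () => {
    // start chromium with the evasion flags defined above
    const browser = await puppeteer.launch({
        args: ADDITIONAL_CHROME_FLAGS,
        headless: true,
    });
    const page = await browser.newPage();
    // ... scrape ...
    await browser.close();
})();
```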
Furthermore, to avoid loading unnecessary resources and to speed up
scraping considerably, we instruct Chrome not to load images and CSS:
```js
await page.setRequestInterception(true);
page.on('request', (req) => {
let type = req.resourceType();
const block = ['stylesheet', 'font', 'image', 'media'];
if (block.includes(type)) {
req.abort();
} else {
req.continue();
}
});
```
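Note that request interception is configured per page and must be enabled before the first navigation. A rough usage sketch, assuming a `browser` obtained from `puppeteer.launch()` and running inside an async function:

```js
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on('request', (req) => {
    // abort stylesheets, fonts, images and media as shown above
    const block = ['stylesheet', 'font', 'image', 'media'];
    block.includes(req.resourceType()) ? req.abort() : req.continue();
});
await page.goto('https://www.google.com/');
```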
#### Making puppeteer and headless chrome undetectable
Consider the following resources:
* https://intoli.com/blog/making-chrome-headless-undetectable/
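
One evasion from that article (a sketch; se-scraper itself may handle this differently) is hiding the `navigator.webdriver` property that headless Chrome exposes, before any page script runs:

```js
await page.evaluateOnNewDocument(() => {
    // headless chrome sets navigator.webdriver = true, a common detection signal
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
    });
});
```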
### Installation and Usage

@@ -53,12 +93,12 @@ let config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
// get meta data of scraping in return object
write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
@@ -68,9 +108,11 @@ let config = {
// this output is informational
verbose: false,
// an array of keywords to scrape
keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 2,
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
@@ -84,9 +126,13 @@ let config = {
// must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
};

function callback(err, response) {
if (err) { console.error(err) }

/* response object has the following properties:
@@ -97,7 +143,9 @@ se_scraper.scrape(config, (err, response) => {
*/

console.dir(response.results, {depth: null, colors: true});
}

se_scraper.scrape(config, callback);
```

Supported options for the `search_engine` config key:

@@ -123,199 +171,179 @@ Supported options for the `search_engine` config key:
'marketwatch'
```
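Because the engine is just a config key, the same keywords can be run against several engines in a row. A sketch reusing the `config` and `callback` objects from the example above (and assuming `se_scraper` was required as shown there):

```js
for (let engine of ['google', 'bing', 'duckduckgo']) {
    // copy the config and only swap the search engine
    se_scraper.scrape(Object.assign({}, config, { search_engine: engine }), callback);
}
```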
Output for the above script on my machine:

```text
{ 'scraping scrapeulous.com':
   { '1':
      { time: 'Tue, 29 Jan 2019 21:39:22 GMT',
        num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
        no_results: false,
        effective_query: '',
        results:
         [ { link: 'https://scrapeulous.com/',
             title: 'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
             snippet: 'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
             visible_link: 'https://scrapeulous.com/',
             date: '',
             rank: 1 },
           { link: 'https://scrapeulous.com/about/',
             title: 'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
             snippet: 'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
             visible_link: 'https://scrapeulous.com/about/',
             date: '',
             rank: 2 },
           { link: 'https://scrapeulous.com/howto/',
             title: 'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
             snippet: 'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
             visible_link: 'https://scrapeulous.com/howto/',
             date: '',
             rank: 3 },
           { link: 'https://github.com/NikolaiT/se-scraper',
             title: 'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
             snippet: '24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
             visible_link: 'https://github.com/NikolaiT/se-scraper',
             date: '24.12.2018 - ',
             rank: 4 },
           { link: 'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
             title: 'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
             snippet: 'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
             visible_link: 'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
             date: '',
             rank: 5 },
           { link: 'https://googlescraper.readthedocs.io/',
             title: 'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
             snippet: 'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
             visible_link: 'https://googlescraper.readthedocs.io/',
             date: '',
             rank: 6 },
           { link: 'https://incolumitas.com/pages/scrapeulous/',
             title: 'Coding, Learning and Business Ideas Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
             snippet: 'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
             visible_link: 'https://incolumitas.com/pages/scrapeulous/',
             date: '',
             rank: 7 },
           { link: 'https://incolumitas.com/',
             title: 'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
             snippet: 'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
             visible_link: 'https://incolumitas.com/',
             date: '',
             rank: 8 },
           { link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
             title: 'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
             snippet: 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
             visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
             date: '',
             rank: 9 },
           { link: 'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
             title: 'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
             snippet: '23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
             visible_link: 'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
             date: '23.12.2018 - ',
             rank: 10 } ] },
     '2':
      { time: 'Tue, 29 Jan 2019 21:39:24 GMT',
        num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
        no_results: false,
        effective_query: '',
        results:
         [ { link: 'https://pypi.org/project/CountryGoogleScraper/',
             title: 'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
             snippet: 'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
             visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
             date: '',
             rank: 1 },
           { link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
             title: 'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
             snippet: 'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
             visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
             date: '',
             rank: 3 },
           { link: 'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
             title: 'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
             snippet: '24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
             visible_link: 'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
             date: '24.01.2015 - ',
             rank: 4 },
           { link: 'https://twitter.com/incolumitas_?lang=de',
             title: 'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
             snippet: 'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
             visible_link: 'https://twitter.com/incolumitas_?lang=de',
             date: '',
             rank: 5 },
           { link: 'http://blog.shodan.io/hostility-in-the-python-package-index/',
             title: 'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
             snippet: '22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
             visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
             date: '22.02.2015 - ',
             rank: 6 },
           { link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
             title: 'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
             snippet: 'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
             visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
             date: '',
             rank: 7 },
           { link: 'https://pydigger.com/pypi/CountryGoogleScraper',
             title: 'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
             snippet: '19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
             visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
             date: '19.10.2016 - ',
             rank: 8 },
           { link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
             title: 'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
             snippet: 'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
             visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
             date: '',
             rank: 9 },
           { link: 'https://www.revolvy.com/page/Search-engine-scraping',
             title: 'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
             snippet: 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
             visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
             date: '',
             rank: 10 } ] } } }
```
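As the output shows, `response.results` is keyed first by keyword and then by page number (as a string). A consumer would iterate it roughly like this (a sketch assuming the `response` object from the callback above):

```js
for (let keyword in response.results) {
    for (let page_num in response.results[keyword]) {
        let serp = response.results[keyword][page_num];
        console.log(keyword, page_num, serp.num_results, serp.results.length);
    }
}
```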


@@ -14,6 +14,17 @@
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
29.1.2019
- implement proxy support functionality
- implement proxy check
- implement scraping more than 1 page
- do it for google
- and bing
- implement duckduckgo scraping
TODO:
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
- add proxy support
@@ -24,3 +35,27 @@ TODO:
- think whether it makes sense to introduce a generic scraping class?
- is scraping abstractable or is every scraper too unique?
- don't make the same mistakes as with GoogleScraper
TODO:
okay, it's time to build a generic scraping class like in GoogleScraper;
i feel like history repeats itself

class Scraper
    constructor(options = {}) {
    }
    async load_search_engine() {}
    async search_keyword() {}
    async new_page() {}
    async detected() {}

then each search engine derives from this generic class
some search engines do not need such an abstract class, because they are too complex

File diff suppressed because one or more lines are too long


@@ -35,6 +35,10 @@ exports.scrape = async function(config, callback) {
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
};

// overwrite default config

package-lock.json (generated)

@@ -1,9 +1,22 @@
{
"name": "se-scraper",
"version": "1.1.7",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"@sindresorhus/is": {
"version": "0.14.0",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-0.14.0.tgz",
"integrity": "sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ=="
},
"@szmarczak/http-timer": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-1.1.2.tgz",
"integrity": "sha512-XIB2XbzHTN6ieIjfIMV9hlVcfPU26s2vafYWQcZHWXHOxiaRZYEDKEwdl129Zyg50+foYV2jCgtrqSA6qNuNSA==",
"requires": {
"defer-to-connect": "^1.0.1"
}
},
"@types/node": { "@types/node": {
"version": "10.12.18", "version": "10.12.18",
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
@ -51,6 +64,20 @@
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
}, },
"cacheable-request": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz",
"integrity": "sha512-2N7AmszH/WPPpl5Z3XMw1HAP+8d+xugnKQAeKvxFZ/04dbT/CAznqwbl+7eSr3HkwdepNwtb2yx3CAMQWvG01Q==",
"requires": {
"clone-response": "^1.0.2",
"get-stream": "^4.0.0",
"http-cache-semantics": "^4.0.0",
"keyv": "^3.0.0",
"lowercase-keys": "^1.0.1",
"normalize-url": "^3.1.0",
"responselike": "^1.0.2"
}
},
"chai": { "chai": {
"version": "4.2.0", "version": "4.2.0",
"resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz", "resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz",
@ -82,6 +109,14 @@
"parse5": "^3.0.1" "parse5": "^3.0.1"
} }
}, },
"clone-response": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
"integrity": "sha1-0dyXOSAxTfZ/vrlCI7TuNQI56Ws=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"concat-map": { "concat-map": {
"version": "0.0.1", "version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@ -151,6 +186,14 @@
"ms": "^2.1.1" "ms": "^2.1.1"
} }
}, },
"decompress-response": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-3.3.0.tgz",
"integrity": "sha1-gKTdMjdIOEv6JICDYirt7Jgq3/M=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"deep-eql": { "deep-eql": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz", "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz",
@ -159,6 +202,11 @@
"type-detect": "^4.0.0" "type-detect": "^4.0.0"
} }
}, },
"defer-to-connect": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
"integrity": "sha512-k09hcQcTDY+cwgiwa6PYKLm3jlagNzQ+RSvhjzESOGOx+MNOuXkxTfEvPrO1IOQ81tArCFYQgi631clB70RpQw=="
},
"dom-serializer": { "dom-serializer": {
"version": "0.1.0", "version": "0.1.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
@ -197,6 +245,19 @@
"domelementtype": "1" "domelementtype": "1"
} }
}, },
"duplexer3": {
"version": "0.1.4",
"resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz",
"integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI="
},
"end-of-stream": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
"integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==",
"requires": {
"once": "^1.4.0"
}
},
"entities": { "entities": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
@ -259,6 +320,14 @@
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
"integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE=" "integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE="
}, },
"get-stream": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz",
"integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==",
"requires": {
"pump": "^3.0.0"
}
},
"glob": { "glob": {
"version": "7.1.3", "version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
@ -272,6 +341,24 @@
"path-is-absolute": "^1.0.0" "path-is-absolute": "^1.0.0"
} }
}, },
"got": {
"version": "9.6.0",
"resolved": "https://registry.npmjs.org/got/-/got-9.6.0.tgz",
"integrity": "sha512-R7eWptXuGYxwijs0eV+v3o6+XH1IqVK8dJOEecQfTmkncw9AV4dcw/Dhxi8MdlqPthxxpZyizMzyg8RTmEsG+Q==",
"requires": {
"@sindresorhus/is": "^0.14.0",
"@szmarczak/http-timer": "^1.1.2",
"cacheable-request": "^6.0.0",
"decompress-response": "^3.3.0",
"duplexer3": "^0.1.4",
"get-stream": "^4.1.0",
"lowercase-keys": "^1.0.1",
"mimic-response": "^1.0.1",
"p-cancelable": "^1.0.0",
"to-readable-stream": "^1.0.0",
"url-parse-lax": "^3.0.0"
}
},
"htmlparser2": { "htmlparser2": {
"version": "3.10.0", "version": "3.10.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz",
@ -285,6 +372,11 @@
"readable-stream": "^3.0.6" "readable-stream": "^3.0.6"
} }
}, },
"http-cache-semantics": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz",
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
},
"https-proxy-agent": { "https-proxy-agent": {
"version": "2.2.1", "version": "2.2.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
@ -323,16 +415,39 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
}, },
"json-buffer": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz",
"integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg="
},
"keyv": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz",
"integrity": "sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==",
"requires": {
"json-buffer": "3.0.0"
}
},
"lodash": { "lodash": {
"version": "4.17.11", "version": "4.17.11",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==" "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
}, },
"lowercase-keys": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
"integrity": "sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA=="
},
"mime": { "mime": {
"version": "2.4.0", "version": "2.4.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz", "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w==" "integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
}, },
"mimic-response": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ=="
},
"minimatch": { "minimatch": {
"version": "3.0.4", "version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
@ -359,6 +474,11 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
}, },
"normalize-url": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
"integrity": "sha512-U+JJi7duF1o+u2pynbp2zXDW2/PADgC30f0GsHZtRh+HOcXHnw137TrNlyxxRvWW5fjKd3bcLHPxofWuCjaeZg=="
},
"nth-check": { "nth-check": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
@ -375,6 +495,11 @@
"wrappy": "1" "wrappy": "1"
} }
}, },
"p-cancelable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz",
"integrity": "sha512-USgPoaC6tkTGlS831CxsVdmZmyb8tR1D+hStI84MyckLOzfJlYQUweomrwE3D8T7u5u5GVuW064LT501wHTYYA=="
},
"parse5": { "parse5": {
"version": "3.0.3", "version": "3.0.3",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
@ -398,6 +523,11 @@
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
}, },
"prepend-http": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz",
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
},
"process-nextick-args": { "process-nextick-args": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
@ -413,6 +543,15 @@
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=" "integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
}, },
"pump": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
"requires": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"puppeteer": { "puppeteer": {
"version": "1.11.0", "version": "1.11.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz", "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz",
@ -438,6 +577,14 @@
"util-deprecate": "^1.0.1" "util-deprecate": "^1.0.1"
} }
}, },
"responselike": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/responselike/-/responselike-1.0.2.tgz",
"integrity": "sha1-kYcg7ztjHFZCvgaPFa3lpG9Loec=",
"requires": {
"lowercase-keys": "^1.0.0"
}
},
"rimraf": { "rimraf": {
"version": "2.6.2", "version": "2.6.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
@ -459,6 +606,11 @@
"safe-buffer": "~5.1.0" "safe-buffer": "~5.1.0"
} }
}, },
"to-readable-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
},
"type-detect": { "type-detect": {
"version": "4.0.8", "version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
@ -469,6 +621,14 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
}, },
"url-parse-lax": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",
"integrity": "sha1-FrXK/Afb42dsGxmZF3gj1lA6yww=",
"requires": {
"prepend-http": "^2.0.0"
}
},
"util-deprecate": { "util-deprecate": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",

package.json

@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.1.8",
"description": "A simple module which uses puppeteer to scrape several search engines.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",
@@ -22,6 +22,7 @@
"dependencies": {
"chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2",
"got": "^9.6.0",
"puppeteer": "^1.9.0"
}
}

run.js

@@ -10,7 +10,7 @@ let config = {
write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
@@ -20,9 +20,11 @@ let config = {
// this output is informational
verbose: false,
// an array of keywords to scrape
keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 1,
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
@@ -35,7 +37,11 @@ let config = {
// get_browser, handle_metadata, close_browser
// must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
};

function callback(err, response) {

src/captcha_solver.js (new file)

@@ -0,0 +1,96 @@
/*
There are essentially two strategies to handle a search engine showing you a captcha:
1. Solve the captcha
https://github.com/ecthros/uncaptcha2
or use a captcha solving service such as https://anti-captcha.com/mainpage
2. Switch your IP address with rotating proxies
*/
/**
* @name download recaptcha2 audio captcha
*
* There are several issues:
*
* Google sees that we are using an automated browser.
*
* In the worst case we have to completely control the browser ourselves without puppeteer.
*
* https://github.com/ecthros/uncaptcha2
*
* See here:
*
* https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1
*
* https://github.com/GoogleChrome/puppeteer/issues/3039
*
* https://intoli.com/blog/making-chrome-headless-undetectable/
*
* @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const got = require('got');
try {
    (async () => {
        const browser = await puppeteer.launch({
            args: [
                '--proxy-server=socks5://78.94.172.42:1080',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu',
                '--window-size=1920x1080',
                '--hide-scrollbars',
                '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"',
            ],
            headless: false,
        });
        const page = await browser.newPage();
        await page.goto('https://www.google.com/recaptcha/api2/demo');
        await page.waitFor(1000);

        const frames = page.frames();
        console.info('Available frames', frames.map(frame => frame.name()));
        console.info('Available frame urls', frames.map(frame => frame.url()));
        const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?'));
        const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?'));

        await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 });
        await page.waitFor(1000);
        const button = await frame.$('#recaptcha-anchor');
        await button.click();

        await content_frame.waitForSelector('#recaptcha-audio-button');
        const audio_button = await content_frame.$('#recaptcha-audio-button');
        await audio_button.click();
        await page.waitFor(1000);

        await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link');
        let download_link = await content_frame.evaluate(() => {
            // querySelector (not querySelectorAll): we need a single element to read its href
            return document.querySelector('.rc-audiochallenge-tdownload-link').getAttribute('href');
        });
        console.log('Got audio download link: ', download_link);
        got.stream(download_link).pipe(fs.createWriteStream('audio.mp3'));

        await browser.close();
    })()
} catch (err) {
    console.error(err)
}
/*
TODO: port this into JS: https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py
*/
async function translate_audio_file() {
}
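/*
A rough sketch of what translate_audio_file() might become, assuming some
generic speech-to-text HTTP API; the endpoint below is a hypothetical
placeholder, not a real service:

async function translate_audio_file(fname) {
    const audio = fs.readFileSync(fname);
    const response = await got.post('https://speech-to-text.example.com/recognize', {
        // send the raw mp3 bytes; a real service will dictate its own format
        body: audio,
        headers: { 'content-type': 'audio/mp3' },
    });
    return response.body;
}
*/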


@@ -21,6 +21,7 @@ async function scrape_bing_pup(page, event, context, pluggable) {
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
@@ -33,23 +34,35 @@
try {
const input = await page.$('input[name="q"]');
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");

let page_num = 1;

do {
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
}

if (event.sleep_range) {
await sfunctions.random_sleep(event);
}

await page.waitForSelector('#b_content', { timeout: 5000 });
await sfunctions.sleep(500);

let html = await page.content();
results[keyword][page_num] = parse(html);

page_num += 1;

let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();

} while (page_num <= event.num_pages)

} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);


@@ -25,8 +25,8 @@ async function scrape_google_pup(page, event, context, pluggable) {
var results = {};
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
@@ -37,26 +37,38 @@
});
}

try {
const input = await page.$('input[name="q"]');
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");

let page_num = 1;

do {
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
}

if (event.sleep_range) {
await sfunctions.random_sleep(event);
}

await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(500);

let html = await page.content();
results[keyword][page_num] = parse_google_results(html);

page_num += 1;

let next_page_link = await page.$('#pnnext', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();

} while (page_num <= event.num_pages)

} catch (e) {
console.error(`Problem with scraping ${keyword}.`);
@@ -82,9 +94,6 @@
}
}
}
}

return results;


@@ -12,13 +12,14 @@ async function get_metadata(browser) {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
metadata.ipinfo = $('pre').text();
return metadata;
}

async function get_http_headers(browser) {
let metadata = {};
const page = await browser.newPage();

src/modules/se_scraper.js (new file)

@@ -0,0 +1,39 @@
const start_url = {
    'google': ''
};

/*
    Read this: https://javascript.info/class-inheritance
*/
module.exports = class Scraper {
    constructor(options = {}) {
        const {
            searchEngine = 'google',
            numPages = 1,
            pluggable = null,
        } = options;
        this.pluggable = pluggable;
        this.searchEngine = searchEngine;
        this.numPages = numPages;
        this.results = {};
    }

    async load_search_engine() {
    }

    async search_keyword() {
    }

    parse() {
    }

    async next_page() {
    }

    async detected() {
    }
};
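/*
A sketch of how a concrete engine could derive from this class; the
GoogleScraper subclass below is illustrative and not part of this commit:

class GoogleScraper extends Scraper {
    async search_keyword(keyword) {
        // type the keyword into input[name="q"] and press Enter
    }
    async next_page() {
        // click '#pnnext' and wait for navigation; signal "done" when absent
    }
}
*/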


@@ -3,12 +3,12 @@ module.exports = {
};

function random_user_agent() {
let rand = user_agents[Math.floor(Math.random()*user_agents.length)];
return rand;
}

// updated: 29 Jan 2019

const user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
@@ -78,5 +78,4 @@ const user_agents = [
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
];


@@ -22,7 +22,7 @@ function write_results(fname, data) {
module.exports.handler = async function handler (event, context, callback) {
config = event;
pluggable = {};
if (config.custom_func) {
if (fs.existsSync(config.custom_func)) {
try {
@@ -43,8 +43,11 @@
console.log(config);
}

var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certificate-errors',
'--ignore-certificate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
@@ -70,16 +73,27 @@
)
}
if (config.proxy) {
// how to set chrome's proxy settings on the command line:
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
// "http", "socks", "socks4", "socks5".
ADDITIONAL_CHROME_FLAGS.push(
'--proxy-server=' + config.proxy,
)
}
let launch_args = {
args: ADDITIONAL_CHROME_FLAGS,
headless: config.headless,
ignoreHTTPSErrors: true,
};

if (config.debug === true) {
console.log("Chrome Args: ", launch_args);
}
if (pluggable.start_browser) {
launch_args.config = config;
browser = await pluggable.start_browser(launch_args);
} else {
@@ -91,6 +105,30 @@
console.dir(headers);
}
let metadata = {};
if (config.write_meta_data === true) {
metadata = await meta.get_metadata(browser);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (config.proxy && config.write_meta_data === true) {
console.log(`${metadata.ipinfo} vs ${config.proxy}`);
try {
let ipdata = JSON.parse(metadata.ipinfo);
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!config.proxy.includes(ipdata.ip)) {
console.error('Proxy not working properly.');
await browser.close();
return;
}
} catch (exception) {
}
}
const page = await browser.newPage();

// block some assets to speed up scraping
@@ -127,13 +165,8 @@
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine](page, config, context, pluggable);

if (pluggable.close_browser) {
await pluggable.close_browser();
} else {
await browser.close();
@@ -155,7 +188,7 @@
results = zlib.deflateSync(results).toString('base64');
}

if (pluggable.handle_results) {
await pluggable.handle_results({
config: config,
results: results,
@@ -172,7 +205,7 @@
console.log(metadata);
}

if (pluggable.handle_metadata) {
await pluggable.handle_metadata({metadata: metadata, config: config});
}
}