resolved some issues. proxy possible now. scraping for more than one page possible now

This commit is contained in:
Nikolai Tschacher 2019-01-29 22:48:08 +01:00
parent 89441070cd
commit 9e62f23451
14 changed files with 764 additions and 340 deletions

426
README.md
View File

@ -34,10 +34,50 @@ Scraping is done with a headless chromium browser using the automation library p
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at hire@incolumitas.com
The chromium browser is started with the following flags to prevent
scraping detection.
```js
var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
```
Furthermore, to avoid loading unnecessary resources and to speed up
scraping a great deal, we instruct Chrome not to load stylesheets, fonts, images and media:
```js
await page.setRequestInterception(true);
page.on('request', (req) => {
let type = req.resourceType();
const block = ['stylesheet', 'font', 'image', 'media'];
if (block.includes(type)) {
req.abort();
} else {
req.continue();
}
});
```
#### Making puppeteer and headless chrome undetectable
Consider the following resources:
* https://intoli.com/blog/making-chrome-headless-undetectable/
### Installation and Usage
Install with
Install with
```bash
npm install se-scraper
@ -53,12 +93,12 @@ let config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to true, a random user agent is chosen
random_user_agent: false,
random_user_agent: true,
// get meta data of scraping in return object
write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
@ -68,9 +108,11 @@ let config = {
// this output is informational
verbose: false,
// an array of keywords to scrape
keywords: ['scrapeulous.com', ],
keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 2,
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
@ -84,9 +126,13 @@ let config = {
// must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
};
se_scraper.scrape(config, (err, response) => {
function callback(err, response) {
if (err) { console.error(err) }
/* response object has the following properties:
@ -97,7 +143,9 @@ se_scraper.scrape(config, (err, response) => {
*/
console.dir(response.results, {depth: null, colors: true});
});
}
se_scraper.scrape(config, callback);
```
Supported options for the `search_engine` config key:
@ -123,199 +171,179 @@ Supported options for the `search_engine` config key:
'marketwatch'
```
Output for the above script on my laptop:
Output for the above script on my machine:
```text
Scraper took 4295ms to scrape 2 keywords.
On average ms/keyword: 2147.5ms/keyword
{ 'incolumitas.com scraping':
{ time: 'Mon, 24 Dec 2018 13:07:43 GMT',
num_results: 'Ungefähr 2020 Ergebnisse (0.18 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link:
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
title:
'Coding, Learning and Business Ideas Tutorial: Youtube scraping ...',
snippet:
'29.10.2018 - In this blog post I am going to show you how to scrape YouTube video data using the handy puppeteer library. Puppeteer is a Node library ...',
visible_link:
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
date: '29.10.2018 - ',
rank: 1 },
{ link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
title:
'GoogleScraper Tutorial - How to scrape 1000 keywords with Google',
snippet:
'05.09.2018 - Tutorial that teaches how to use GoogleScraper to scrape 1000 keywords with 10 selenium browsers.',
visible_link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
date: '05.09.2018 - ',
rank: 2 },
{ link: 'https://incolumitas.com/tag/scraping.html',
title: 'Coding, Learning and Business Ideas Tag Scraping',
snippet:
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
visible_link: 'https://incolumitas.com/tag/scraping.html',
date: '',
rank: 3 },
{ link: 'https://incolumitas.com/category/scraping.html',
title: 'Coding, Learning and Business Ideas Category Scraping',
snippet:
'Nikolai Tschacher\'s ideas and projects around IT security and computer science.',
visible_link: 'https://incolumitas.com/category/scraping.html',
date: '',
rank: 4 },
{ link:
'https://github.com/NikolaiT/incolumitas/blob/master/content/Meta/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo.md',
title:
'incolumitas/scraping-and-extracting-links-from-any-major-search ...',
snippet:
'Title: Scraping and Extracting Links from any major Search Engine like Google, Yandex, Baidu, Bing and Duckduckgo Date: 2014-11-12 00:47 Author: Nikolai ...',
visible_link:
'https://github.com/.../incolumitas/.../scraping-and-extracting-links...',
date: '',
rank: 5 },
{ link:
'https://stackoverflow.com/questions/16955325/scraping-google-results-with-python',
title: 'Scraping Google Results with Python - Stack Overflow',
snippet:
'I found this. incolumitas.com/2013/01/06/… But the author claims it is not ported to 2.7 yet. user2351394 Jun 6 \'13 at 6:59 ...',
visible_link:
'https://stackoverflow.com/.../scraping-google-results-with-python',
date: '',
rank: 6 },
{ link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
title: 'GoogleScraper · PyPI',
snippet:
'[5]: http://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/ ...',
visible_link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
date: '',
rank: 7 },
{ link:
'https://www.reddit.com/r/Python/comments/2m0vyu/scraping_links_on_google_yandex_bing_duckduckgo/',
title:
'Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and ...',
snippet:
'12.11.2014 - Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and other search engines with Python ... submitted 4 years ago by incolumitas.',
visible_link:
'https://www.reddit.com/.../scraping_links_on_google_yandex_bi...',
date: '12.11.2014 - ',
rank: 9 },
{ link: 'https://twitter.com/incolumitas_?lang=de',
title: 'Nikolai Tschacher (@incolumitas_) | Twitter',
snippet:
'Embed Tweet. How to use GoogleScraper to scrape images and download them ... Learn how to scrape millions of url from yandex and google or bing with: ...',
visible_link: 'https://twitter.com/incolumitas_?lang=de',
date: '',
rank: 10 } ] },
'best scraping framework':
{ time: 'Mon, 24 Dec 2018 13:07:44 GMT',
num_results: 'Ungefähr 2820000 Ergebnisse (0.36 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet: '',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '',
rank: 1 },
{ link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet: '',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '',
rank: 2 },
{ link:
'https://www.scrapehero.com/open-source-web-scraping-frameworks-and-tools/',
title:
'Best Open Source Web Scraping Frameworks and Tools - ScrapeHero',
snippet:
'05.06.2018 - List of Open Source Web Scraping Frameworks. Scrapy. MechanicalSoup. PySpider. Portia. Apify SDK. Nodecrawler. Selenium WebDriver. Puppeteer.',
visible_link:
'https://www.scrapehero.com/open-source-web-scraping-framewo...',
date: '05.06.2018 - ',
rank: 3 },
{ link:
'https://medium.com/datadriveninvestor/best-data-scraping-tools-for-2018-top-10-reviews-558cc5a4992f',
title:
'Best Data Scraping Tools for 2018 (Top 10 Reviews) Data Driven ...',
snippet:
'05.03.2018 - Pros: Octoparse is the best free data scraping tool I\'ve met. ... your Scrapy (a open-source data extraction framework) web spider\'s activities.',
visible_link:
'https://medium.com/.../best-data-scraping-tools-for-2018-top-10-...',
date: '05.03.2018 - ',
rank: 4 },
{ link:
'https://www.quora.com/What-is-the-best-web-scraping-open-source-tool',
title: 'What is the best web scraping open source tool? - Quora',
snippet:
'15.06.2015 - My personal favourite is Python Scrapy and it is an excellent framework for building a web data scraper. Why Scrapy? 1) It is an open source framework and cost ...',
visible_link:
'https://www.quora.com/What-is-the-best-web-scraping-open-sour...',
date: '15.06.2015 - ',
rank: 5 },
{ link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet:
'21.05.2018 - Top Web Scraping Frameworks and Libraries. Requests. Scrapy. Beautiful Soup. Selenium with Python. lxml. Webscraping with Selenium - part 1. Extracting data from websites with Scrapy. Scrapinghub.',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '21.05.2018 - ',
rank: 6 },
{ link: 'https://scrapy.org/',
title:
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework',
snippet:
'An open source and collaborative framework for extracting the data you need from ... Spider): name = \'blogspider\' start_urls = [\'https://blog.scrapinghub.com\'] def ...',
visible_link: 'https://scrapy.org/',
date: '',
rank: 7 },
{ link:
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
title: 'The 10 Best Web Scraping Tools of 2018 - Scraper API',
snippet:
'19.07.2018 - The 10 Best Web Scraping Tools of 2018. ParseHub. Scrapy. Diffbot. Cheerio. Website: https://cheerio.js.org. Beautiful Soup. Website: https://www.crummy.com/software/BeautifulSoup/ Puppeteer. Website: https://github.com/GoogleChrome/puppeteer. Content Grabber. Website: http://www.contentgrabber.com/ Mozenda. Website: ...',
visible_link:
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
date: '19.07.2018 - ',
rank: 8 },
{ link: 'https://elitedatascience.com/python-web-scraping-libraries',
title: '5 Tasty Python Web Scraping Libraries - EliteDataScience',
snippet:
'03.02.2017 - We\'ve decided to feature the 5 Python libraries for web scraping that ... The good news is that you can swap out its parser with a faster one if ... Scrapy is technically not even a library… it\'s a complete web scraping framework.',
visible_link: 'https://elitedatascience.com/python-web-scraping-libraries',
date: '03.02.2017 - ',
rank: 9 },
{ link:
'https://blog.michaelyin.info/web-scraping-framework-review-scrapy-vs-selenium/',
title:
'Web Scraping Framework Review: Scrapy VS Selenium | MichaelYin ...',
snippet:
'01.10.2018 - In this Scrapy tutorial, I will cover the features of Scrapy and Selenium, and help you decide which one is better for your projects.',
visible_link:
'https://blog.michaelyin.info/web-scraping-framework-review-scr...',
date: '01.10.2018 - ',
rank: 10 },
{ link: 'https://github.com/lorien/awesome-web-scraping',
title:
'GitHub - lorien/awesome-web-scraping: List of libraries, tools and APIs ...',
snippet:
'List of libraries, tools and APIs for web scraping and data processing. ... golang.md · add dataflow kit framework, 2 months ago ... Make this list better!',
visible_link: 'https://github.com/lorien/awesome-web-scraping',
date: '',
rank: 11 },
{ link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
title: 'Best Web Scraping Software Tools 2018 | Import.io',
snippet:
'07.08.2018 - List of Best Web Scraping SoftwareThere are hundreds of Web ... it is a fast high-level screen scraping and web crawling framework, used to ...',
visible_link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
date: '07.08.2018 - ',
rank: 12 } ] } }
{ 'scraping scrapeulous.com':
{ '1':
{ time: 'Tue, 29 Jan 2019 21:39:22 GMT',
num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link: 'https://scrapeulous.com/',
title:
'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
snippet:
'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
visible_link: 'https://scrapeulous.com/',
date: '',
rank: 1 },
{ link: 'https://scrapeulous.com/about/',
title:
'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
snippet:
'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
visible_link: 'https://scrapeulous.com/about/',
date: '',
rank: 2 },
{ link: 'https://scrapeulous.com/howto/',
title:
'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
snippet:
'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
visible_link: 'https://scrapeulous.com/howto/',
date: '',
rank: 3 },
{ link: 'https://github.com/NikolaiT/se-scraper',
title:
'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
snippet:
'24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
visible_link: 'https://github.com/NikolaiT/se-scraper',
date: '24.12.2018 - ',
rank: 4 },
{ link:
'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
title:
'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
snippet:
'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
visible_link:
'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
date: '',
rank: 5 },
{ link: 'https://googlescraper.readthedocs.io/',
title:
'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
snippet:
'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
visible_link: 'https://googlescraper.readthedocs.io/',
date: '',
rank: 6 },
{ link: 'https://incolumitas.com/pages/scrapeulous/',
title:
'Coding, Learning and Business Ideas Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
snippet:
'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
visible_link: 'https://incolumitas.com/pages/scrapeulous/',
date: '',
rank: 7 },
{ link: 'https://incolumitas.com/',
title:
'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
snippet:
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
visible_link: 'https://incolumitas.com/',
date: '',
rank: 8 },
{ link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
title:
'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
snippet:
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
date: '',
rank: 9 },
{ link:
'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
title:
'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
snippet:
'23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
visible_link:
'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
date: '23.12.2018 - ',
rank: 10 } ] },
'2':
{ time: 'Tue, 29 Jan 2019 21:39:24 GMT',
num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link: 'https://pypi.org/project/CountryGoogleScraper/',
title:
'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
snippet:
'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
date: '',
rank: 1 },
{ link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
title:
'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
snippet:
'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
date: '',
rank: 3 },
{ link:
'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
title:
'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
snippet:
'24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
visible_link:
'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
date: '24.01.2015 - ',
rank: 4 },
{ link: 'https://twitter.com/incolumitas_?lang=de',
title:
'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
snippet:
'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
visible_link: 'https://twitter.com/incolumitas_?lang=de',
date: '',
rank: 5 },
{ link:
'http://blog.shodan.io/hostility-in-the-python-package-index/',
title:
'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
snippet:
'22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
date: '22.02.2015 - ',
rank: 6 },
{ link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
title:
'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
snippet:
'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
date: '',
rank: 7 },
{ link: 'https://pydigger.com/pypi/CountryGoogleScraper',
title:
'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
snippet:
'19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
date: '19.10.2016 - ',
rank: 8 },
{ link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
title:
'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
snippet:
'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
date: '',
rank: 9 },
{ link: 'https://www.revolvy.com/page/Search-engine-scraping',
title:
'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
snippet:
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
date: '',
rank: 10 } ] } } }
```

View File

@ -14,6 +14,17 @@
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
29.1.2019
- implement proxy support functionality
- implement proxy check
- implement scraping more than 1 page
- do it for google
- and bing
- implement duckduckgo scraping
TODO:
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
- add proxy support
@ -23,4 +34,28 @@ TODO:
TODO:
- think whether it makes sense to introduce a generic scraping class?
- is scraping abstractable or is every scraper too unique?
- dont make the same mistakes as with GoogleScraper
- dont make the same mistakes as with GoogleScraper
TODO:
okay, it's time to make a generic scraping class like in GoogleScraper
I feel like history repeats itself
class Scraper
constructor(options = {}) {
}
async load_search_engine() {}
async search_keyword() {}
async new_page() {}
async detected() {}
then each search engine derives from this generic class
some search engines do not need such an abstract class, because they are too complex

File diff suppressed because one or more lines are too long

View File

@ -35,6 +35,10 @@ exports.scrape = async function(config, callback) {
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
};
// overwrite default config

162
package-lock.json generated
View File

@ -1,9 +1,22 @@
{
"name": "se-scraper",
"version": "1.1.4",
"version": "1.1.7",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"@sindresorhus/is": {
"version": "0.14.0",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-0.14.0.tgz",
"integrity": "sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ=="
},
"@szmarczak/http-timer": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-1.1.2.tgz",
"integrity": "sha512-XIB2XbzHTN6ieIjfIMV9hlVcfPU26s2vafYWQcZHWXHOxiaRZYEDKEwdl129Zyg50+foYV2jCgtrqSA6qNuNSA==",
"requires": {
"defer-to-connect": "^1.0.1"
}
},
"@types/node": {
"version": "10.12.18",
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
@ -51,6 +64,20 @@
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
},
"cacheable-request": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz",
"integrity": "sha512-2N7AmszH/WPPpl5Z3XMw1HAP+8d+xugnKQAeKvxFZ/04dbT/CAznqwbl+7eSr3HkwdepNwtb2yx3CAMQWvG01Q==",
"requires": {
"clone-response": "^1.0.2",
"get-stream": "^4.0.0",
"http-cache-semantics": "^4.0.0",
"keyv": "^3.0.0",
"lowercase-keys": "^1.0.1",
"normalize-url": "^3.1.0",
"responselike": "^1.0.2"
}
},
"chai": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz",
@ -82,6 +109,14 @@
"parse5": "^3.0.1"
}
},
"clone-response": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
"integrity": "sha1-0dyXOSAxTfZ/vrlCI7TuNQI56Ws=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@ -151,6 +186,14 @@
"ms": "^2.1.1"
}
},
"decompress-response": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-3.3.0.tgz",
"integrity": "sha1-gKTdMjdIOEv6JICDYirt7Jgq3/M=",
"requires": {
"mimic-response": "^1.0.0"
}
},
"deep-eql": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz",
@ -159,6 +202,11 @@
"type-detect": "^4.0.0"
}
},
"defer-to-connect": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
"integrity": "sha512-k09hcQcTDY+cwgiwa6PYKLm3jlagNzQ+RSvhjzESOGOx+MNOuXkxTfEvPrO1IOQ81tArCFYQgi631clB70RpQw=="
},
"dom-serializer": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
@ -197,6 +245,19 @@
"domelementtype": "1"
}
},
"duplexer3": {
"version": "0.1.4",
"resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz",
"integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI="
},
"end-of-stream": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
"integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==",
"requires": {
"once": "^1.4.0"
}
},
"entities": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
@ -259,6 +320,14 @@
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
"integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE="
},
"get-stream": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz",
"integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==",
"requires": {
"pump": "^3.0.0"
}
},
"glob": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
@ -272,6 +341,24 @@
"path-is-absolute": "^1.0.0"
}
},
"got": {
"version": "9.6.0",
"resolved": "https://registry.npmjs.org/got/-/got-9.6.0.tgz",
"integrity": "sha512-R7eWptXuGYxwijs0eV+v3o6+XH1IqVK8dJOEecQfTmkncw9AV4dcw/Dhxi8MdlqPthxxpZyizMzyg8RTmEsG+Q==",
"requires": {
"@sindresorhus/is": "^0.14.0",
"@szmarczak/http-timer": "^1.1.2",
"cacheable-request": "^6.0.0",
"decompress-response": "^3.3.0",
"duplexer3": "^0.1.4",
"get-stream": "^4.1.0",
"lowercase-keys": "^1.0.1",
"mimic-response": "^1.0.1",
"p-cancelable": "^1.0.0",
"to-readable-stream": "^1.0.0",
"url-parse-lax": "^3.0.0"
}
},
"htmlparser2": {
"version": "3.10.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz",
@ -285,6 +372,11 @@
"readable-stream": "^3.0.6"
}
},
"http-cache-semantics": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz",
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
},
"https-proxy-agent": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
@ -323,16 +415,39 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
},
"json-buffer": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz",
"integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg="
},
"keyv": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz",
"integrity": "sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==",
"requires": {
"json-buffer": "3.0.0"
}
},
"lodash": {
"version": "4.17.11",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
},
"lowercase-keys": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
"integrity": "sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA=="
},
"mime": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
},
"mimic-response": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ=="
},
"minimatch": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
@ -359,6 +474,11 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
},
"normalize-url": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
"integrity": "sha512-U+JJi7duF1o+u2pynbp2zXDW2/PADgC30f0GsHZtRh+HOcXHnw137TrNlyxxRvWW5fjKd3bcLHPxofWuCjaeZg=="
},
"nth-check": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
@ -375,6 +495,11 @@
"wrappy": "1"
}
},
"p-cancelable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz",
"integrity": "sha512-USgPoaC6tkTGlS831CxsVdmZmyb8tR1D+hStI84MyckLOzfJlYQUweomrwE3D8T7u5u5GVuW064LT501wHTYYA=="
},
"parse5": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
@ -398,6 +523,11 @@
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
},
"prepend-http": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz",
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
},
"process-nextick-args": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
@ -413,6 +543,15 @@
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
},
"pump": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
"requires": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"puppeteer": {
"version": "1.11.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz",
@ -438,6 +577,14 @@
"util-deprecate": "^1.0.1"
}
},
"responselike": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/responselike/-/responselike-1.0.2.tgz",
"integrity": "sha1-kYcg7ztjHFZCvgaPFa3lpG9Loec=",
"requires": {
"lowercase-keys": "^1.0.0"
}
},
"rimraf": {
"version": "2.6.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
@ -459,6 +606,11 @@
"safe-buffer": "~5.1.0"
}
},
"to-readable-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
},
"type-detect": {
"version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
@ -469,6 +621,14 @@
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"url-parse-lax": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",
"integrity": "sha1-FrXK/Afb42dsGxmZF3gj1lA6yww=",
"requires": {
"prepend-http": "^2.0.0"
}
},
"util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",

View File

@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.1.7",
"version": "1.1.8",
"description": "A simple module which uses puppeteer to scrape several search engines.",
"homepage": "https://scrapeulous.com/",
"main": "index.js",
@ -22,6 +22,7 @@
"dependencies": {
"chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2",
"got": "^9.6.0",
"puppeteer": "^1.9.0"
}
}

12
run.js
View File

@ -10,7 +10,7 @@ let config = {
write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
@ -20,9 +20,11 @@ let config = {
// this output is informational
verbose: false,
// an array of keywords to scrape
keywords: ['trump', 'chief'],
keywords: ['scraping scrapeulous.com'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 1,
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
@ -35,7 +37,11 @@ let config = {
// get_browser, handle_metadata, close_browser
// must be an absolute path to the module
//custom_func: resolve('examples/pluggable.js'),
custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
};
function callback(err, response) {

96
src/captcha_solver.js Normal file
View File

@ -0,0 +1,96 @@
/*
There are essentially two strategies to handle a search engine showing you a captcha:
1. Solve the captcha
https://github.com/ecthros/uncaptcha2
or use a captcha solving service such as https://anti-captcha.com/mainpage
2. Switch your IP address with rotating proxies
*/
/**
* @name download recaptcha2 audio captcha
*
* There are several issues:
*
* Google sees that we are using an automated browser.
*
* In the worst case we have to completely control the browser ourselves without puppeteer.
*
* https://github.com/ecthros/uncaptcha2
*
* See here:
*
* https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1
*
* https://github.com/GoogleChrome/puppeteer/issues/3039
*
* https://intoli.com/blog/making-chrome-headless-undetectable/
*
* @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const got = require('got');
/**
 * Demo driver: opens the Google reCAPTCHA v2 demo page in a (non-headless)
 * chromium instance, clicks the checkbox, switches to the audio challenge and
 * downloads the audio file to `audio.mp3` in the current working directory.
 *
 * Errors from any step are reported via console.error; the browser is closed
 * whether or not the run succeeded.
 */
(async () => {
    const browser = await puppeteer.launch({
        args: [
            '--proxy-server=socks5://78.94.172.42:1080',
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920x1080',
            '--hide-scrollbars',
            '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"',
        ],
        headless: false,
    });

    try {
        const page = await browser.newPage();
        await page.goto('https://www.google.com/recaptcha/api2/demo');
        await page.waitFor(1000);

        const frames = page.frames();
        console.info('Available frames', frames.map(frame => frame.name()));
        console.info('Available frame urls', frames.map(frame => frame.url()));

        // The checkbox lives in the "anchor" iframe, the challenge dialog in the "bframe" iframe.
        const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?'));
        const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?'));
        if (!frame || !content_frame) {
            // Fail with a readable message instead of a TypeError on undefined.
            throw new Error('Could not locate the recaptcha anchor/bframe iframes.');
        }

        await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 });
        await page.waitFor(1000);
        const button = await frame.$('#recaptcha-anchor');
        await button.click();

        await content_frame.waitForSelector('#recaptcha-audio-button');
        const audio_button = await content_frame.$('#recaptcha-audio-button');
        await audio_button.click();
        await page.waitFor(1000);

        await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link');
        const download_link = await content_frame.evaluate(() => {
            // querySelector yields a single element; the NodeList returned by
            // querySelectorAll has no getAttribute() method.
            return document.querySelector('.rc-audiochallenge-tdownload-link').getAttribute('href');
        });
        console.log('Got audio download link: ', download_link);

        // Wait until the file is fully written and surface stream errors
        // instead of leaving them as unhandled 'error' events.
        await new Promise((resolve, reject) => {
            const download = got.stream(download_link);
            const file = fs.createWriteStream('audio.mp3');
            download.on('error', reject);
            file.on('error', reject);
            file.on('finish', resolve);
            download.pipe(file);
        });
    } finally {
        // Always release the browser, even when a step above failed.
        await browser.close();
    }
})().catch((err) => {
    // A synchronous try/catch around an async IIFE never sees its rejection,
    // so the error handler is attached to the returned promise instead.
    console.error(err);
});
/*
TODO: port the speech-to-text query logic to JavaScript, see
https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py
*/
/**
 * Placeholder for the audio transcription step. Not implemented yet:
 * the body is empty, so the returned promise resolves to undefined.
 */
async function translate_audio_file() {
}

View File

@ -21,6 +21,7 @@ async function scrape_bing_pup(page, event, context, pluggable) {
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
@ -33,23 +34,35 @@ async function scrape_bing_pup(page, event, context, pluggable) {
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
let page_num = 1;
await page.waitForSelector('#b_content', { timeout: 5000 });
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
do {
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
}
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#b_content', { timeout: 5000 });
await sfunctions.sleep(500);
let html = await page.content();
results[keyword][page_num] = parse(html);
let html = await page.content();
results[keyword] = parse(html);
page_num += 1;
let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();
} while (page_num <= event.num_pages)
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);

View File

@ -25,8 +25,8 @@ async function scrape_google_pup(page, event, context, pluggable) {
var results = {};
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
results[keyword] = {};
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
@ -37,54 +37,63 @@ async function scrape_google_pup(page, event, context, pluggable) {
});
}
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
const input = await page.$('input[name="q"]');
// await input.click({ clickCount: 3 });
// await sfunctions.sleep(50);
//await input.type(keyword);
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
let page_num = 1;
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(500);
do {
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
}
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
await sfunctions.sleep(500);
let html = await page.content();
results[keyword][page_num] = parse_google_results(html);
page_num += 1;
let next_page_link = await page.$('#pnnext', {timeout: 1000});
if (!next_page_link) {
break;
}
await next_page_link.click();
await page.waitForNavigation();
} while (page_num <= event.num_pages)
} catch (e) {
console.error(`Problem with scraping ${keyword}.`);
console.error(e);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
console.error('You have 45 seconds to enter the captcha.');
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
if (event.is_local === true) {
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
console.error('You have 45 seconds to enter the captcha.');
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
let html = await page.content();
results[keyword] = parse_google_results(html);
}
return results;

View File

@ -12,13 +12,14 @@ async function get_metadata(browser) {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let json = await page.content();
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
metadata.ipinfo = $('pre').text();
return metadata;
}
async function get_http_headers(browser) {
let metadata = {};
const page = await browser.newPage();

39
src/modules/se_scraper.js Normal file
View File

@ -0,0 +1,39 @@
const start_url = {
'google': ''
};
/*
Read this shit: https://javascript.info/class-inheritance
*/
module.exports = class Scraper {
constructor(options = {}) {
const {
searchEngine = 'google',
numPages = 1,
pluggable = null,
} = options;
this.pluggable = pluggable;
this.searchEngine = searchEngine;
this.numPages = numPages;
this.results = {}
}
async load_search_engine() {
}
async search_keyword() {
}
parse() {
}
async next_page() {
}
async detected() {
}
};

View File

@ -3,80 +3,79 @@ module.exports = {
};
/**
 * Pick one entry of the module-level `user_agents` list at random.
 *
 * @returns {*} the randomly selected element of `user_agents`
 *     (undefined when the list is empty).
 */
function random_user_agent() {
    // Compute the pick once and return it; a bare `let rand = ...` without a
    // `return` would make every caller receive undefined.
    const index = Math.floor(Math.random() * user_agents.length);
    return user_agents[index];
}
// updated: 29 Jan 2019
const user_agents = [
['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
]
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
];

View File

@ -22,7 +22,7 @@ function write_results(fname, data) {
module.exports.handler = async function handler (event, context, callback) {
config = event;
pluggable = null;
pluggable = {};
if (config.custom_func) {
if (fs.existsSync(config.custom_func)) {
try {
@ -43,8 +43,11 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(config);
}
const ADDITIONAL_CHROME_FLAGS = [
//'--proxy-server=' + proxy,
var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
@ -70,16 +73,27 @@ module.exports.handler = async function handler (event, context, callback) {
)
}
if (config.proxy) {
// Format accepted by Chrome's --proxy-server flag, see
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
// where <proxy-scheme> is one of "http", "socks", "socks4", "socks5".
ADDITIONAL_CHROME_FLAGS.push(
'--proxy-server=' + config.proxy,
)
}
let launch_args = {
args: ADDITIONAL_CHROME_FLAGS,
headless: config.headless,
ignoreHTTPSErrors: true,
};
if (config.debug === true) {
console.log("Chrome Args: ", launch_args);
}
if (pluggable) {
if (pluggable.start_browser) {
launch_args.config = config;
browser = await pluggable.start_browser(launch_args);
} else {
@ -91,6 +105,30 @@ module.exports.handler = async function handler (event, context, callback) {
console.dir(headers);
}
let metadata = {};
if (config.write_meta_data === true) {
metadata = await meta.get_metadata(browser);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (config.proxy && config.write_meta_data === true) {
console.log(`${metadata.ipinfo} vs ${config.proxy}`);
try {
let ipdata = JSON.parse(metadata.ipinfo);
// if the IP returned by ipinfo is not a substring of our proxy string, abort
if (!config.proxy.includes(ipdata.ip)) {
console.error('Proxy not working properly.');
await browser.close();
return;
}
} catch (exception) {
}
}
const page = await browser.newPage();
// block some assets to speed up scraping
@ -127,13 +165,8 @@ module.exports.handler = async function handler (event, context, callback) {
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine](page, config, context, pluggable);
let metadata = {};
if (config.write_meta_data === true) {
metadata = await meta.get_metadata(browser);
}
if (pluggable) {
if (pluggable.close_browser) {
await pluggable.close_browser();
} else {
await browser.close();
@ -155,7 +188,7 @@ module.exports.handler = async function handler (event, context, callback) {
results = zlib.deflateSync(results).toString('base64');
}
if (pluggable && pluggable.handle_results) {
if (pluggable.handle_results) {
await pluggable.handle_results({
config: config,
results: results,
@ -172,7 +205,7 @@ module.exports.handler = async function handler (event, context, callback) {
console.log(metadata);
}
if (pluggable) {
if (pluggable.handle_metadata) {
await pluggable.handle_metadata({metadata: metadata, config: config});
}
}