forked from extern/se-scraper
resolved some issues. proxy possible now. scraping for more than one page possible now
This commit is contained in:
parent
89441070cd
commit
9e62f23451
426
README.md
426
README.md
@ -34,10 +34,50 @@ Scraping is done with a headless chromium browser using the automation library p
|
||||
|
||||
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at hire@incolumitas.com
|
||||
|
||||
The chromium browser is started with the following flags to prevent
|
||||
scraping detection.
|
||||
|
||||
```js
|
||||
var ADDITIONAL_CHROME_FLAGS = [
|
||||
'--disable-infobars',
|
||||
'--window-position=0,0',
|
||||
'--ignore-certificate-errors',
|
||||
'--ignore-certificate-errors-spki-list',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-accelerated-2d-canvas',
|
||||
'--disable-gpu',
|
||||
'--window-size=1920x1080',
|
||||
'--hide-scrollbars',
|
||||
];
|
||||
```
|
||||
|
||||
Furthermore, to avoid loading unnecessary resources and to speed up
|
||||
scraping a great deal, we instruct chrome to not load images, css, fonts and media:
|
||||
|
||||
```js
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (req) => {
|
||||
let type = req.resourceType();
|
||||
const block = ['stylesheet', 'font', 'image', 'media'];
|
||||
if (block.includes(type)) {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
#### Making puppeteer and headless chrome undetectable
|
||||
|
||||
Consider the following resources:
|
||||
|
||||
* https://intoli.com/blog/making-chrome-headless-undetectable/
|
||||
|
||||
### Installation and Usage
|
||||
|
||||
Install with
|
||||
Install with
|
||||
|
||||
```bash
|
||||
npm install se-scraper
|
||||
@ -53,12 +93,12 @@ let config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
random_user_agent: true,
|
||||
// get meta data of scraping in return object
|
||||
write_meta_data: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '',
|
||||
sleep_range: '[1,2]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
// whether debug information should be printed
|
||||
@ -68,9 +108,11 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['scrapeulous.com', ],
|
||||
keywords: ['scraping scrapeulous.com'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 2,
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
@ -84,9 +126,13 @@ let config = {
|
||||
// must be an absolute path to the module
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
// use a proxy for all connections
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
//proxy: 'socks5://78.94.172.42:1080',
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
function callback(err, response) {
|
||||
if (err) { console.error(err) }
|
||||
|
||||
/* response object has the following properties:
|
||||
@ -97,7 +143,9 @@ se_scraper.scrape(config, (err, response) => {
|
||||
*/
|
||||
|
||||
console.dir(response.results, {depth: null, colors: true});
|
||||
});
|
||||
}
|
||||
|
||||
se_scraper.scrape(config, callback);
|
||||
```
|
||||
|
||||
Supported options for the `search_engine` config key:
|
||||
@ -123,199 +171,179 @@ Supported options for the `search_engine` config key:
|
||||
'marketwatch'
|
||||
```
|
||||
|
||||
Output for the above script on my laptop:
|
||||
Output for the above script on my machine:
|
||||
|
||||
```text
|
||||
Scraper took 4295ms to scrape 2 keywords.
|
||||
On average ms/keyword: 2147.5ms/keyword
|
||||
{ 'incolumitas.com scraping':
|
||||
{ time: 'Mon, 24 Dec 2018 13:07:43 GMT',
|
||||
num_results: 'Ungefähr 2’020 Ergebnisse (0.18 Sekunden) ',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results:
|
||||
[ { link:
|
||||
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
|
||||
title:
|
||||
'Coding, Learning and Business Ideas – Tutorial: Youtube scraping ...',
|
||||
snippet:
|
||||
'29.10.2018 - In this blog post I am going to show you how to scrape YouTube video data using the handy puppeteer library. Puppeteer is a Node library ...',
|
||||
visible_link:
|
||||
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
|
||||
date: '29.10.2018 - ',
|
||||
rank: 1 },
|
||||
{ link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
|
||||
title:
|
||||
'GoogleScraper Tutorial - How to scrape 1000 keywords with Google',
|
||||
snippet:
|
||||
'05.09.2018 - Tutorial that teaches how to use GoogleScraper to scrape 1000 keywords with 10 selenium browsers.',
|
||||
visible_link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
|
||||
date: '05.09.2018 - ',
|
||||
rank: 2 },
|
||||
{ link: 'https://incolumitas.com/tag/scraping.html',
|
||||
title: 'Coding, Learning and Business Ideas – Tag Scraping',
|
||||
snippet:
|
||||
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
|
||||
visible_link: 'https://incolumitas.com/tag/scraping.html',
|
||||
date: '',
|
||||
rank: 3 },
|
||||
{ link: 'https://incolumitas.com/category/scraping.html',
|
||||
title: 'Coding, Learning and Business Ideas – Category Scraping',
|
||||
snippet:
|
||||
'Nikolai Tschacher\'s ideas and projects around IT security and computer science.',
|
||||
visible_link: 'https://incolumitas.com/category/scraping.html',
|
||||
date: '',
|
||||
rank: 4 },
|
||||
{ link:
|
||||
'https://github.com/NikolaiT/incolumitas/blob/master/content/Meta/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo.md',
|
||||
title:
|
||||
'incolumitas/scraping-and-extracting-links-from-any-major-search ...',
|
||||
snippet:
|
||||
'Title: Scraping and Extracting Links from any major Search Engine like Google, Yandex, Baidu, Bing and Duckduckgo Date: 2014-11-12 00:47 Author: Nikolai ...',
|
||||
visible_link:
|
||||
'https://github.com/.../incolumitas/.../scraping-and-extracting-links...',
|
||||
date: '',
|
||||
rank: 5 },
|
||||
{ link:
|
||||
'https://stackoverflow.com/questions/16955325/scraping-google-results-with-python',
|
||||
title: 'Scraping Google Results with Python - Stack Overflow',
|
||||
snippet:
|
||||
'I found this. incolumitas.com/2013/01/06/… But the author claims it is not ported to 2.7 yet. – user2351394 Jun 6 \'13 at 6:59 ...',
|
||||
visible_link:
|
||||
'https://stackoverflow.com/.../scraping-google-results-with-python',
|
||||
date: '',
|
||||
rank: 6 },
|
||||
{ link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
|
||||
title: 'GoogleScraper · PyPI',
|
||||
snippet:
|
||||
'[5]: http://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/ ...',
|
||||
visible_link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
|
||||
date: '',
|
||||
rank: 7 },
|
||||
{ link:
|
||||
'https://www.reddit.com/r/Python/comments/2m0vyu/scraping_links_on_google_yandex_bing_duckduckgo/',
|
||||
title:
|
||||
'Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and ...',
|
||||
snippet:
|
||||
'12.11.2014 - Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and other search engines with Python ... submitted 4 years ago by incolumitas.',
|
||||
visible_link:
|
||||
'https://www.reddit.com/.../scraping_links_on_google_yandex_bi...',
|
||||
date: '12.11.2014 - ',
|
||||
rank: 9 },
|
||||
{ link: 'https://twitter.com/incolumitas_?lang=de',
|
||||
title: 'Nikolai Tschacher (@incolumitas_) | Twitter',
|
||||
snippet:
|
||||
'Embed Tweet. How to use GoogleScraper to scrape images and download them ... Learn how to scrape millions of url from yandex and google or bing with: ...',
|
||||
visible_link: 'https://twitter.com/incolumitas_?lang=de',
|
||||
date: '',
|
||||
rank: 10 } ] },
|
||||
'best scraping framework':
|
||||
{ time: 'Mon, 24 Dec 2018 13:07:44 GMT',
|
||||
num_results: 'Ungefähr 2’820’000 Ergebnisse (0.36 Sekunden) ',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results:
|
||||
[ { link:
|
||||
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
|
||||
snippet: '',
|
||||
visible_link:
|
||||
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
date: '',
|
||||
rank: 1 },
|
||||
{ link:
|
||||
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
|
||||
snippet: '',
|
||||
visible_link:
|
||||
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
date: '',
|
||||
rank: 2 },
|
||||
{ link:
|
||||
'https://www.scrapehero.com/open-source-web-scraping-frameworks-and-tools/',
|
||||
title:
|
||||
'Best Open Source Web Scraping Frameworks and Tools - ScrapeHero',
|
||||
snippet:
|
||||
'05.06.2018 - List of Open Source Web Scraping Frameworks. Scrapy. MechanicalSoup. PySpider. Portia. Apify SDK. Nodecrawler. Selenium WebDriver. Puppeteer.',
|
||||
visible_link:
|
||||
'https://www.scrapehero.com/open-source-web-scraping-framewo...',
|
||||
date: '05.06.2018 - ',
|
||||
rank: 3 },
|
||||
{ link:
|
||||
'https://medium.com/datadriveninvestor/best-data-scraping-tools-for-2018-top-10-reviews-558cc5a4992f',
|
||||
title:
|
||||
'Best Data Scraping Tools for 2018 (Top 10 Reviews) – Data Driven ...',
|
||||
snippet:
|
||||
'05.03.2018 - Pros: Octoparse is the best free data scraping tool I\'ve met. ... your Scrapy (a open-source data extraction framework) web spider\'s activities.',
|
||||
visible_link:
|
||||
'https://medium.com/.../best-data-scraping-tools-for-2018-top-10-...',
|
||||
date: '05.03.2018 - ',
|
||||
rank: 4 },
|
||||
{ link:
|
||||
'https://www.quora.com/What-is-the-best-web-scraping-open-source-tool',
|
||||
title: 'What is the best web scraping open source tool? - Quora',
|
||||
snippet:
|
||||
'15.06.2015 - My personal favourite is Python Scrapy and it is an excellent framework for building a web data scraper. Why Scrapy? 1) It is an open source framework and cost ...',
|
||||
visible_link:
|
||||
'https://www.quora.com/What-is-the-best-web-scraping-open-sour...',
|
||||
date: '15.06.2015 - ',
|
||||
rank: 5 },
|
||||
{ link:
|
||||
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
|
||||
snippet:
|
||||
'21.05.2018 - Top Web Scraping Frameworks and Libraries. Requests. Scrapy. Beautiful Soup. Selenium with Python. lxml. Webscraping with Selenium - part 1. Extracting data from websites with Scrapy. Scrapinghub.',
|
||||
visible_link:
|
||||
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
|
||||
date: '21.05.2018 - ',
|
||||
rank: 6 },
|
||||
{ link: 'https://scrapy.org/',
|
||||
title:
|
||||
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework',
|
||||
snippet:
|
||||
'An open source and collaborative framework for extracting the data you need from ... Spider): name = \'blogspider\' start_urls = [\'https://blog.scrapinghub.com\'] def ...',
|
||||
visible_link: 'https://scrapy.org/',
|
||||
date: '',
|
||||
rank: 7 },
|
||||
{ link:
|
||||
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
|
||||
title: 'The 10 Best Web Scraping Tools of 2018 - Scraper API',
|
||||
snippet:
|
||||
'19.07.2018 - The 10 Best Web Scraping Tools of 2018. ParseHub. Scrapy. Diffbot. Cheerio. Website: https://cheerio.js.org. Beautiful Soup. Website: https://www.crummy.com/software/BeautifulSoup/ Puppeteer. Website: https://github.com/GoogleChrome/puppeteer. Content Grabber. Website: http://www.contentgrabber.com/ Mozenda. Website: ...',
|
||||
visible_link:
|
||||
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
|
||||
date: '19.07.2018 - ',
|
||||
rank: 8 },
|
||||
{ link: 'https://elitedatascience.com/python-web-scraping-libraries',
|
||||
title: '5 Tasty Python Web Scraping Libraries - EliteDataScience',
|
||||
snippet:
|
||||
'03.02.2017 - We\'ve decided to feature the 5 Python libraries for web scraping that ... The good news is that you can swap out its parser with a faster one if ... Scrapy is technically not even a library… it\'s a complete web scraping framework.',
|
||||
visible_link: 'https://elitedatascience.com/python-web-scraping-libraries',
|
||||
date: '03.02.2017 - ',
|
||||
rank: 9 },
|
||||
{ link:
|
||||
'https://blog.michaelyin.info/web-scraping-framework-review-scrapy-vs-selenium/',
|
||||
title:
|
||||
'Web Scraping Framework Review: Scrapy VS Selenium | MichaelYin ...',
|
||||
snippet:
|
||||
'01.10.2018 - In this Scrapy tutorial, I will cover the features of Scrapy and Selenium, and help you decide which one is better for your projects.',
|
||||
visible_link:
|
||||
'https://blog.michaelyin.info/web-scraping-framework-review-scr...',
|
||||
date: '01.10.2018 - ',
|
||||
rank: 10 },
|
||||
{ link: 'https://github.com/lorien/awesome-web-scraping',
|
||||
title:
|
||||
'GitHub - lorien/awesome-web-scraping: List of libraries, tools and APIs ...',
|
||||
snippet:
|
||||
'List of libraries, tools and APIs for web scraping and data processing. ... golang.md · add dataflow kit framework, 2 months ago ... Make this list better!',
|
||||
visible_link: 'https://github.com/lorien/awesome-web-scraping',
|
||||
date: '',
|
||||
rank: 11 },
|
||||
{ link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
|
||||
title: 'Best Web Scraping Software Tools 2018 | Import.io',
|
||||
snippet:
|
||||
'07.08.2018 - List of Best Web Scraping SoftwareThere are hundreds of Web ... it is a fast high-level screen scraping and web crawling framework, used to ...',
|
||||
visible_link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
|
||||
date: '07.08.2018 - ',
|
||||
rank: 12 } ] } }
|
||||
{ 'scraping scrapeulous.com':
|
||||
{ '1':
|
||||
{ time: 'Tue, 29 Jan 2019 21:39:22 GMT',
|
||||
num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results:
|
||||
[ { link: 'https://scrapeulous.com/',
|
||||
title:
|
||||
'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
|
||||
visible_link: 'https://scrapeulous.com/',
|
||||
date: '',
|
||||
rank: 1 },
|
||||
{ link: 'https://scrapeulous.com/about/',
|
||||
title:
|
||||
'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
|
||||
visible_link: 'https://scrapeulous.com/about/',
|
||||
date: '',
|
||||
rank: 2 },
|
||||
{ link: 'https://scrapeulous.com/howto/',
|
||||
title:
|
||||
'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
|
||||
visible_link: 'https://scrapeulous.com/howto/',
|
||||
date: '',
|
||||
rank: 3 },
|
||||
{ link: 'https://github.com/NikolaiT/se-scraper',
|
||||
title:
|
||||
'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
|
||||
visible_link: 'https://github.com/NikolaiT/se-scraper',
|
||||
date: '24.12.2018 - ',
|
||||
rank: 4 },
|
||||
{ link:
|
||||
'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
|
||||
title:
|
||||
'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
|
||||
snippet:
|
||||
'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
|
||||
visible_link:
|
||||
'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
|
||||
date: '',
|
||||
rank: 5 },
|
||||
{ link: 'https://googlescraper.readthedocs.io/',
|
||||
title:
|
||||
'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
|
||||
visible_link: 'https://googlescraper.readthedocs.io/',
|
||||
date: '',
|
||||
rank: 6 },
|
||||
{ link: 'https://incolumitas.com/pages/scrapeulous/',
|
||||
title:
|
||||
'Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
|
||||
visible_link: 'https://incolumitas.com/pages/scrapeulous/',
|
||||
date: '',
|
||||
rank: 7 },
|
||||
{ link: 'https://incolumitas.com/',
|
||||
title:
|
||||
'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
|
||||
visible_link: 'https://incolumitas.com/',
|
||||
date: '',
|
||||
rank: 8 },
|
||||
{ link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
|
||||
title:
|
||||
'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
|
||||
visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
|
||||
date: '',
|
||||
rank: 9 },
|
||||
{ link:
|
||||
'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
|
||||
title:
|
||||
'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
|
||||
visible_link:
|
||||
'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
|
||||
date: '23.12.2018 - ',
|
||||
rank: 10 } ] },
|
||||
'2':
|
||||
{ time: 'Tue, 29 Jan 2019 21:39:24 GMT',
|
||||
num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results:
|
||||
[ { link: 'https://pypi.org/project/CountryGoogleScraper/',
|
||||
title:
|
||||
'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
|
||||
visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
|
||||
date: '',
|
||||
rank: 1 },
|
||||
{ link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
|
||||
title:
|
||||
'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
|
||||
snippet:
|
||||
'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
|
||||
visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
|
||||
date: '',
|
||||
rank: 3 },
|
||||
{ link:
|
||||
'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
|
||||
title:
|
||||
'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
|
||||
visible_link:
|
||||
'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
|
||||
date: '24.01.2015 - ',
|
||||
rank: 4 },
|
||||
{ link: 'https://twitter.com/incolumitas_?lang=de',
|
||||
title:
|
||||
'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
|
||||
visible_link: 'https://twitter.com/incolumitas_?lang=de',
|
||||
date: '',
|
||||
rank: 5 },
|
||||
{ link:
|
||||
'http://blog.shodan.io/hostility-in-the-python-package-index/',
|
||||
title:
|
||||
'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
|
||||
visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
|
||||
date: '22.02.2015 - ',
|
||||
rank: 6 },
|
||||
{ link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
|
||||
title:
|
||||
'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
|
||||
visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
|
||||
date: '',
|
||||
rank: 7 },
|
||||
{ link: 'https://pydigger.com/pypi/CountryGoogleScraper',
|
||||
title:
|
||||
'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
|
||||
snippet:
|
||||
'19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
|
||||
visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
|
||||
date: '19.10.2016 - ',
|
||||
rank: 8 },
|
||||
{ link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
|
||||
title:
|
||||
'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
|
||||
visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
|
||||
date: '',
|
||||
rank: 9 },
|
||||
{ link: 'https://www.revolvy.com/page/Search-engine-scraping',
|
||||
title:
|
||||
'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
|
||||
snippet:
|
||||
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
|
||||
visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
|
||||
date: '',
|
||||
rank: 10 } ] } } }
|
||||
```
|
37
TODO.txt
37
TODO.txt
@ -14,6 +14,17 @@
|
||||
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
|
||||
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
|
||||
|
||||
29.1.2019
|
||||
|
||||
- implement proxy support functionality
|
||||
- implement proxy check
|
||||
|
||||
- implement scraping more than 1 page
|
||||
- do it for google
|
||||
- and bing
|
||||
|
||||
- implement duckduckgo scraping
|
||||
|
||||
TODO:
|
||||
- think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes
|
||||
- add proxy support
|
||||
@ -23,4 +34,28 @@ TODO:
|
||||
TODO:
|
||||
- think whether it makes sense to introduce a generic scraping class?
|
||||
- is scraping abstractable or is every scraper too unique?
|
||||
- don't make the same mistakes as with GoogleScraper
|
||||
- don't make the same mistakes as with GoogleScraper
|
||||
|
||||
|
||||
TODO:
|
||||
okay it's fucking time to make a generic scraping class like in GoogleScraper
|
||||
i feel like history repeats
|
||||
|
||||
class Scraper
|
||||
|
||||
constructor(options = {}) {
|
||||
|
||||
}
|
||||
|
||||
async load_search_engine() {}
|
||||
|
||||
async search_keyword() {}
|
||||
|
||||
async new_page() {}
|
||||
|
||||
async detected() {}
|
||||
|
||||
|
||||
then each search engine derives from this generic class
|
||||
|
||||
some search engines do not need such an abstract class, because they are too complex
|
4
index.js
4
index.js
@ -35,6 +35,10 @@ exports.scrape = async function(config, callback) {
|
||||
// get_browser, handle_metadata, close_browser
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
// use a proxy for all connections
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
proxy: '',
|
||||
};
|
||||
|
||||
// overwrite default config
|
||||
|
162
package-lock.json
generated
162
package-lock.json
generated
@ -1,9 +1,22 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.4",
|
||||
"version": "1.1.7",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
"@sindresorhus/is": {
|
||||
"version": "0.14.0",
|
||||
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-0.14.0.tgz",
|
||||
"integrity": "sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ=="
|
||||
},
|
||||
"@szmarczak/http-timer": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-1.1.2.tgz",
|
||||
"integrity": "sha512-XIB2XbzHTN6ieIjfIMV9hlVcfPU26s2vafYWQcZHWXHOxiaRZYEDKEwdl129Zyg50+foYV2jCgtrqSA6qNuNSA==",
|
||||
"requires": {
|
||||
"defer-to-connect": "^1.0.1"
|
||||
}
|
||||
},
|
||||
"@types/node": {
|
||||
"version": "10.12.18",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
|
||||
@ -51,6 +64,20 @@
|
||||
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
|
||||
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
|
||||
},
|
||||
"cacheable-request": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-6.0.0.tgz",
|
||||
"integrity": "sha512-2N7AmszH/WPPpl5Z3XMw1HAP+8d+xugnKQAeKvxFZ/04dbT/CAznqwbl+7eSr3HkwdepNwtb2yx3CAMQWvG01Q==",
|
||||
"requires": {
|
||||
"clone-response": "^1.0.2",
|
||||
"get-stream": "^4.0.0",
|
||||
"http-cache-semantics": "^4.0.0",
|
||||
"keyv": "^3.0.0",
|
||||
"lowercase-keys": "^1.0.1",
|
||||
"normalize-url": "^3.1.0",
|
||||
"responselike": "^1.0.2"
|
||||
}
|
||||
},
|
||||
"chai": {
|
||||
"version": "4.2.0",
|
||||
"resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz",
|
||||
@ -82,6 +109,14 @@
|
||||
"parse5": "^3.0.1"
|
||||
}
|
||||
},
|
||||
"clone-response": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.2.tgz",
|
||||
"integrity": "sha1-0dyXOSAxTfZ/vrlCI7TuNQI56Ws=",
|
||||
"requires": {
|
||||
"mimic-response": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"concat-map": {
|
||||
"version": "0.0.1",
|
||||
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
|
||||
@ -151,6 +186,14 @@
|
||||
"ms": "^2.1.1"
|
||||
}
|
||||
},
|
||||
"decompress-response": {
|
||||
"version": "3.3.0",
|
||||
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-3.3.0.tgz",
|
||||
"integrity": "sha1-gKTdMjdIOEv6JICDYirt7Jgq3/M=",
|
||||
"requires": {
|
||||
"mimic-response": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"deep-eql": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz",
|
||||
@ -159,6 +202,11 @@
|
||||
"type-detect": "^4.0.0"
|
||||
}
|
||||
},
|
||||
"defer-to-connect": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-1.0.2.tgz",
|
||||
"integrity": "sha512-k09hcQcTDY+cwgiwa6PYKLm3jlagNzQ+RSvhjzESOGOx+MNOuXkxTfEvPrO1IOQ81tArCFYQgi631clB70RpQw=="
|
||||
},
|
||||
"dom-serializer": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
|
||||
@ -197,6 +245,19 @@
|
||||
"domelementtype": "1"
|
||||
}
|
||||
},
|
||||
"duplexer3": {
|
||||
"version": "0.1.4",
|
||||
"resolved": "https://registry.npmjs.org/duplexer3/-/duplexer3-0.1.4.tgz",
|
||||
"integrity": "sha1-7gHdHKwO08vH/b6jfcCo8c4ALOI="
|
||||
},
|
||||
"end-of-stream": {
|
||||
"version": "1.4.1",
|
||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
|
||||
"integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==",
|
||||
"requires": {
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"entities": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
|
||||
@ -259,6 +320,14 @@
|
||||
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
|
||||
"integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE="
|
||||
},
|
||||
"get-stream": {
|
||||
"version": "4.1.0",
|
||||
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz",
|
||||
"integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==",
|
||||
"requires": {
|
||||
"pump": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"glob": {
|
||||
"version": "7.1.3",
|
||||
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
|
||||
@ -272,6 +341,24 @@
|
||||
"path-is-absolute": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"got": {
|
||||
"version": "9.6.0",
|
||||
"resolved": "https://registry.npmjs.org/got/-/got-9.6.0.tgz",
|
||||
"integrity": "sha512-R7eWptXuGYxwijs0eV+v3o6+XH1IqVK8dJOEecQfTmkncw9AV4dcw/Dhxi8MdlqPthxxpZyizMzyg8RTmEsG+Q==",
|
||||
"requires": {
|
||||
"@sindresorhus/is": "^0.14.0",
|
||||
"@szmarczak/http-timer": "^1.1.2",
|
||||
"cacheable-request": "^6.0.0",
|
||||
"decompress-response": "^3.3.0",
|
||||
"duplexer3": "^0.1.4",
|
||||
"get-stream": "^4.1.0",
|
||||
"lowercase-keys": "^1.0.1",
|
||||
"mimic-response": "^1.0.1",
|
||||
"p-cancelable": "^1.0.0",
|
||||
"to-readable-stream": "^1.0.0",
|
||||
"url-parse-lax": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"htmlparser2": {
|
||||
"version": "3.10.0",
|
||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz",
|
||||
@ -285,6 +372,11 @@
|
||||
"readable-stream": "^3.0.6"
|
||||
}
|
||||
},
|
||||
"http-cache-semantics": {
|
||||
"version": "4.0.2",
|
||||
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.0.2.tgz",
|
||||
"integrity": "sha512-laeSTWIkuFa6lUgZAt+ic9RwOSEwbi9VDQNcCvMFO4sZiDc2Ha8DaZVCJnfpLLQCcS8rvCnIWYmz0POLxt7Dew=="
|
||||
},
|
||||
"https-proxy-agent": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
|
||||
@ -323,16 +415,39 @@
|
||||
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
|
||||
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
|
||||
},
|
||||
"json-buffer": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz",
|
||||
"integrity": "sha1-Wx85evx11ne96Lz8Dkfh+aPZqJg="
|
||||
},
|
||||
"keyv": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/keyv/-/keyv-3.1.0.tgz",
|
||||
"integrity": "sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==",
|
||||
"requires": {
|
||||
"json-buffer": "3.0.0"
|
||||
}
|
||||
},
|
||||
"lodash": {
|
||||
"version": "4.17.11",
|
||||
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
|
||||
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
|
||||
},
|
||||
"lowercase-keys": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz",
|
||||
"integrity": "sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA=="
|
||||
},
|
||||
"mime": {
|
||||
"version": "2.4.0",
|
||||
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
|
||||
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
|
||||
},
|
||||
"mimic-response": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
|
||||
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ=="
|
||||
},
|
||||
"minimatch": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
|
||||
@ -359,6 +474,11 @@
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
|
||||
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
|
||||
},
|
||||
"normalize-url": {
|
||||
"version": "3.3.0",
|
||||
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-3.3.0.tgz",
|
||||
"integrity": "sha512-U+JJi7duF1o+u2pynbp2zXDW2/PADgC30f0GsHZtRh+HOcXHnw137TrNlyxxRvWW5fjKd3bcLHPxofWuCjaeZg=="
|
||||
},
|
||||
"nth-check": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
|
||||
@ -375,6 +495,11 @@
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"p-cancelable": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-1.0.0.tgz",
|
||||
"integrity": "sha512-USgPoaC6tkTGlS831CxsVdmZmyb8tR1D+hStI84MyckLOzfJlYQUweomrwE3D8T7u5u5GVuW064LT501wHTYYA=="
|
||||
},
|
||||
"parse5": {
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
|
||||
@ -398,6 +523,11 @@
|
||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
|
||||
},
|
||||
"prepend-http": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-2.0.0.tgz",
|
||||
"integrity": "sha1-6SQ0v6XqjBn0HN/UAddBo8gZ2Jc="
|
||||
},
|
||||
"process-nextick-args": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
|
||||
@ -413,6 +543,15 @@
|
||||
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
|
||||
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
|
||||
},
|
||||
"pump": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
|
||||
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
|
||||
"requires": {
|
||||
"end-of-stream": "^1.1.0",
|
||||
"once": "^1.3.1"
|
||||
}
|
||||
},
|
||||
"puppeteer": {
|
||||
"version": "1.11.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz",
|
||||
@ -438,6 +577,14 @@
|
||||
"util-deprecate": "^1.0.1"
|
||||
}
|
||||
},
|
||||
"responselike": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/responselike/-/responselike-1.0.2.tgz",
|
||||
"integrity": "sha1-kYcg7ztjHFZCvgaPFa3lpG9Loec=",
|
||||
"requires": {
|
||||
"lowercase-keys": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"rimraf": {
|
||||
"version": "2.6.2",
|
||||
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
|
||||
@ -459,6 +606,11 @@
|
||||
"safe-buffer": "~5.1.0"
|
||||
}
|
||||
},
|
||||
"to-readable-stream": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/to-readable-stream/-/to-readable-stream-1.0.0.tgz",
|
||||
"integrity": "sha512-Iq25XBt6zD5npPhlLVXGFN3/gyR2/qODcKNNyTMd4vbm39HUaOiAM4PMq0eMVC/Tkxz+Zjdsc55g9yyz+Yq00Q=="
|
||||
},
|
||||
"type-detect": {
|
||||
"version": "4.0.8",
|
||||
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
|
||||
@ -469,6 +621,14 @@
|
||||
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
|
||||
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
|
||||
},
|
||||
"url-parse-lax": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/url-parse-lax/-/url-parse-lax-3.0.0.tgz",
|
||||
"integrity": "sha1-FrXK/Afb42dsGxmZF3gj1lA6yww=",
|
||||
"requires": {
|
||||
"prepend-http": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"util-deprecate": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.7",
|
||||
"version": "1.1.8",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
@ -22,6 +22,7 @@
|
||||
"dependencies": {
|
||||
"chai": "^4.2.0",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"got": "^9.6.0",
|
||||
"puppeteer": "^1.9.0"
|
||||
}
|
||||
}
|
||||
|
12
run.js
12
run.js
@ -10,7 +10,7 @@ let config = {
|
||||
write_meta_data: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
sleep_range: '[1,2]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
// whether debug information should be printed
|
||||
@ -20,9 +20,11 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['trump', 'chief'],
|
||||
keywords: ['scraping scrapeulous.com'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 1,
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
@ -35,7 +37,11 @@ let config = {
|
||||
// get_browser, handle_metadata, close_browser
|
||||
// must be an absolute path to the module
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
// use a proxy for all connections
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
//proxy: 'socks5://78.94.172.42:1080',
|
||||
};
|
||||
|
||||
function callback(err, response) {
|
||||
|
96
src/captcha_solver.js
Normal file
96
src/captcha_solver.js
Normal file
@ -0,0 +1,96 @@
|
||||
/*
|
||||
There are essentially two strategies to handle a search engine showing you a captcha:
|
||||
|
||||
1. Solve the captcha
|
||||
https://github.com/ecthros/uncaptcha2
|
||||
or use a captcha solving service such as https://anti-captcha.com/mainpage
|
||||
|
||||
2. Switch your IP address with rotating proxies
|
||||
|
||||
*/
|
||||
|
||||
/**
|
||||
* @name download recaptcha2 audio captcha
|
||||
*
|
||||
* There are several issues:
|
||||
*
|
||||
* Google sees that we are using an automated browser.
|
||||
*
|
||||
* In the worst case we have to completely control the browser ourselves without puppeteer.
|
||||
*
|
||||
* https://github.com/ecthros/uncaptcha2
|
||||
*
|
||||
* See here:
|
||||
*
|
||||
* https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1
|
||||
*
|
||||
* https://github.com/GoogleChrome/puppeteer/issues/3039
|
||||
*
|
||||
* https://intoli.com/blog/making-chrome-headless-undetectable/
|
||||
*
|
||||
* @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha
|
||||
*/
|
||||
|
||||
const puppeteer = require('puppeteer');
|
||||
const fs = require('fs');
|
||||
const got = require('got');
|
||||
|
||||
/**
 * Opens the reCAPTCHA v2 demo page, clicks the checkbox, switches the
 * challenge to audio mode and saves the audio challenge as `audio.mp3` in the
 * current working directory.
 *
 * Errors from any step (including the download) are logged via console.error;
 * the browser is always closed, even when a step fails.
 */
(async () => {
    const browser = await puppeteer.launch({
        args: [
            '--proxy-server=socks5://78.94.172.42:1080',
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920x1080',
            '--hide-scrollbars',
            '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"',
        ],
        headless: false,
    });

    try {
        const page = await browser.newPage();
        await page.goto('https://www.google.com/recaptcha/api2/demo');

        await page.waitFor(1000);

        const frames = page.frames();

        console.info('Available frames', frames.map(frame => frame.name()));
        console.info('Available frame urls', frames.map(frame => frame.url()));

        const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?'));
        const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?'));

        // Array.prototype.find yields undefined when no frame matches; fail
        // with a readable message instead of a TypeError further down.
        if (!frame || !content_frame) {
            throw new Error('Could not locate the recaptcha anchor and/or challenge frame.');
        }

        await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 });
        await page.waitFor(1000);
        const button = await frame.$('#recaptcha-anchor');
        await button.click();

        await content_frame.waitForSelector('#recaptcha-audio-button');

        const audio_button = await content_frame.$('#recaptcha-audio-button');
        await audio_button.click();
        await page.waitFor(1000);

        await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link');

        // querySelector (single element) is required here: the NodeList
        // returned by querySelectorAll has no getAttribute method.
        const download_link = await content_frame.evaluate(() => {
            return document.querySelector('.rc-audiochallenge-tdownload-link').getAttribute('href');
        });
        console.log('Got audio download link: ', download_link);

        // Wait for the file to be fully written so that stream errors reject
        // this function instead of surfacing as unhandled 'error' events.
        await new Promise((resolve, reject) => {
            got.stream(download_link)
                .on('error', reject)
                .pipe(fs.createWriteStream('audio.mp3'))
                .on('error', reject)
                .on('finish', resolve);
        });
    } finally {
        await browser.close();
    }
})().catch((err) => {
    // A synchronous try/catch around an un-awaited async function never sees
    // its rejection, so the handler is attached to the returned promise.
    console.error(err);
});
|
||||
|
||||
/*
|
||||
TODO: port the speech-to-text query logic to JavaScript: https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py
|
||||
*/
|
||||
/**
 * Placeholder: the body is empty, so calling this resolves to undefined and
 * has no effect.
 *
 * NOTE(review): presumably intended to turn the downloaded audio challenge
 * into text — confirm before relying on it.
 */
async function translate_audio_file() {
|
||||
}
|
@ -21,6 +21,7 @@ async function scrape_bing_pup(page, event, context, pluggable) {
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
results[keyword] = {};
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
@ -33,23 +34,35 @@ async function scrape_bing_pup(page, event, context, pluggable) {
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
|
||||
await sfunctions.sleep(50);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
let page_num = 1;
|
||||
|
||||
await page.waitForSelector('#b_content', { timeout: 5000 });
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
}
|
||||
do {
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
|
||||
}
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
await page.waitForSelector('#b_content', { timeout: 5000 });
|
||||
await sfunctions.sleep(500);
|
||||
let html = await page.content();
|
||||
results[keyword][page_num] = parse(html);
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse(html);
|
||||
page_num += 1;
|
||||
|
||||
let next_page_link = await page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
break;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await page.waitForNavigation();
|
||||
|
||||
} while (page_num <= event.num_pages)
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
|
@ -25,8 +25,8 @@ async function scrape_google_pup(page, event, context, pluggable) {
|
||||
var results = {};
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
results[keyword] = {};
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
@ -37,54 +37,63 @@ async function scrape_google_pup(page, event, context, pluggable) {
|
||||
});
|
||||
}
|
||||
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
const input = await page.$('input[name="q"]');
|
||||
// await input.click({ clickCount: 3 });
|
||||
// await sfunctions.sleep(50);
|
||||
//await input.type(keyword);
|
||||
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
|
||||
await sfunctions.sleep(50);
|
||||
await sfunctions.set_input_value(page, `input[name="q"]`, keyword);
|
||||
await sfunctions.sleep(50);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
let page_num = 1;
|
||||
|
||||
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
|
||||
await sfunctions.sleep(500);
|
||||
do {
|
||||
if (event.verbose === true) {
|
||||
console.log(`${event.search_engine} is scraping keyword: ${keyword} on page ${page_num}`);
|
||||
}
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
await page.waitForSelector('#center_col', {timeout: STANDARD_TIMEOUT});
|
||||
await sfunctions.sleep(500);
|
||||
let html = await page.content();
|
||||
results[keyword][page_num] = parse_google_results(html);
|
||||
|
||||
page_num += 1;
|
||||
|
||||
let next_page_link = await page.$('#pnnext', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
break;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await page.waitForNavigation();
|
||||
|
||||
} while (page_num <= event.num_pages)
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}.`);
|
||||
console.error(e);
|
||||
|
||||
if (await scraping_detected(page) === true) {
|
||||
console.error('Google detected the scraping. Aborting.');
|
||||
if (await scraping_detected(page) === true) {
|
||||
console.error('Google detected the scraping. Aborting.');
|
||||
|
||||
if (event.is_local === true) {
|
||||
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
|
||||
console.error('You have 45 seconds to enter the captcha.');
|
||||
// expect that user filled out necessary captcha
|
||||
} else {
|
||||
return results;
|
||||
}
|
||||
} else {
|
||||
// some other error, quit scraping process if stuff is broken
|
||||
if (event.is_local === true) {
|
||||
console.error('You have 30 seconds to fix this.');
|
||||
await sfunctions.sleep(30000);
|
||||
} else {
|
||||
return results;
|
||||
}
|
||||
}
|
||||
if (event.is_local === true) {
|
||||
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
|
||||
console.error('You have 45 seconds to enter the captcha.');
|
||||
// expect that user filled out necessary captcha
|
||||
} else {
|
||||
return results;
|
||||
}
|
||||
} else {
|
||||
// some other error, quit scraping process if stuff is broken
|
||||
if (event.is_local === true) {
|
||||
console.error('You have 30 seconds to fix this.');
|
||||
await sfunctions.sleep(30000);
|
||||
} else {
|
||||
return results;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse_google_results(html);
|
||||
}
|
||||
|
||||
return results;
|
||||
|
@ -12,13 +12,14 @@ async function get_metadata(browser) {
|
||||
waitLoad: true,
|
||||
waitNetworkIdle: true // defaults to false
|
||||
});
|
||||
let json = await page.content();
|
||||
let json = await page.content({
|
||||
timeout: 20000
|
||||
});
|
||||
const $ = cheerio.load(json);
|
||||
metadata.ipinfo = $('pre').text();
|
||||
return metadata;
|
||||
}
|
||||
|
||||
|
||||
async function get_http_headers(browser) {
|
||||
let metadata = {};
|
||||
const page = await browser.newPage();
|
||||
|
39
src/modules/se_scraper.js
Normal file
39
src/modules/se_scraper.js
Normal file
@ -0,0 +1,39 @@
|
||||
const start_url = {
|
||||
'google': ''
|
||||
};
|
||||
|
||||
/*
|
||||
Read this shit: https://javascript.info/class-inheritance
|
||||
*/
|
||||
|
||||
module.exports = class Scraper {
|
||||
constructor(options = {}) {
|
||||
const {
|
||||
searchEngine = 'google',
|
||||
numPages = 1,
|
||||
pluggable = null,
|
||||
} = options;
|
||||
|
||||
this.pluggable = pluggable;
|
||||
this.searchEngine = searchEngine;
|
||||
this.numPages = numPages;
|
||||
this.results = {}
|
||||
}
|
||||
|
||||
async load_search_engine() {
|
||||
}
|
||||
|
||||
async search_keyword() {
|
||||
}
|
||||
|
||||
parse() {
|
||||
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
}
|
||||
|
||||
async detected() {
|
||||
|
||||
}
|
||||
};
|
@ -3,80 +3,79 @@ module.exports = {
|
||||
};
|
||||
|
||||
/**
 * Picks one entry of the module-level `user_agents` list at random.
 *
 * @returns {*} The selected `user_agents` entry (undefined when the list is empty).
 */
function random_user_agent() {
    // Math.random() is in [0, 1), so the floored product is always a valid index.
    const index = Math.floor(Math.random() * user_agents.length);
    return user_agents[index];
}
|
||||
|
||||
// updated: 29 Jan 2019
|
||||
const user_agents = [
|
||||
['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
|
||||
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
|
||||
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
]
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
|
||||
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
|
||||
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
];
|
@ -22,7 +22,7 @@ function write_results(fname, data) {
|
||||
|
||||
module.exports.handler = async function handler (event, context, callback) {
|
||||
config = event;
|
||||
pluggable = null;
|
||||
pluggable = {};
|
||||
if (config.custom_func) {
|
||||
if (fs.existsSync(config.custom_func)) {
|
||||
try {
|
||||
@ -43,8 +43,11 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
console.log(config);
|
||||
}
|
||||
|
||||
const ADDITIONAL_CHROME_FLAGS = [
|
||||
//'--proxy-server=' + proxy,
|
||||
var ADDITIONAL_CHROME_FLAGS = [
|
||||
'--disable-infobars',
|
||||
'--window-position=0,0',
|
||||
'--ignore-certifcate-errors',
|
||||
'--ignore-certifcate-errors-spki-list',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
@ -70,16 +73,27 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
)
|
||||
}
|
||||
|
||||
if (config.proxy) {
|
||||
// check this out bubbles
|
||||
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
|
||||
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
|
||||
// "http", "socks", "socks4", "socks5".
|
||||
ADDITIONAL_CHROME_FLAGS.push(
|
||||
'--proxy-server=' + config.proxy,
|
||||
)
|
||||
}
|
||||
|
||||
let launch_args = {
|
||||
args: ADDITIONAL_CHROME_FLAGS,
|
||||
headless: config.headless,
|
||||
ignoreHTTPSErrors: true,
|
||||
};
|
||||
|
||||
if (config.debug === true) {
|
||||
console.log("Chrome Args: ", launch_args);
|
||||
}
|
||||
|
||||
if (pluggable) {
|
||||
if (pluggable.start_browser) {
|
||||
launch_args.config = config;
|
||||
browser = await pluggable.start_browser(launch_args);
|
||||
} else {
|
||||
@ -91,6 +105,30 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
console.dir(headers);
|
||||
}
|
||||
|
||||
let metadata = {};
|
||||
|
||||
if (config.write_meta_data === true) {
|
||||
metadata = await meta.get_metadata(browser);
|
||||
}
|
||||
|
||||
// check that our proxy is working by confirming
|
||||
// that ipinfo.io sees the proxy IP address
|
||||
if (config.proxy && config.write_meta_data === true) {
|
||||
console.log(`${metadata.ipinfo} vs ${config.proxy}`);
|
||||
|
||||
try {
|
||||
let ipdata = JSON.parse(metadata.ipinfo);
|
||||
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
|
||||
if (!config.proxy.includes(ipdata.ip)) {
|
||||
console.error('Proxy not working properly.');
|
||||
await browser.close();
|
||||
return;
|
||||
}
|
||||
} catch (exception) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
// block some assets to speed up scraping
|
||||
@ -127,13 +165,8 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
||||
}[config.search_engine](page, config, context, pluggable);
|
||||
|
||||
let metadata = {};
|
||||
|
||||
if (config.write_meta_data === true) {
|
||||
metadata = await meta.get_metadata(browser);
|
||||
}
|
||||
|
||||
if (pluggable) {
|
||||
if (pluggable.close_browser) {
|
||||
await pluggable.close_browser();
|
||||
} else {
|
||||
await browser.close();
|
||||
@ -155,7 +188,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
results = zlib.deflateSync(results).toString('base64');
|
||||
}
|
||||
|
||||
if (pluggable && pluggable.handle_results) {
|
||||
if (pluggable.handle_results) {
|
||||
await pluggable.handle_results({
|
||||
config: config,
|
||||
results: results,
|
||||
@ -172,7 +205,7 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
console.log(metadata);
|
||||
}
|
||||
|
||||
if (pluggable) {
|
||||
if (pluggable.handle_metadata) {
|
||||
await pluggable.handle_metadata({metadata: metadata, config: config});
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user