added support for amazon

This commit is contained in:
Nikolai Tschacher 2019-03-10 20:02:42 +01:00
parent dd1f36076e
commit 51d617442d
9 changed files with 1066 additions and 440 deletions

View File

@ -24,6 +24,7 @@ Se-scraper supports the following search engines:
* Google News * Google News
* Google News App version (https://news.google.com) * Google News App version (https://news.google.com)
* Google Image * Google Image
* Amazon
* Bing * Bing
* Bing News * Bing News
* Baidu * Baidu
@ -119,6 +120,7 @@ This will scrape with **three** browser instance each having their own IP addres
* [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json) * [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json)
* [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json) * [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json)
* [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json) * [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json)
* [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
* [Inject your own scraping logic](examples/pluggable.js) * [Inject your own scraping logic](examples/pluggable.js)

21
examples/amazon.js Normal file
View File

@ -0,0 +1,21 @@
const se_scraper = require('./../index.js');
// Scrape settings for this example: one results page per keyword on Amazon,
// run in a visible browser window, with the output written to a JSON file.
const config = {
    // show the browser window instead of running headless
    headless: false,
    // which search engine module to use
    search_engine: 'amazon',
    // keep console output quiet
    debug: false,
    verbose: false,
    // search terms to scrape
    keywords: ['iphone', 'drone'],
    // number of result pages per keyword
    num_pages: 1,
    // where the scraped data is stored
    output_file: 'examples/results/amazon.json',
    // engine specific settings
    amazon_settings: {
        amazon_domain: 'amazon.com',
    },
};
/**
 * Node-style completion handler for the scrape.
 *
 * Logs the error (if any) to stderr and always dumps the response object
 * with unlimited nesting depth and colors.
 *
 * @param {*} err - error value, falsy when nothing went wrong
 * @param {*} response - scrape result to print
 */
function callback(err, response) {
    if (err) {
        console.error(err);
    }
    const inspectOptions = {depth: null, colors: true};
    console.dir(response, inspectOptions);
}
// Run the scraper with the settings above; `callback` is handed over as the
// (err, response) handler.
se_scraper.scrape(config, callback);

View File

@ -0,0 +1,352 @@
{
"iphone": {
"1": {
"time": "Sun, 10 Mar 2019 19:02:01 GMT",
"num_results": "\n 1-16 of over 1,000 results for \"iphone\"\n \n \n\n\n\n\n\n\n \n \n \n \n \n \n \n \n Sort by:\n \n Featured\n \n Price: Low to High\n \n Price: High to Low\n \n Avg. Customer Review\n \n Newest Arrivals\n \n Sort by:Featured\n <span class=\"a-button a-button-base\"><span class=\"a-button-inner\"><input class=\"a-button-input\" type=\"submit\" value=\"Go\"><span class=\"a-button-text\" aria-hidden=\"true\">Go</span></span></span>\n \n\n\n ",
"no_results": false,
"effective_query": "\"iphone\"",
"results": [
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0875484UWI8SLQ4J64Y&url=%2FBandolier-Natalie-Wallet-Compatible-iPhone%2Fdp%2FB079YDTRKV%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-1-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_atf",
"seller": "by Bandolier",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0875484UWI8SLQ4J64Y&url=%2FBandolier-Natalie-Wallet-Compatible-iPhone%2Fdp%2FB079YDTRKV%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-1-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_atf",
"title": "Bandolier [Natalie] Phone Case with Strap & Wallet Compatible w/iPhone 8 +, 7 + & 6 + - Gold Details & Crossbody Leather Shoulder Purse Belt. Handsfree Carrying Hard Cover. Travel Friendly Accessory",
"stars": "3.8 out of 5 stars",
"num_reviews": "6",
"price": "$98.00",
"oldprice": "",
"rank": 1
},
{
"image": "/Apple-iPhone-XR-64GB-PRODUCT/dp/B07K97BQDF/ref=sr_1_2?keywords=iphone&qid=1552244519&s=gateway&sr=8-2",
"seller": "by Apple",
"link": "/Apple-iPhone-XR-64GB-PRODUCT/dp/B07K97BQDF/ref=sr_1_2?keywords=iphone&qid=1552244519&s=gateway&sr=8-2",
"title": "Apple iPhone XR (64GB) - (PRODUCT)RED [Locked to Simple Mobile Prepaid]",
"stars": "3.0 out of 5 stars",
"num_reviews": "7",
"price": "$749.99",
"oldprice": "",
"rank": 2
},
{
"image": "/Apple-iPhone-64GB-Silver-Prepaid/dp/B078HVJB69/ref=sr_1_3?keywords=iphone&qid=1552244519&s=gateway&sr=8-3",
"seller": "by Apple",
"link": "/Apple-iPhone-64GB-Silver-Prepaid/dp/B078HVJB69/ref=sr_1_3?keywords=iphone&qid=1552244519&s=gateway&sr=8-3",
"title": "Apple iPhone X (64GB) - Silver [Locked to Simple Mobile Prepaid]",
"stars": "3.4 out of 5 stars",
"num_reviews": "3",
"price": "$899.00",
"oldprice": "",
"rank": 3
},
{
"image": "/Apple-iPhone-Silver-Locked-Prepaid/dp/B076MP43X5/ref=sr_1_4?keywords=iphone&qid=1552244519&s=gateway&sr=8-4",
"seller": "by Apple",
"link": "/Apple-iPhone-Silver-Locked-Prepaid/dp/B076MP43X5/ref=sr_1_4?keywords=iphone&qid=1552244519&s=gateway&sr=8-4",
"title": "Apple iPhone 8 (64GB) - Silver [Locked to Simple Mobile Prepaid]",
"stars": "2.4 out of 5 stars",
"num_reviews": "3",
"price": "$599.99",
"oldprice": "",
"rank": 4
},
{
"image": "/Apple-iPhone-Plus-Unlocked-Version/dp/B01LY5U2X3/ref=sr_1_5?keywords=iphone&qid=1552244519&s=gateway&sr=8-5",
"seller": "by Apple",
"link": "/Apple-iPhone-Plus-Unlocked-Version/dp/B01LY5U2X3/ref=sr_1_5?keywords=iphone&qid=1552244519&s=gateway&sr=8-5",
"title": "Apple iPhone 7 Plus (32GB) - Silver [Locked to Simple Mobile Prepaid]",
"stars": "3.1 out of 5 stars",
"num_reviews": "132",
"price": "$399.99$569.99",
"oldprice": "$569.99$569.99",
"rank": 5
},
{
"image": "/Apple-iPhone-32GB-Black-Prepaid/dp/B01N2K14U7/ref=sr_1_6?keywords=iphone&qid=1552244519&s=gateway&sr=8-6",
"seller": "by Apple",
"link": "/Apple-iPhone-32GB-Black-Prepaid/dp/B01N2K14U7/ref=sr_1_6?keywords=iphone&qid=1552244519&s=gateway&sr=8-6",
"title": "Apple iPhone 7 (32GB) - Black - [Locked to Simple Mobile Prepaid]",
"stars": "3.2 out of 5 stars",
"num_reviews": "32",
"price": "$299.99$449.99",
"oldprice": "$449.99$449.99",
"rank": 6
},
{
"image": "/Apple-iPhone-6S-Unlocked-Refurbished/dp/B0731JJCRZ/ref=sr_1_7?keywords=iphone&qid=1552244519&s=gateway&sr=8-7",
"seller": "by Apple",
"link": "/Apple-iPhone-6S-Unlocked-Refurbished/dp/B0731JJCRZ/ref=sr_1_7?keywords=iphone&qid=1552244519&s=gateway&sr=8-7",
"title": "Apple iPhone 6S - 32GB GSM Unlocked - (Rose Gold) (Refurbished)",
"stars": "3.8 out of 5 stars",
"num_reviews": "3,966",
"price": "$174.97",
"oldprice": "",
"rank": 7
},
{
"image": "/Apple-iPhone-Fully-Unlocked-64GB/dp/B06XRHJWNC/ref=sr_1_8?keywords=iphone&qid=1552244519&s=gateway&sr=8-8",
"seller": "by Apple",
"link": "/Apple-iPhone-Fully-Unlocked-64GB/dp/B06XRHJWNC/ref=sr_1_8?keywords=iphone&qid=1552244519&s=gateway&sr=8-8",
"title": "Apple iPhone 6S, Fully Unlocked, 64GB - Silver (Refurbished)",
"stars": "3.5 out of 5 stars",
"num_reviews": "385",
"price": "$204.83",
"oldprice": "",
"rank": 8
},
{
"image": "/Apple-iPhone-GSM-Unlocked-16GB/dp/B00YD547Q6/ref=sr_1_9?keywords=iphone&qid=1552244519&s=gateway&sr=8-9",
"seller": "by Apple",
"link": "/Apple-iPhone-GSM-Unlocked-16GB/dp/B00YD547Q6/ref=sr_1_9?keywords=iphone&qid=1552244519&s=gateway&sr=8-9",
"title": "Apple iPhone 6, GSM Unlocked, 16GB - Space Gray (Refurbished)",
"stars": "3.0 out of 5 stars",
"num_reviews": "2,509",
"price": "$149.99$194.99",
"oldprice": "$194.99$194.99",
"rank": 9
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_1?ie=UTF8&adId=A0145807DUMZAO43XNKF&url=%2FCharger-ONTWIE-Qualcomm-Certified-Compatible%2Fdp%2FB07KKD4832%2Fref%3Dsr_1_10_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-10-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_mtf",
"seller": "by ONTWIE",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_1?ie=UTF8&adId=A0145807DUMZAO43XNKF&url=%2FCharger-ONTWIE-Qualcomm-Certified-Compatible%2Fdp%2FB07KKD4832%2Fref%3Dsr_1_10_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-10-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_mtf",
"title": "USB Wall Charger Quick Charge 3.0, ONTWIE Qualcomm Certified 18W QC 3.0 Charger Adapter, UL Certified Travel Adapte Compatible iPhone XS/X/8/7/6/Plus/iPad, Samsung, LG, Nexus, HTC and More",
"stars": "4.5 out of 5 stars",
"num_reviews": "6",
"price": "$12.19",
"oldprice": "",
"rank": 10
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_2?ie=UTF8&adId=A03784982YI18C25BT179&url=%2FHeadphone-Adapter-Splitter-Earphone-Connector%2Fdp%2FB07P11PYPH%2Fref%3Dsr_1_11_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-11-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_mtf",
"seller": "by Alcoco",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_2?ie=UTF8&adId=A03784982YI18C25BT179&url=%2FHeadphone-Adapter-Splitter-Earphone-Connector%2Fdp%2FB07P11PYPH%2Fref%3Dsr_1_11_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-11-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_mtf",
"title": "Headphone Adapter for iPhone 8 3.5mm Splitter Jack Dongle Earphone Cable Charge and Aux Audio Connector for iPhone X/Xs/XS max/8/8 Plus/7/7 Plus 2 in 1 Headphone for Music and Charge Support iOS 12",
"stars": "4.8 out of 5 stars",
"num_reviews": "1,009",
"price": "$9.89",
"oldprice": "",
"rank": 11
},
{
"image": "/Apple-iPhone-Fully-Unlocked-32GB/dp/B01NAW98VS/ref=sr_1_12?keywords=iphone&qid=1552244519&s=gateway&sr=8-12",
"seller": "by Apple",
"link": "/Apple-iPhone-Fully-Unlocked-32GB/dp/B01NAW98VS/ref=sr_1_12?keywords=iphone&qid=1552244519&s=gateway&sr=8-12",
"title": "Apple iPhone 7, Fully Unlocked, 32GB - Gold (Refurbished)",
"stars": "3.5 out of 5 stars",
"num_reviews": "298",
"price": "$265.83$349.99",
"oldprice": "$349.99$349.99",
"rank": 12
},
{
"image": "/Apple-iPhone-GSM-Unlocked-16GB/dp/B0774T8DC6/ref=sr_1_13?keywords=iphone&qid=1552244519&s=gateway&sr=8-13",
"seller": "by Apple",
"link": "/Apple-iPhone-GSM-Unlocked-16GB/dp/B0774T8DC6/ref=sr_1_13?keywords=iphone&qid=1552244519&s=gateway&sr=8-13",
"title": "Apple iPhone SE, GSM Unlocked, 16GB - Space Gray (Refurbished)",
"stars": "3.9 out of 5 stars",
"num_reviews": "636",
"price": "$134.00",
"oldprice": "",
"rank": 13
},
{
"image": "/Apple-iPhone-GSM-Unlocked-256GB/dp/B07753NSQZ/ref=sr_1_14?keywords=iphone&qid=1552244519&s=gateway&sr=8-14",
"seller": "by Apple",
"link": "/Apple-iPhone-GSM-Unlocked-256GB/dp/B07753NSQZ/ref=sr_1_14?keywords=iphone&qid=1552244519&s=gateway&sr=8-14",
"title": "Apple iPhone 8, GSM Unlocked, 256GB - Space Gray (Refurbished)",
"stars": "3.9 out of 5 stars",
"num_reviews": "44",
"price": "$529.99",
"oldprice": "",
"rank": 14
},
{
"image": "/Apple-iPhone-Plus-Unlocked-16GB/dp/B00YD54J8W/ref=sr_1_15?keywords=iphone&qid=1552244519&s=gateway&sr=8-15",
"seller": "by Apple",
"link": "/Apple-iPhone-Plus-Unlocked-16GB/dp/B00YD54J8W/ref=sr_1_15?keywords=iphone&qid=1552244519&s=gateway&sr=8-15",
"title": "Apple iPhone 6 Plus, GSM Unlocked, 16GB - Space Gray (Refurbished)",
"stars": "3.3 out of 5 stars",
"num_reviews": "1,408",
"price": "$190.00",
"oldprice": "",
"rank": 15
},
{
"image": "/Apple-iPhone-GSM-Unlocked-64GB/dp/B014Z8HDWU/ref=sr_1_16?keywords=iphone&qid=1552244519&s=gateway&sr=8-16",
"seller": "by Apple",
"link": "/Apple-iPhone-GSM-Unlocked-64GB/dp/B014Z8HDWU/ref=sr_1_16?keywords=iphone&qid=1552244519&s=gateway&sr=8-16",
"title": "Apple iPhone 6, GSM Unlocked, 64GB - Space Gray (Refurbished)",
"stars": "2.9 out of 5 stars",
"num_reviews": "968",
"price": "$156.94",
"oldprice": "",
"rank": 16
},
{
"image": "/Apple-iPhone-GSM-Unlocked-128GB/dp/B01N9YO1DS/ref=sr_1_18?keywords=iphone&qid=1552244519&s=gateway&sr=8-18",
"seller": "by Apple",
"link": "/Apple-iPhone-GSM-Unlocked-128GB/dp/B01N9YO1DS/ref=sr_1_18?keywords=iphone&qid=1552244519&s=gateway&sr=8-18",
"title": "Apple iPhone 7, GSM Unlocked, 128GB - Gold (Refurbished)",
"stars": "3.5 out of 5 stars",
"num_reviews": "623",
"price": "$309.89$399.99",
"oldprice": "$399.99$399.99",
"rank": 17
},
{
"image": "/Apple-iPhone-Fully-Unlocked-16GB/dp/B06XRG6S73/ref=sr_1_19?keywords=iphone&qid=1552244519&s=gateway&sr=8-19",
"seller": "by Apple",
"link": "/Apple-iPhone-Fully-Unlocked-16GB/dp/B06XRG6S73/ref=sr_1_19?keywords=iphone&qid=1552244519&s=gateway&sr=8-19",
"title": "Apple iPhone 6S, Fully Unlocked, 16GB - Rose Gold (Refurbished)",
"stars": "3.9 out of 5 stars",
"num_reviews": "205",
"price": "$168.88",
"oldprice": "",
"rank": 18
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_1?ie=UTF8&adId=A058166516UYKTSX41DBD&url=%2FBluetooth-Headphones-Retractable-Sweat-Proof-Earphones%2Fdp%2FB07K197VKQ%2Fref%3Dsr_1_20_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-20-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"seller": "by ZSW Tech",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_1?ie=UTF8&adId=A058166516UYKTSX41DBD&url=%2FBluetooth-Headphones-Retractable-Sweat-Proof-Earphones%2Fdp%2FB07K197VKQ%2Fref%3Dsr_1_20_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-20-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"title": "Bluetooth Headphones, Wireless Neckband Earbuds Retractable Headset Stereo Sweat-Proof Sports Earphones with Mic for iPhone X/8/7/6, Android and Other Bluetooth Devices (Rose Gold)",
"stars": "3.9 out of 5 stars",
"num_reviews": "17",
"price": "$21.99",
"oldprice": "",
"rank": 19
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_2?ie=UTF8&adId=A00322032TZ6RHIDCJFI7&url=%2FCancelling-Headphone-Bluetooth-Headphones-Microphone%2Fdp%2FB077YG22Y9%2Fref%3Dsr_1_21_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-21-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"seller": "by COWIN",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_2?ie=UTF8&adId=A00322032TZ6RHIDCJFI7&url=%2FCancelling-Headphone-Bluetooth-Headphones-Microphone%2Fdp%2FB077YG22Y9%2Fref%3Dsr_1_21_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-21-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"title": "COWIN E7 Pro [2018 Upgraded] Active Noise Cancelling Headphone Bluetooth Headphones Microphone Hi-Fi Deep Bass Wireless Headphones Over Ear 30H Playtime Travel Work TV Computer Phone - Black",
"stars": "4.3 out of 5 stars",
"num_reviews": "2,580",
"price": "$89.99",
"oldprice": "",
"rank": 20
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_3?ie=UTF8&adId=A05897013R5R41MQFZSIF&url=%2FBluetooth-Headphones-Retractable-Sweat-Proof-Earphones%2Fdp%2FB07K166QXV%2Fref%3Dsr_1_22_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-22-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"seller": "by ZSW Tech",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg1_3?ie=UTF8&adId=A05897013R5R41MQFZSIF&url=%2FBluetooth-Headphones-Retractable-Sweat-Proof-Earphones%2Fdp%2FB07K166QXV%2Fref%3Dsr_1_22_sspa%3Fkeywords%3Diphone%26qid%3D1552244519%26s%3Dgateway%26sr%3D8-22-spons%26psc%3D1&qualifier=1552244519&id=2869484585385390&widgetName=sp_btf",
"title": "Bluetooth Headphones, Wireless Neckband Earbuds Retractable Headset Stereo Sweat-Proof Sports Earphones with Mic for iPhone X/8/7/6, Android and Other Bluetooth Devices (Black)",
"stars": "3.6 out of 5 stars",
"num_reviews": "15",
"price": "$21.99",
"oldprice": "",
"rank": 21
}
]
}
},
"drone": {
"1": {
"time": "Sun, 10 Mar 2019 19:02:02 GMT",
"num_results": "\n 1-48 of over 50,000 results for \"drone\"\n \n \n\n\n\n\n\n\n \n \n \n \n \n \n \n \n Sort by:\n \n Featured\n \n Price: Low to High\n \n Price: High to Low\n \n Avg. Customer Review\n \n Newest Arrivals\n \n Sort by:Featured\n <span class=\"a-button a-button-base\"><span class=\"a-button-inner\"><input class=\"a-button-input\" type=\"submit\" value=\"Go\"><span class=\"a-button-text\" aria-hidden=\"true\">Go</span></span></span>\n \n\n\n ",
"no_results": false,
"effective_query": "\"drone\"",
"results": [
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_phone_search_atf_aps_sr_pg1_1?ie=UTF8&adId=A07746892PFE2ZAVP8GLM&url=%2FHS120D-Quadcotper-Helicopter-Beginners-Functions%2Fdp%2FB07GTJ31ZM%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Ddrone%26qid%3D1552244521%26s%3Dgateway%26sr%3D8-1-spons%26psc%3D1&qualifier=1552244521&id=7643069347457012&widgetName=sp_phone_search_atf",
"seller": "by DEERC",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_phone_search_atf_aps_sr_pg1_1?ie=UTF8&adId=A07746892PFE2ZAVP8GLM&url=%2FHS120D-Quadcotper-Helicopter-Beginners-Functions%2Fdp%2FB07GTJ31ZM%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Ddrone%26qid%3D1552244521%26s%3Dgateway%26sr%3D8-1-spons%26psc%3D1&qualifier=1552244521&id=7643069347457012&widgetName=sp_phone_search_atf",
"title": "Holy Stone HS120D FPV Drone with Camera for Adults 1080p HD Live Video and GPS Return Home, RC Quadcotper Helicopter for Kids Beginners 16 Min Flight Time Long Range with Follow Me Selfie Functions",
"stars": "4.9 out of 5 stars",
"num_reviews": "20",
"price": "$169.99",
"oldprice": "",
"rank": 1
},
{
"image": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_phone_search_atf_aps_sr_pg1_2?ie=UTF8&adId=A07499583F006NLTR338J&url=%2FHoly-Stone-Quadcopter-Beginners-Intelligent%2Fdp%2FB07B6TZ575%2Fref%3Dsr_1_2_sspa%3Fkeywords%3Ddrone%26qid%3D1552244521%26s%3Dgateway%26sr%3D8-2-spons%26psc%3D1&qualifier=1552244521&id=7643069347457012&widgetName=sp_phone_search_atf",
"seller": "by Holy Stone",
"link": "/gp/slredirect/picassoRedirect.html/ref=pa_sp_phone_search_atf_aps_sr_pg1_2?ie=UTF8&adId=A07499583F006NLTR338J&url=%2FHoly-Stone-Quadcopter-Beginners-Intelligent%2Fdp%2FB07B6TZ575%2Fref%3Dsr_1_2_sspa%3Fkeywords%3Ddrone%26qid%3D1552244521%26s%3Dgateway%26sr%3D8-2-spons%26psc%3D1&qualifier=1552244521&id=7643069347457012&widgetName=sp_phone_search_atf",
"title": "Holy Stone HS100G Drone with 1080p FHD Camera 5G FPV Live Video and GPS Return Home Function RC Quadcopter for Beginners Kids Adults with Follow Me, Altitude Hold, Intelligent Battery",
"stars": "4.4 out of 5 stars",
"num_reviews": "225",
"price": "$249.99",
"oldprice": "",
"rank": 2
},
{
"image": "/Holy-Stone-Predator-Helicopter-Quadcopter/dp/B0157IHJMQ/ref=sr_1_3?keywords=drone&qid=1552244521&s=gateway&sr=8-3",
"seller": "by Holy Stone",
"link": "/Holy-Stone-Predator-Helicopter-Quadcopter/dp/B0157IHJMQ/ref=sr_1_3?keywords=drone&qid=1552244521&s=gateway&sr=8-3",
"title": "Holy Stone HS170 Predator Mini RC Helicopter Drone 2.4Ghz 6-Axis Gyro 4 Channels Quadcopter Good Choice for Drone Training",
"stars": "4.3 out of 5 stars",
"num_reviews": "5,003",
"price": "$35.99",
"oldprice": "",
"rank": 3
},
{
"image": "/TOPE-Wide-Angle-Quadcopter-Beginners-Brushless/dp/B07MTZPWL7/ref=sr_1_4?keywords=drone&qid=1552244521&s=gateway&sr=8-4",
"seller": "by TOPE",
"link": "/TOPE-Wide-Angle-Quadcopter-Beginners-Brushless/dp/B07MTZPWL7/ref=sr_1_4?keywords=drone&qid=1552244521&s=gateway&sr=8-4",
"title": "TOPE GPS FPV RC Drones with 1080P FHD Camera Live Video 150° Wide-Angle 5Ghz WiFi Quadcopter for Beginners Kids Adults with Follow Me,Brushless Motor,GPS Return Home and Foldable Arms, Black",
"stars": "4.0 out of 5 stars",
"num_reviews": "7",
"price": "$198.99",
"oldprice": "",
"rank": 4
},
{
"image": "/SNAPTAIN-Wide-Angle-Quadcopter-Altitude-Compatible/dp/B07GPNZSMY/ref=sr_1_5?keywords=drone&qid=1552244521&s=gateway&sr=8-5",
"seller": "by SNAPTAIN",
"link": "/SNAPTAIN-Wide-Angle-Quadcopter-Altitude-Compatible/dp/B07GPNZSMY/ref=sr_1_5?keywords=drone&qid=1552244521&s=gateway&sr=8-5",
"title": "SNAPTAIN S5C WiFi FPV Drone with 720P HD Camera,Voice Control, Wide-Angle Live Video RC Quadcopter with Altitude Hold, Gravity Sensor Function, RTF One Key Take Off/Landing, Compatible w/VR Headset",
"stars": "4.3 out of 5 stars",
"num_reviews": "462",
"price": "$74.99$159.99",
"oldprice": "$159.99$159.99",
"rank": 5
},
{
"image": "/SIMREX-Foldable-Quadcopter-Headless-Altitude/dp/B07HSQJ387/ref=sr_1_6?keywords=drone&qid=1552244521&s=gateway&sr=8-6",
"seller": "by SIMREX",
"link": "/SIMREX-Foldable-Quadcopter-Headless-Altitude/dp/B07HSQJ387/ref=sr_1_6?keywords=drone&qid=1552244521&s=gateway&sr=8-6",
"title": "SIMREX X300C 8816 Mini Drone with Camera WiFi HD FPV Foldable RC Quadcopter RTF 4CH 2.4Ghz Remote Control Headless [Altitude Hold] Super Easy Fly for Training - White",
"stars": "4.0 out of 5 stars",
"num_reviews": "160",
"price": "$39.99",
"oldprice": "",
"rank": 6
},
{
"image": "/Holy-Stone-Wide-Angle-Quadcopter-Altitude/dp/B078WKT1HL/ref=sr_1_7?keywords=drone&qid=1552244521&s=gateway&sr=8-7",
"seller": "by Holy Stone",
"link": "/Holy-Stone-Wide-Angle-Quadcopter-Altitude/dp/B078WKT1HL/ref=sr_1_7?keywords=drone&qid=1552244521&s=gateway&sr=8-7",
"title": "Holy Stone HS110D FPV RC Drone with 720P HD Camera Live Video 120° Wide-Angle WiFi Quadcopter with Altitude Hold Headless Mode 3D Flips RTF with Modular Battery, Color Black",
"stars": "4.5 out of 5 stars",
"num_reviews": "633",
"price": "$129.99",
"oldprice": "",
"rank": 7
},
{
"image": "/Quadcopter-Drone-Camera-EACHINE-Foldable/dp/B07HMWK4C2/ref=sr_1_8?keywords=drone&qid=1552244521&s=gateway&sr=8-8",
"seller": "by EACHINE",
"link": "/Quadcopter-Drone-Camera-EACHINE-Foldable/dp/B07HMWK4C2/ref=sr_1_8?keywords=drone&qid=1552244521&s=gateway&sr=8-8",
"title": "Quadcopter Drone With Camera Live Video, EACHINE E58 WiFi FPV Quadcopter with 120° FOV 720P HD Camera Foldable Drone RTF - Altitude Hold, One Key Take Off/Landing, 3D Flip, APP Control3Pcs Batteries",
"stars": "4.0 out of 5 stars",
"num_reviews": "72",
"price": "$92.99$100.00",
"oldprice": "$100.00$100.00",
"rank": 8
},
{
"image": "/DROCON-Portable-Quadcopter-Altitude-Beginners/dp/B07FCCGXDL/ref=sr_1_9?keywords=drone&qid=1552244521&s=gateway&sr=8-9",
"seller": "by DROCON",
"link": "/DROCON-Portable-Quadcopter-Altitude-Beginners/dp/B07FCCGXDL/ref=sr_1_9?keywords=drone&qid=1552244521&s=gateway&sr=8-9",
"title": "DROCON Mini RC Drone for Kids, Portable Pocket Quadcopter with Altitude Hold Mode, One-Key Take-Off & Landing, 3D Flips and Headless Mode, Easy to Fly for Beginners, Great Gift",
"stars": "4.3 out of 5 stars",
"num_reviews": "344",
"price": "$32.99$59.99",
"oldprice": "$59.99$59.99",
"rank": 9
}
]
}
}
}

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.2.10", "version": "1.2.11",
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"homepage": "https://scrapeulous.com/", "homepage": "https://scrapeulous.com/",
"main": "index.js", "main": "index.js",

34
run.js
View File

@ -9,27 +9,7 @@ let config = {
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]', sleep_range: '[1,2]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'amazon',
// use specific search engine parameters for various search engines
// google_settings: {
// google_domain: 'google.com',
// gl: 'us', // The gl parameter determines the Google country to use for the query.
// hl: 'us', // The hl parameter determines the Google UI language to return results.
// start: 0, // Determines the results offset to use, defaults to 0.
// num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
// },
google_settings: '{"gl": "tr", "hl": "tr", "num": "50", "start": "0"}',
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
bing_settings: {
count: 50, // how many results per page
safeSearch: 'Off', // safe search (strict, moderate, off)
cc: 'us', // ISO 3166 country code
offset: 0, // The zero-based offset that indicates the number of search results to skip before returning results
},
// whether debug information should be printed // whether debug information should be printed
// debug info is useful for developers when debugging // debug info is useful for developers when debugging
debug: false, debug: false,
@ -37,18 +17,18 @@ let config = {
// this output is informational // this output is informational
verbose: true, verbose: true,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['good news'], keywords: ['drone', 'smartphone'],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '', keyword_file: '',
// the number of pages to scrape for each keyword // the number of pages to scrape for each keyword
num_pages: 2, num_pages: 1,
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: true, headless: false,
// path to output file, data will be stored in JSON // path to output file, data will be stored in JSON
output_file: 'examples/results/advanced.json', output_file: 'examples/results/amazon.json',
// whether to prevent images, css, fonts from being loaded // whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal // will speed up scraping a great deal
block_assets: true, block_assets: false,
// path to js module that extends functionality // path to js module that extends functionality
// this module should export the functions: // this module should export the functions:
// get_browser, handle_metadata, close_browser // get_browser, handle_metadata, close_browser
@ -66,7 +46,7 @@ let config = {
// check if headless chrome escapes common detection techniques // check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging // this is a quick test and should be used for debugging
test_evasion: false, test_evasion: false,
apply_evasion_techniques: false, apply_evasion_techniques: true,
// log ip address data // log ip address data
log_ip_address: false, log_ip_address: false,
// log http headers // log http headers

120
src/modules/amazon.js Normal file
View File

@ -0,0 +1,120 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
/**
 * Scraper for Amazon product search result pages.
 *
 * Relies on members inherited from the Scraper base class (not shown in this
 * file): this.page, this.config, this.result_rank, this.no_results(),
 * this.set_input_value(), this.sleep() and this.STANDARD_TIMEOUT.
 */
class AmazonScraper extends Scraper {

    /**
     * Extract the product listings from the HTML of a results page.
     *
     * Only products that have a link, a title, a price and a star rating are
     * kept; each kept product gets the next value of this.result_rank.
     *
     * @param {string} html - page source of a search results page
     * @returns {{time: string, num_results: string, no_results: *, effective_query: string, results: Array<Object>}}
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const cleaned = [];
        $('#search .s-result-item').each((i, product) => {
            const item = $(product);
            const image_box = item.find('[data-component-type="s-product-image"]');
            const prices = item.find('.a-price');

            // A discounted product renders two `.a-price` elements (current
            // price and the struck-through one marked data-a-color="secondary"),
            // and each of them contains the amount twice (off-screen text and
            // visible text). Read exactly one `.a-offscreen` node per price so
            // the values are neither concatenated nor duplicated.
            const price = prices.not('[data-a-color="secondary"]').first()
                .find('.a-offscreen').first().text();
            const oldprice = prices.filter('[data-a-color="secondary"]').first()
                .find('.a-offscreen').first().text();

            const res = {
                // Prefer the thumbnail source; fall back to the anchor href so
                // the field is still populated when no <img> is found.
                image: image_box.find('img').attr('src') || image_box.find('a').attr('href'),
                seller: item.find('h5 + div span').text(),
                link: item.find('h5 a').attr('href'),
                title: item.find('h5 a span').text(),
                stars: item.find('a i span').text(),
                num_reviews: item.find('span > a > span:first-child').text(),
                price: price,
                oldprice: oldprice,
            };

            const complete = res.link && res.link.trim()
                && res.title && res.title.trim()
                && res.price && res.price.trim()
                && res.stars.trim();

            if (complete) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        });

        const no_results = this.no_results(
            ['Keine Ergebnisse', 'No results for '],
            $('#search').text()
        );

        const effective_query = $('[data-component-type="s-result-info-bar"] span.a-text-bold').text() || '';

        return {
            time: (new Date()).toUTCString(),
            num_results: $('[data-component-type="s-result-info-bar"] .a-spacing-top-small').text(),
            no_results: no_results,
            effective_query: effective_query,
            results: cleaned
        }
    }

    /**
     * Open the start page and wait for the search box to appear.
     *
     * Without `amazon_settings` the Amazon home page is loaded. With it, the
     * `/s?` search URL of `amazon_domain` (default: amazon.com) is used and
     * every other setting is appended as a URL-encoded query parameter.
     *
     * @returns {Promise<boolean>} false when the search box did not show up in time
     */
    async load_start_page() {
        let startUrl = 'https://www.amazon.com/';

        const settings = this.config.amazon_settings;
        if (settings) {
            const domain = settings.amazon_domain || 'amazon.com';
            const query = Object.keys(settings)
                .filter((key) => key !== 'amazon_domain')
                // encode keys and values so '&', '#', spaces etc. cannot break the URL
                .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(settings[key])}`)
                .join('&');
            startUrl = `https://www.${domain}/s?${query}`;
        }

        if (this.config.verbose) {
            console.log('Using startUrl: ' + startUrl);
        }

        await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="field-keywords"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            // the search box never appeared; signal failure to the caller
            return false;
        }

        return true;
    }

    /**
     * Type the keyword into the search box and submit it with Enter.
     *
     * @param {string} keyword - search term
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="field-keywords"]');
        await this.set_input_value(`input[name="field-keywords"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click the "next page" link of the pagination.
     *
     * @returns {Promise<boolean>} false when there is no next page link
     */
    async next_page() {
        // page.$() resolves immediately with the element or null
        const next_page_link = await this.page.$('.a-last a');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Wait until the result list is present, then pause for another 500
     * (units as defined by the base class sleep()).
     */
    async wait_for_results() {
        await this.page.waitForSelector('.s-result-list', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * Check whether the current page looks like a bot-detection page.
     *
     * NOTE(review): both markers are identical to the ones used by
     * GoogleScraper.detected() and look Google specific; confirm whether they
     * can ever match an Amazon block/captcha page.
     *
     * @returns {Promise<boolean>}
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
    }
}
// Public API of this module.
module.exports = { AmazonScraper };

View File

@ -1,6 +1,418 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const Scraper = require('./se_scraper'); const Scraper = require('./se_scraper');
/**
 * Scraper for regular Google web search result pages.
 *
 * Relies on members inherited from the Scraper base class (not shown in this
 * file): this.page, this.config, this.result_rank, this.no_results(),
 * this.set_input_value(), this.sleep() and this.STANDARD_TIMEOUT.
 */
class GoogleScraper extends Scraper {

    /**
     * Extract the organic results from the HTML of a results page.
     *
     * Only entries that have both a link and a title are kept; each kept
     * entry gets the next value of this.result_rank.
     *
     * @param {string} html - page source of a search results page
     * @returns {{time: string, num_results: string, no_results: *, effective_query: string, results: Array<Object>}}
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const cleaned = [];
        $('#center_col .g').each((i, link) => {
            const entry = $(link);
            const res = {
                link: entry.find('.r a').attr('href'),
                title: entry.find('.r a').text(),
                snippet: entry.find('span.st').text(),
                visible_link: entry.find('.r cite').text(),
                date: entry.find('span.f').text() || '',
            };

            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        });

        // 'Ergebnisse für', 'Showing results for'
        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for'],
            $('#main').text()
        );

        // the corrected query is read from #fprsl, falling back to #fprs a
        let effective_query = $('#fprsl').text() || '';
        if (!effective_query) {
            effective_query = $('#fprs a').text()
        }

        return {
            time: (new Date()).toUTCString(),
            num_results: $('#resultStats').text(),
            no_results: no_results,
            effective_query: effective_query,
            results: cleaned
        }
    }

    /**
     * Open the start page and wait for the search box to appear.
     *
     * Without `google_settings` the Google home page is loaded. With it, the
     * `/search?` URL of `google_domain` (default: google.com) is used and
     * every other setting is appended as a URL-encoded query parameter.
     *
     * @returns {Promise<boolean>} false when the search box did not show up in time
     */
    async load_start_page() {
        let startUrl = 'https://www.google.com';

        const settings = this.config.google_settings;
        if (settings) {
            const domain = settings.google_domain || 'google.com';
            const query = Object.keys(settings)
                .filter((key) => key !== 'google_domain')
                // encode keys and values so '&', '#', spaces etc. cannot break the URL
                .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(settings[key])}`)
                .join('&');
            startUrl = `https://www.${domain}/search?${query}`;
        }

        if (this.config.verbose) {
            console.log('Using startUrl: ' + startUrl);
        }

        await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            // the search box never appeared; signal failure to the caller
            return false;
        }

        return true;
    }

    /**
     * Type the keyword into the search box and submit it with Enter.
     *
     * @param {string} keyword - search term
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click the "next page" link (#pnnext).
     *
     * @returns {Promise<boolean>} false when there is no next page link
     */
    async next_page() {
        // page.$() resolves immediately with the element or null
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Wait until the results column is present, then pause for another 500
     * (units as defined by the base class sleep()).
     */
    async wait_for_results() {
        await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * Check whether the current page looks like a bot-detection page: the
     * page source mentions 'detected unusual traffic' or the title contains
     * '/sorry/'.
     *
     * @returns {Promise<boolean>}
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
    }
}
/**
 * Scraper for the classic Google News result pages (`tbm=nws`).
 */
class GoogleNewsOldScraper extends Scraper {

    /**
     * Extracts the news hits from a result page.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `results` (entries with a non-blank link, each
     *     given the next running rank), `no_results` and `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const results = [];
        $('.g').each((i, result) => {
            const headline = $(result).find('h3 a');
            results.push({
                link: headline.attr('href'),
                title: headline.text(),
                snippet: $(result).find('.st').text(),
                date: $(result).find('.nsa').text(),
            });
        });

        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for', 'did not match any news results'],
            $('#main').text()
        );

        // Prefer the text of #fprsl; fall back to the link inside #fprs.
        const effective_query = $('#fprsl').text() || $('#fprs a').text();

        const cleaned = [];
        for (const res of results) {
            if (res.link && res.link.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
            no_results: no_results,
            effective_query: effective_query,
        };
    }

    /**
     * Nothing to load up front: search_keyword navigates straight to the
     * result URL.
     *
     * @returns {Promise<boolean>} always true
     */
    async load_start_page() {
        return true;
    }

    /**
     * Navigates to the news result page for the keyword and waits for the
     * search box.
     *
     * @param {string} keyword - search term
     */
    async search_keyword(keyword) {
        // Encode the keyword: a raw '&', '#' or '+' would otherwise truncate
        // or alter the `q` parameter.
        const query = encodeURIComponent(keyword);
        await this.page.goto(`https://www.google.com/search?q=${query}&hl=en&source=lnms&tbm=nws`, {
            referer: 'https://www.google.com/'
        });

        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
    }

    /**
     * Clicks the "next page" link (`#pnnext`) if present.
     *
     * @returns {Promise<boolean>} true when the link was clicked, false when
     *     the page has none.
     */
    async next_page() {
        // page.$ has no timeout option; it resolves at once with handle or null.
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Waits until `#main` is present, then pauses another 500 ms.
     */
    async wait_for_results() {
        await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.includes('detected unusual traffic') || title.includes('/sorry/');
    }
}
/**
 * Scraper for Google image search.
 */
class GoogleImageScraper extends Scraper {

    /**
     * Extracts the image hits from a result page.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `no_results`, `results` (entries whose trimmed
     *     link is longer than 10 characters, each given the next running
     *     rank) and `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const found = [];
        $('.rg_bx').each((index, box) => {
            const href = $(box).find('a.rg_l').attr('href');
            found.push({
                link: href,
                clean_link: clean_image_url(href),
                snippet: $(box).find('.a-no-hover-decoration').text(),
            });
        });

        const noResultMarkers = [
            'stimmt mit keinem Bildergebnis',
            'Keine Ergebnisse für',
            'not match any image results',
            'No results found for',
        ];
        const no_results = this.no_results(noResultMarkers, $('#main').text());

        // Prefer the text of #fprsl; fall back to the link inside #fprs.
        const effective_query = $('#fprsl').text() || $('#fprs a').text();

        const cleaned = [];
        for (const entry of found) {
            const trimmedLink = entry.link ? entry.link.trim() : '';
            if (trimmedLink.length > 10) {
                entry.link = trimmedLink;
                entry.rank = this.result_rank++;
                cleaned.push(entry);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            no_results,
            results: cleaned,
            effective_query,
        };
    }

    /**
     * Opens the image search start page and waits for the search box.
     *
     * @returns {Promise<boolean>} false when navigating or waiting threw,
     *     true otherwise.
     */
    async load_start_page() {
        const navigation = { referer: 'https://www.google.com/' };
        try {
            await this.page.goto(`https://www.google.com/imghp?tbm=isch`, navigation);
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
            return true;
        } catch (e) {
            return false;
        }
    }

    /**
     * Puts the keyword into the search box via set_input_value, then focuses
     * the box and presses Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        const searchBox = 'input[name="q"]';
        const boxHandle = await this.page.$(searchBox);

        await this.set_input_value(searchBox, keyword);
        await this.sleep(50);

        await boxHandle.focus();
        await this.page.keyboard.press('Enter');
    }

    /**
     * Paging is not implemented for image search.
     *
     * @returns {Promise<boolean>} always false
     */
    async next_page() {
        return false;
    }

    /**
     * Waits until `#main` is present, then pauses another 500 ms.
     */
    async wait_for_results() {
        const waitOptions = { timeout: this.STANDARD_TIMEOUT };
        await this.page.waitForSelector('#main', waitOptions);
        await this.sleep(500);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const pageTitle = await this.page.title();
        const markup = await this.page.content();

        if (markup.includes('detected unusual traffic')) {
            return true;
        }
        return pageTitle.includes('/sorry/');
    }
}
/**
 * Scraper for the Google News app (https://news.google.com).
 */
class GoogleNewsScraper extends Scraper {

    /**
     * Extracts headlines from a news page, skipping titles already recorded
     * in `this.all_results`.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `results` (entries with a non-blank title,
     *     each given the next running rank), `no_results` and
     *     `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const results = [];
        $('article h3').each((i, headline) => {
            const title = $(headline).find('a span').text();

            if (!this.all_results.has(title)) {
                // The <time> element is looked up from three levels above the headline.
                const timeElement = $(headline).parent().parent().parent().find('time');
                results.push({
                    // Placeholder that keeps `rank` first in the serialized
                    // object; overwritten below with the running rank.
                    rank: i + 1,
                    title: title,
                    snippet: $(headline).parent().find('p').text(),
                    link: $(headline).find('a').attr('href'),
                    date: timeElement.text(),
                    ts: timeElement.attr('datetime'),
                });
            }
            this.all_results.add(title);
        });

        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for', 'did not match any news results'],
            $('body').text()
        );

        const effective_query = $('#fprsl').text() || '';

        const cleaned = [];
        for (const res of results) {
            if (res.title && res.title.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
            no_results: no_results,
            effective_query: effective_query,
        };
    }

    /**
     * Opens the news front page, stores its parsed headlines under
     * `this.results['frontpage']` and resets the running rank to 1.
     *
     * @returns {Promise<boolean>} false when any step threw, true otherwise.
     */
    async load_start_page() {
        try {
            this.all_results = new Set();
            await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
                referer: 'https://news.google.com'
            });

            await this.page.waitForSelector('div input:nth-child(2)', {timeout: this.STANDARD_TIMEOUT});
            await this.sleep(1000);

            // parse here front page results
            const html = await this.page.content();
            this.results['frontpage'] = this.parse(html);
            this.result_rank = 1;
        } catch(e) {
            return false;
        }
        return true;
    }

    /**
     * Types the keyword into the search input and submits it with Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        await this.page.waitForSelector('div input:nth-child(2)', { timeout: this.STANDARD_TIMEOUT });
        const input = await this.page.$('div input:nth-child(2)');
        // overwrites last text in input
        await input.click({ clickCount: 3 });
        await input.type(keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * @returns {Promise<boolean>} always false
     */
    async next_page() {
        // google news app does not have next pages
        return false;
    }

    /**
     * Waits for an element whose `data-n-q` attribute equals the current
     * keyword, then pauses another 2000 ms.
     */
    async wait_for_results() {
        // Escape backslashes and double quotes so a keyword containing them
        // cannot terminate the quoted attribute value and break the selector.
        const escapedKeyword = String(this.keyword).replace(/["\\]/g, '\\$&');
        await this.page.waitForSelector(`[data-n-q="${escapedKeyword}"]`, { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(2000);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.includes('detected unusual traffic') || title.includes('/sorry/');
    }
}
/**
 * Extracts the original image URL from a Google image-result link.
 *
 * Example input:
 * https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
 *
 * @param {string} url - link taken from an image result
 * @returns {string|undefined} the decoded `imgurl` value, or undefined when
 *     the link carries no `imgurl` parameter.
 */
function clean_image_url(url) {
    // `[^&]*` stops at the next parameter but also matches when `imgurl` is
    // the last one (the previous pattern required a trailing '&').
    const match = /imgurl=([^&]*)/.exec(url);
    if (match === null) {
        return undefined;
    }
    try {
        return decodeURIComponent(match[1]);
    } catch (e) {
        if (e instanceof URIError) {
            // Malformed percent-encoding: return the raw value instead of
            // aborting the caller's whole parse.
            return match[1];
        }
        throw e;
    }
}
/**
 * Unwraps a Google redirect link (`/url?q=<target>&...`) to its target.
 *
 * Examples:
 * /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF
 * /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA
 *
 * @param {string} url - link as found in the page
 * @returns {string} the decoded target of a `/url?q=` link; any other input
 *     is returned unchanged.
 */
function clean_google_url(url) {
    // `[^&]*` stops at the next parameter but also matches when `q` is the
    // last one (the previous pattern required a trailing '&').
    const match = /url\?q=([^&]*)/.exec(url);
    if (match === null) {
        return url;
    }
    try {
        return decodeURIComponent(match[1]);
    } catch (e) {
        if (e instanceof URIError) {
            // Malformed percent-encoding: fall back to the raw target.
            return match[1];
        }
        throw e;
    }
}
module.exports = {
GoogleNewsOldScraper: GoogleNewsOldScraper,
GoogleScraper: GoogleScraper,
GoogleImageScraper: GoogleImageScraper,
GoogleNewsScraper: GoogleNewsScraper,
};
// https://developers.google.com/custom-search/v1/cse/list // https://developers.google.com/custom-search/v1/cse/list
const GOOGLE_DOMAINS = { const GOOGLE_DOMAINS = {
@ -536,415 +948,3 @@ const GOOGLE_HL = {
'xh': 'Xhosa', 'xh': 'Xhosa',
'zu': 'Zulu' 'zu': 'Zulu'
}; };
/**
 * Scraper for the regular Google web search.
 */
class GoogleScraper extends Scraper {

    /**
     * Extracts the organic hits from a result page.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `num_results` (text of `#resultStats`),
     *     `no_results`, `effective_query` and `results` (entries with a
     *     non-blank link and title, each given the next running rank).
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const results = [];
        $('#center_col .g').each((i, link) => {
            const anchor = $(link).find('.r a');
            results.push({
                link: anchor.attr('href'),
                title: anchor.text(),
                snippet: $(link).find('span.st').text(),
                visible_link: $(link).find('.r cite').text(),
                date: $(link).find('span.f').text() || '',
            });
        });

        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for'],
            $('#main').text()
        );

        // Prefer the text of #fprsl; fall back to the link inside #fprs.
        const effective_query = $('#fprsl').text() || $('#fprs a').text();

        const cleaned = [];
        for (const res of results) {
            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            num_results: $('#resultStats').text(),
            no_results: no_results,
            effective_query: effective_query,
            results: cleaned
        };
    }

    /**
     * Opens the Google start page and waits for the search box.
     *
     * Without `config.google_settings` the plain home page is loaded. With
     * it, a `/search?` URL is built on `google_settings.google_domain`
     * (google.com when that is not set) and every other setting is appended
     * as a query parameter.
     *
     * @returns {Promise<boolean>} true once `input[name="q"]` is present,
     *     false when waiting for it failed.
     */
    async load_start_page() {
        let startUrl = 'https://www.google.com';

        const settings = this.config.google_settings;
        if (settings) {
            const domain = settings.google_domain || 'google.com';
            // Percent-encode names and values so spaces, '&' or '#' inside a
            // setting cannot corrupt the query string.
            const params = Object.keys(settings)
                .filter((key) => key !== 'google_domain')
                .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(settings[key])}`);
            startUrl = `https://www.${domain}/search?${params.join('&')}`;
        }

        if (this.config.verbose) {
            console.log('Using startUrl: ' + startUrl);
        }

        await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }

        return true;
    }

    /**
     * Puts the keyword into the search box via set_input_value, then focuses
     * the box and presses Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Clicks the "next page" link (`#pnnext`) if present.
     *
     * @returns {Promise<boolean>} true when the link was clicked, false when
     *     the page has none.
     */
    async next_page() {
        // page.$ has no timeout option; it resolves at once with handle or null.
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Waits until `#center_col` is present, then pauses another 500 ms.
     */
    async wait_for_results() {
        await this.page.waitForSelector('#center_col', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.includes('detected unusual traffic') || title.includes('/sorry/');
    }
}
/**
 * Scraper for the classic Google News result pages (`tbm=nws`).
 */
class GoogleNewsOldScraper extends Scraper {

    /**
     * Extracts the news hits from a result page.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `results` (entries with a non-blank link, each
     *     given the next running rank), `no_results` and `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const results = [];
        $('.g').each((i, result) => {
            const headline = $(result).find('h3 a');
            results.push({
                link: headline.attr('href'),
                title: headline.text(),
                snippet: $(result).find('.st').text(),
                date: $(result).find('.nsa').text(),
            });
        });

        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for', 'did not match any news results'],
            $('#main').text()
        );

        // Prefer the text of #fprsl; fall back to the link inside #fprs.
        const effective_query = $('#fprsl').text() || $('#fprs a').text();

        const cleaned = [];
        for (const res of results) {
            if (res.link && res.link.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
            no_results: no_results,
            effective_query: effective_query,
        };
    }

    /**
     * Nothing to load up front: search_keyword navigates straight to the
     * result URL.
     *
     * @returns {Promise<boolean>} always true
     */
    async load_start_page() {
        return true;
    }

    /**
     * Navigates to the news result page for the keyword and waits for the
     * search box.
     *
     * @param {string} keyword - search term
     */
    async search_keyword(keyword) {
        // Encode the keyword: a raw '&', '#' or '+' would otherwise truncate
        // or alter the `q` parameter.
        const query = encodeURIComponent(keyword);
        await this.page.goto(`https://www.google.com/search?q=${query}&hl=en&source=lnms&tbm=nws`, {
            referer: 'https://www.google.com/'
        });

        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
    }

    /**
     * Clicks the "next page" link (`#pnnext`) if present.
     *
     * @returns {Promise<boolean>} true when the link was clicked, false when
     *     the page has none.
     */
    async next_page() {
        // page.$ has no timeout option; it resolves at once with handle or null.
        const next_page_link = await this.page.$('#pnnext');
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    /**
     * Waits until `#main` is present, then pauses another 500 ms.
     */
    async wait_for_results() {
        await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(500);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.includes('detected unusual traffic') || title.includes('/sorry/');
    }
}
/**
 * Scraper for Google image search.
 */
class GoogleImageScraper extends Scraper {

    /**
     * Extracts the image hits from a result page.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `no_results`, `results` (entries whose trimmed
     *     link is longer than 10 characters, each given the next running
     *     rank) and `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const found = [];
        $('.rg_bx').each((index, box) => {
            const href = $(box).find('a.rg_l').attr('href');
            found.push({
                link: href,
                clean_link: clean_image_url(href),
                snippet: $(box).find('.a-no-hover-decoration').text(),
            });
        });

        const noResultMarkers = [
            'stimmt mit keinem Bildergebnis',
            'Keine Ergebnisse für',
            'not match any image results',
            'No results found for',
        ];
        const no_results = this.no_results(noResultMarkers, $('#main').text());

        // Prefer the text of #fprsl; fall back to the link inside #fprs.
        const effective_query = $('#fprsl').text() || $('#fprs a').text();

        const cleaned = [];
        for (const entry of found) {
            const trimmedLink = entry.link ? entry.link.trim() : '';
            if (trimmedLink.length > 10) {
                entry.link = trimmedLink;
                entry.rank = this.result_rank++;
                cleaned.push(entry);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            no_results,
            results: cleaned,
            effective_query,
        };
    }

    /**
     * Opens the image search start page and waits for the search box.
     *
     * @returns {Promise<boolean>} false when navigating or waiting threw,
     *     true otherwise.
     */
    async load_start_page() {
        const navigation = { referer: 'https://www.google.com/' };
        try {
            await this.page.goto(`https://www.google.com/imghp?tbm=isch`, navigation);
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
            return true;
        } catch (e) {
            return false;
        }
    }

    /**
     * Puts the keyword into the search box via set_input_value, then focuses
     * the box and presses Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        const searchBox = 'input[name="q"]';
        const boxHandle = await this.page.$(searchBox);

        await this.set_input_value(searchBox, keyword);
        await this.sleep(50);

        await boxHandle.focus();
        await this.page.keyboard.press('Enter');
    }

    /**
     * Paging is not implemented for image search.
     *
     * @returns {Promise<boolean>} always false
     */
    async next_page() {
        return false;
    }

    /**
     * Waits until `#main` is present, then pauses another 500 ms.
     */
    async wait_for_results() {
        const waitOptions = { timeout: this.STANDARD_TIMEOUT };
        await this.page.waitForSelector('#main', waitOptions);
        await this.sleep(500);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const pageTitle = await this.page.title();
        const markup = await this.page.content();

        if (markup.includes('detected unusual traffic')) {
            return true;
        }
        return pageTitle.includes('/sorry/');
    }
}
/**
 * Scraper for the Google News app (https://news.google.com).
 */
class GoogleNewsScraper extends Scraper {

    /**
     * Extracts headlines from a news page, skipping titles already recorded
     * in `this.all_results`.
     *
     * @param {string} html - page source
     * @returns {Object} `time`, `results` (entries with a non-blank title,
     *     each given the next running rank), `no_results` and
     *     `effective_query`.
     */
    parse(html) {
        const $ = cheerio.load(html);

        const results = [];
        $('article h3').each((i, headline) => {
            const title = $(headline).find('a span').text();

            if (!this.all_results.has(title)) {
                // The <time> element is looked up from three levels above the headline.
                const timeElement = $(headline).parent().parent().parent().find('time');
                results.push({
                    // Placeholder that keeps `rank` first in the serialized
                    // object; overwritten below with the running rank.
                    rank: i + 1,
                    title: title,
                    snippet: $(headline).parent().find('p').text(),
                    link: $(headline).find('a').attr('href'),
                    date: timeElement.text(),
                    ts: timeElement.attr('datetime'),
                });
            }
            this.all_results.add(title);
        });

        const no_results = this.no_results(
            ['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
                'No results found for', 'did not match any news results'],
            $('body').text()
        );

        const effective_query = $('#fprsl').text() || '';

        const cleaned = [];
        for (const res of results) {
            if (res.title && res.title.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned,
            no_results: no_results,
            effective_query: effective_query,
        };
    }

    /**
     * Opens the news front page, stores its parsed headlines under
     * `this.results['frontpage']` and resets the running rank to 1.
     *
     * @returns {Promise<boolean>} false when any step threw, true otherwise.
     */
    async load_start_page() {
        try {
            this.all_results = new Set();
            await this.page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
                referer: 'https://news.google.com'
            });

            await this.page.waitForSelector('div input:nth-child(2)', {timeout: this.STANDARD_TIMEOUT});
            await this.sleep(1000);

            // parse here front page results
            const html = await this.page.content();
            this.results['frontpage'] = this.parse(html);
            this.result_rank = 1;
        } catch(e) {
            return false;
        }
        return true;
    }

    /**
     * Types the keyword into the search input and submits it with Enter.
     *
     * @param {string} keyword - search term to submit
     */
    async search_keyword(keyword) {
        await this.page.waitForSelector('div input:nth-child(2)', { timeout: this.STANDARD_TIMEOUT });
        const input = await this.page.$('div input:nth-child(2)');
        // overwrites last text in input
        await input.click({ clickCount: 3 });
        await input.type(keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * @returns {Promise<boolean>} always false
     */
    async next_page() {
        // google news app does not have next pages
        return false;
    }

    /**
     * Waits for an element whose `data-n-q` attribute equals the current
     * keyword, then pauses another 2000 ms.
     */
    async wait_for_results() {
        // Escape backslashes and double quotes so a keyword containing them
        // cannot terminate the quoted attribute value and break the selector.
        const escapedKeyword = String(this.keyword).replace(/["\\]/g, '\\$&');
        await this.page.waitForSelector(`[data-n-q="${escapedKeyword}"]`, { timeout: this.STANDARD_TIMEOUT });
        await this.sleep(2000);
    }

    /**
     * @returns {Promise<boolean>} true when the page source contains
     *     'detected unusual traffic' or the title contains '/sorry/'.
     */
    async detected() {
        const title = await this.page.title();
        const html = await this.page.content();
        return html.includes('detected unusual traffic') || title.includes('/sorry/');
    }
}
/**
 * Extracts the original image URL from a Google image-result link.
 *
 * Example input:
 * https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
 *
 * @param {string} url - link taken from an image result
 * @returns {string|undefined} the decoded `imgurl` value, or undefined when
 *     the link carries no `imgurl` parameter.
 */
function clean_image_url(url) {
    // `[^&]*` stops at the next parameter but also matches when `imgurl` is
    // the last one (the previous pattern required a trailing '&').
    const match = /imgurl=([^&]*)/.exec(url);
    if (match === null) {
        return undefined;
    }
    try {
        return decodeURIComponent(match[1]);
    } catch (e) {
        if (e instanceof URIError) {
            // Malformed percent-encoding: return the raw value instead of
            // aborting the caller's whole parse.
            return match[1];
        }
        throw e;
    }
}
/**
 * Unwraps a Google redirect link (`/url?q=<target>&...`) to its target.
 *
 * Examples:
 * /url?q=https://www.zeit.de/thema/donald-trump&sa=U&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQFgg0MAc&usg=AOvVaw3JV3UZjTXRwaS2I-sBbeXF
 * /search?q=trump&hl=de&gbv=2&ie=UTF-8&prmd=ivns&source=univ&tbm=nws&tbo=u&sa=X&ved=0ahUKEwiL9-u-_ZLgAhVJsqQKHeITDAoQqAIIFA
 *
 * @param {string} url - link as found in the page
 * @returns {string} the decoded target of a `/url?q=` link; any other input
 *     is returned unchanged.
 */
function clean_google_url(url) {
    // `[^&]*` stops at the next parameter but also matches when `q` is the
    // last one (the previous pattern required a trailing '&').
    const match = /url\?q=([^&]*)/.exec(url);
    if (match === null) {
        return url;
    }
    try {
        return decodeURIComponent(match[1]);
    } catch (e) {
        if (e instanceof URIError) {
            // Malformed percent-encoding: fall back to the raw target.
            return match[1];
        }
        throw e;
    }
}
module.exports = {
GoogleNewsOldScraper: GoogleNewsOldScraper,
GoogleScraper: GoogleScraper,
GoogleImageScraper: GoogleImageScraper,
GoogleNewsScraper: GoogleNewsScraper,
};

View File

@ -1,6 +1,7 @@
const zlib = require('zlib'); const zlib = require('zlib');
var fs = require('fs'); var fs = require('fs');
const google = require('./modules/google.js'); const google = require('./modules/google.js');
const amazon = require('./modules/amazon.js');
const bing = require('./modules/bing.js'); const bing = require('./modules/bing.js');
const baidu = require('./modules/baidu.js'); const baidu = require('./modules/baidu.js');
const infospace = require('./modules/infospace.js'); const infospace = require('./modules/infospace.js');
@ -24,6 +25,7 @@ function getScraper(searchEngine, args) {
google_image: google.GoogleImageScraper, google_image: google.GoogleImageScraper,
bing: bing.BingScraper, bing: bing.BingScraper,
bing_news: bing.BingNewsScraper, bing_news: bing.BingNewsScraper,
amazon: amazon.AmazonScraper,
duckduckgo: duckduckgo.DuckduckgoScraper, duckduckgo: duckduckgo.DuckduckgoScraper,
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper, duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
infospace: infospace.InfospaceScraper, infospace: infospace.InfospaceScraper,

149
test/test_amazon.js Normal file
View File

@ -0,0 +1,149 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['iphone', 'clock'];

/**
 * Scrapes Amazon for the regular keywords (one page each, headless) and
 * passes the outcome to normal_search_test_case.
 */
async function normal_search_test() {
    console.log('normal_search_test()');

    const scrape_config = {
        search_engine: 'amazon',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    await se_scraper.scrape(scrape_config, normal_search_test_case);
}
/**
 * Callback handed to se_scraper.scrape by normal_search_test: validates the
 * response envelope and every page and product entry of the results.
 *
 * @param {*} err - error reported by the scraper; logged, nothing is asserted
 * @param {Object} response - scraper response (`headers`, `statusCode`,
 *     `metadata`, `results`)
 */
function normal_search_test_case(err, response) {
    if (err) {
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');
    assert.equal(response.metadata.num_requests, 2);

    // Checked once, up front: inside the loop it would be skipped entirely
    // when `results` comes back empty.
    assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

    for (let query in response.results) {
        let total_rank = 1;

        for (let page_number in response.results[query]) {

            // NaN is of type number, so test for NaN explicitly.
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse yields NaN (still a number) for invalid input.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            for (let res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'image', 'seller', 'stars', 'num_reviews', 'price', 'oldprice'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.seller, 'seller must be ok');
                assert.typeOf(res.seller, 'string', 'seller must be string');
                assert.isAtLeast(res.seller.length, 5, 'seller must have at least 5 chars');

                assert.isOk(res.stars, 'stars be ok');
                assert.typeOf(res.stars, 'string', 'stars must be string');
                assert.isAtLeast(res.stars.length, 5, 'stars must have at least 5 chars');
                assert.include(res.stars, ' out of ', 'stars must include " out of "');

                assert.isOk(res.num_reviews, 'num_reviews be ok');
                assert.typeOf(res.num_reviews, 'string', 'num_reviews must be string');
                assert.isAtLeast(res.num_reviews.length, 1, 'num_reviews must have at least 1 chars');

                assert.isOk(res.price, 'price be ok');
                assert.typeOf(res.price, 'string', 'price must be string');
                assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank is wrong');
            }
        }
    }
}
const keywords_no_results = ['2342kljp;fj9834u40abJ54634344023safkl34a44dsflkjaQQuBBdfk',];

/**
 * Scrapes Amazon for a keyword that should match nothing and passes the
 * outcome to test_case_no_results.
 */
async function no_results_test() {
    console.log('no_results_test()');

    const scrape_config = {
        search_engine: 'amazon',
        compress: false,
        debug: false,
        verbose: false,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        random_user_agent: false,
    };

    await se_scraper.scrape(scrape_config, test_case_no_results);
}
/**
 * Callback handed to se_scraper.scrape by no_results_test: validates the
 * response envelope and that every page reports an empty result set.
 *
 * @param {*} err - error reported by the scraper; logged, nothing is asserted
 * @param {Object} response - scraper response (`headers`, `statusCode`,
 *     `metadata`, `results`)
 */
function test_case_no_results(err, response) {
    if (err) {
        console.error(err);
        return;
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');
    assert.equal(response.metadata.num_requests, 1);

    // Checked once, up front: inside the loop it would be skipped entirely
    // when `results` comes back empty.
    assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            // NaN is of type number, so test for NaN explicitly.
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

            assert(obj.results.length === 0, 'results must have 0 SERP objects');
            assert.equal(obj.no_results, true, 'no results should be true');
            assert.isEmpty(obj.num_results, 'no results should be a empty string');
            // Date.parse yields NaN (still a number) for invalid input.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');
        }
    }
}
// Run both scenarios one after the other. A rejection (for example a failed
// assertion surfacing through scrape) is reported and turned into a non-zero
// exit code instead of being left as an unhandled promise rejection.
(async () => {
    await normal_search_test();
    await no_results_test();
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});