From 7a8c6f13f04857919be565d693b807f8f2428471 Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Thu, 14 Mar 2019 23:33:46 +0100 Subject: [PATCH] fixed #11 by improving baidu a lot in speed and quality --- README.md | 1 + examples/baidu.js | 17 +++ examples/results/baidu.json | 286 ++++++++++++++++++++++++++++++++++++ package.json | 4 +- run.js | 8 +- src/modules/baidu.js | 119 ++++++++------- src/modules/se_scraper.js | 2 +- test/test_baidu.js | 90 ++++++++++++ 8 files changed, 459 insertions(+), 68 deletions(-) create mode 100644 examples/baidu.js create mode 100644 examples/results/baidu.json create mode 100644 test/test_baidu.js diff --git a/README.md b/README.md index f8dfdf5..4a0f955 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ This will scrape with **three** browser instance each having their own IP addres ## Examples * [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json) +* [Simple example scraping baidu](examples/baidu.js) yields [these results](examples/results/baidu.json) * [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json) * [Scrape 100 keywords on Bing with multible tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json) * [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json) diff --git a/examples/baidu.js b/examples/baidu.js new file mode 100644 index 0000000..a456f65 --- /dev/null +++ b/examples/baidu.js @@ -0,0 +1,17 @@ +const se_scraper = require('./../index.js'); + +let config = { + search_engine: 'baidu', + debug: false, + verbose: false, + keywords: ['cat', 'mouse'], + num_pages: 2, + output_file: 'examples/results/baidu.json', +}; + +function callback(err, response) { + if (err) { console.error(err) } + console.dir(response, {depth: null, colors: true}); +} + +se_scraper.scrape(config, callback); \ No newline at end of file diff --git a/examples/results/baidu.json b/examples/results/baidu.json new file mode 100644 index 0000000..75fbe58 --- /dev/null +++ b/examples/results/baidu.json @@ -0,0 +1,286 @@ +{ + "cat": { + "1": { + "time": "Thu, 14 Mar 2019 22:31:20 GMT", + "no_results": false, + "num_results": "百度为您找到相关结果约31,900,000个", + "results": [ + { + "link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q", + "title": "cat_百度百科", + "snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...", + "visible_link": "百度百科 - 百度快照", + "rank": 1 + }, + { + "link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy", + "title": "Cat | 亚太区 | Caterpillar", + "snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...", + "visible_link": "https://www.cat.com/zh_...html  - 百度快照 - 36条评价", + "rank": 2 + }, + { + "link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku", + "title": "Cat | global-selector | Caterpillar", + "snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...", + "visible_link": "https://www.cat.com/  - 百度快照 - 36条评价 - 翻译此页", + "rank": 3 + }, + { + "link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY", + "title": "CAT - 京东", + "snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息", + "visible_link": "京东 - 百度快照", + "rank": 4 + }, + { + "link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi", + "title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客", + "snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 5 + }, + { + "link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_", + "title": "Linux cat命令 | 菜鸟教程", + "snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...", + "visible_link": "www.runoob.com/linux/l...  - 百度快照", + "rank": 6 + }, + { + "link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u", + "title": "cat /proc/media-mem - 留点什么... - CSDN博客", + "snippet": "2018年10月27日 - ", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 7 + }, + { + "link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB", + "title": "CAT", + "snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...", + "visible_link": "www.neuro.uni-jena.de/...  - 百度快照 - 翻译此页", + "rank": 8 + }, + { + "link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e", + "title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客", + "snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 9 + } + ] + }, + "2": { + "time": "Thu, 14 Mar 2019 22:31:20 GMT", + "no_results": false, + "num_results": "百度为您找到相关结果约31,900,000个", + "results": [ + { + "link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q", + "title": "cat_百度百科", + "snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...", + "visible_link": "百度百科 - 百度快照", + "rank": 10 + }, + { + "link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy", + "title": "Cat | 亚太区 | Caterpillar", + "snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...", + "visible_link": "https://www.cat.com/zh_...html  - 百度快照 - 36条评价", + "rank": 11 + }, + { + "link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku", + "title": "Cat | global-selector | Caterpillar", + "snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...", + "visible_link": "https://www.cat.com/  - 百度快照 - 36条评价 - 翻译此页", + "rank": 12 + }, + { + "link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY", + "title": "CAT - 京东", + "snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息", + "visible_link": "京东 - 百度快照", + "rank": 13 + }, + { + "link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi", + "title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客", + "snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 14 + }, + { + "link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_", + "title": "Linux cat命令 | 菜鸟教程", + "snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...", + "visible_link": "www.runoob.com/linux/l...  - 百度快照", + "rank": 15 + }, + { + "link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u", + "title": "cat /proc/media-mem - 留点什么... - CSDN博客", + "snippet": "2018年10月27日 - ", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 16 + }, + { + "link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB", + "title": "CAT", + "snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...", + "visible_link": "www.neuro.uni-jena.de/...  - 百度快照 - 翻译此页", + "rank": 17 + }, + { + "link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e", + "title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客", + "snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 18 + } + ] + } + }, + "mouse": { + "1": { + "time": "Thu, 14 Mar 2019 22:31:21 GMT", + "no_results": false, + "num_results": "百度为您找到相关结果约31,900,000个", + "results": [ + { + "link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q", + "title": "cat_百度百科", + "snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...", + "visible_link": "百度百科 - 百度快照", + "rank": 1 + }, + { + "link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy", + "title": "Cat | 亚太区 | Caterpillar", + "snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...", + "visible_link": "https://www.cat.com/zh_...html  - 百度快照 - 36条评价", + "rank": 2 + }, + { + "link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku", + "title": "Cat | global-selector | Caterpillar", + "snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...", + "visible_link": "https://www.cat.com/  - 百度快照 - 36条评价 - 翻译此页", + "rank": 3 + }, + { + "link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY", + "title": "CAT - 京东", + "snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息", + "visible_link": "京东 - 百度快照", + "rank": 4 + }, + { + "link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi", + "title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客", + "snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 5 + }, + { + "link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_", + "title": "Linux cat命令 | 菜鸟教程", + "snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...", + "visible_link": "www.runoob.com/linux/l...  - 百度快照", + "rank": 6 + }, + { + "link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u", + "title": "cat /proc/media-mem - 留点什么... - CSDN博客", + "snippet": "2018年10月27日 - ", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 7 + }, + { + "link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB", + "title": "CAT", + "snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...", + "visible_link": "www.neuro.uni-jena.de/...  - 百度快照 - 翻译此页", + "rank": 8 + }, + { + "link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e", + "title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客", + "snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 9 + } + ] + }, + "2": { + "time": "Thu, 14 Mar 2019 22:31:21 GMT", + "no_results": false, + "num_results": "百度为您找到相关结果约31,900,000个", + "results": [ + { + "link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q", + "title": "cat_百度百科", + "snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...", + "visible_link": "百度百科 - 百度快照", + "rank": 10 + }, + { + "link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy", + "title": "Cat | 亚太区 | Caterpillar", + "snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...", + "visible_link": "https://www.cat.com/zh_...html  - 百度快照 - 36条评价", + "rank": 11 + }, + { + "link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku", + "title": "Cat | global-selector | Caterpillar", + "snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...", + "visible_link": "https://www.cat.com/  - 百度快照 - 36条评价 - 翻译此页", + "rank": 12 + }, + { + "link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY", + "title": "CAT - 京东", + "snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息", + "visible_link": "京东 - 百度快照", + "rank": 13 + }, + { + "link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi", + "title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客", + "snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 14 + }, + { + "link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_", + "title": "Linux cat命令 | 菜鸟教程", + "snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...", + "visible_link": "www.runoob.com/linux/l...  - 百度快照", + "rank": 15 + }, + { + "link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u", + "title": "cat /proc/media-mem - 留点什么... - CSDN博客", + "snippet": "2018年10月27日 - ", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 16 + }, + { + "link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB", + "title": "CAT", + "snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...", + "visible_link": "www.neuro.uni-jena.de/...  - 百度快照 - 翻译此页", + "rank": 17 + }, + { + "link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e", + "title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客", + "snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...", + "visible_link": "CSDN博客号 - 百度快照", + "rank": 18 + } + ] + } + } +} \ No newline at end of file diff --git a/package.json b/package.json index 877136b..af41d21 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "se-scraper", - "version": "1.2.11", - "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", + "version": "1.2.13", + "description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu", "homepage": "https://scrapeulous.com/", "main": "index.js", "scripts": { diff --git a/run.js b/run.js index e978485..095ac08 100644 --- a/run.js +++ b/run.js @@ -7,9 +7,9 @@ let config = { random_user_agent: true, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. - sleep_range: '[1,2]', + sleep_range: '', // which search engine to scrape - search_engine: 'amazon', + search_engine: 'baidu', // whether debug information should be printed // debug info is useful for developers when debugging debug: false, @@ -17,7 +17,7 @@ let config = { // this output is informational verbose: true, // an array of keywords to scrape - keywords: ['drone', 'smartphone'], + keywords: ['cat', 'mouse'], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: '', // the number of pages to scrape for each keyword @@ -25,7 +25,7 @@ let config = { // whether to start the browser in headless mode headless: false, // path to output file, data will be stored in JSON - output_file: 'examples/results/amazon.json', + output_file: 'examples/results/baidu.json', // whether to prevent images, css, fonts from being loaded // will speed up scraping a great deal block_assets: false, diff --git a/src/modules/baidu.js b/src/modules/baidu.js index 02ed3ce..8772dce 100644 --- a/src/modules/baidu.js +++ b/src/modules/baidu.js @@ -2,80 +2,77 @@ const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); class BaiduScraper extends Scraper { - parse(html) { - // load the page source into cheerio - const $ = cheerio.load(html); + parse(html) { + // load the page source into cheerio + const $ = cheerio.load(html); - // perform queries - const results = []; - $('#content_left .result').each((i, link) => { - results.push({ - link: $(link).find('h3 a').attr('href'), - title: $(link).find('h3').text(), - snippet: $(link).find('.c-abstract').text(), - visible_link: $(link).find('.f13').text(), - }) - }); + // perform queries + const results = []; + $('#content_left .result').each((i, link) => { + results.push({ + link: $(link).find('h3 a').attr('href'), + title: $(link).find('h3').text(), + snippet: $(link).find('.c-abstract').text(), + visible_link: $(link).find('.f13').text(), + }) + }); - const cleaned = []; - for (var i=0; i < results.length; i++) { - let res = results[i]; - if (res.link && res.link.trim()) { - res.rank = this.result_rank++; - cleaned.push(res); - } - } + const cleaned = []; + for (var i=0; i < results.length; i++) { + let res = results[i]; + if (res.link && res.link.trim()) { + res.rank = this.result_rank++; + cleaned.push(res); + } + } - return { - time: (new Date()).toUTCString(), - no_results: false, - num_results: $('.nums_text').text(), - results: cleaned, - } - } + return { + time: (new Date()).toUTCString(), + no_results: false, + num_results: $('.nums_text').text(), + results: cleaned, + } + } - async load_start_page() { + async load_start_page() { let startUrl = this.build_start_url('https://www.baidu.com/s?') || 'https://www.baidu.com/'; - try { - await this.page.goto(startUrl); - await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 }); - } catch (e) { - return false; - } - return true; - } + try { + await this.page.goto(startUrl); + await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 }); + } catch (e) { + return false; + } + return true; + } - async search_keyword(keyword) { - const input = await this.page.$('input[name="wd"]'); - // overwrites last text in input - await input.click({ clickCount: 3 }); - await input.type(keyword); - await input.focus(); - await this.page.keyboard.press("Enter"); - } + async search_keyword(keyword) { + const input = await this.page.$('input[name="wd"]'); + await this.set_input_value(`input[name="wd"]`, keyword); + await this.sleep(50); + await input.focus(); + await this.page.keyboard.press("Enter"); + } - async next_page() { - let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); - if (!next_page_link) { - return false; - } - await next_page_link.click(); - await this.page.waitForNavigation(); + async next_page() { + let next_page_link = await this.page.$('#page .n', {timeout: 5000}); + if (!next_page_link) { + return false; + } + await next_page_link.click(); - return true; - } + return true; + } - async wait_for_results() { - // TODO: very very bad, but nobody uses baidu, or does someone? - await this.sleep(2000); - } + async wait_for_results() { + await this.page.waitForSelector('#content_left .result', { timeout: 5000 }); + } - async detected() { - } + async detected() { + } } module.exports = { - BaiduScraper: BaiduScraper, + BaiduScraper: BaiduScraper, }; \ No newline at end of file diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index d5f8a66..672ee7c 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -206,7 +206,7 @@ module.exports = class Scraper { console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`); if (await this.detected() === true) { - console.error(`${this.config.search_engine} DETECTED the scraping!`); + console.error(`${this.config.search_engine} detected the scraping!`); if (this.config.is_local === true) { await this.sleep(this.SOLVE_CAPTCHA_TIME); diff --git a/test/test_baidu.js b/test/test_baidu.js new file mode 100644 index 0000000..c82a221 --- /dev/null +++ b/test/test_baidu.js @@ -0,0 +1,90 @@ +const se_scraper = require('./../index.js'); +var assert = require('chai').assert; + +/* + * Use chai and mocha for tests. + * https://mochajs.org/#installation + */ + +const normal_search_keywords = ['mouse', 'cat']; + +async function normal_search_test() { + let config = { + search_engine: 'baidu', + compress: false, + debug: false, + verbose: false, + keywords: normal_search_keywords, + keyword_file: '', + num_pages: 2, + headless: true, + block_assets: true, + user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', + }; + + console.log('normal_search_test()'); + await se_scraper.scrape(config, normal_search_test_case); +} + +// we test with a callback function to our handler +function normal_search_test_case(err, response) { + + if (err) { + console.error(err); + } else { + assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json'); + assert.equal(response.statusCode, 200, 'status code must be 200'); + assert.equal(response.metadata.num_requests, 4); + + for (let query in response.results) { + let total_rank = 1; + + assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.'); + + for (let page_number in response.results[query]) { + + assert.isNumber(parseInt(page_number), 'page_number must be numeric'); + + let obj = response.results[query][page_number]; + + assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object'); + + assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects'); + assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date'); + + assert.typeOf(obj.num_results, 'string', 'num_results must be a string'); + assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars'); + + assert.equal(obj.no_results, false, 'no results should be false'); + + for (let res of obj.results) { + + assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object'); + + assert.isOk(res.link, 'link must be ok'); + assert.typeOf(res.link, 'string', 'link must be string'); + assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars'); + + assert.isOk(res.visible_link, 'visible_link must be ok'); + assert.typeOf(res.visible_link, 'string', 'visible_link must be string'); + assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars'); + + assert.isOk(res.title, 'title must be ok'); + assert.typeOf(res.title, 'string', 'title must be string'); + assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars'); + + assert.isOk(res.snippet, 'snippet must be ok'); + assert.typeOf(res.snippet, 'string', 'snippet must be string'); + assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars'); + + assert.isNumber(res.rank, 'rank must be integer'); + assert.equal(res.rank, total_rank++, 'rank ist wrong'); + } + } + } + } +} + +(async () => { + await normal_search_test(); +})(); \ No newline at end of file