forked from extern/se-scraper
fixed #11 by improving baidu a lot in speed and quality
This commit is contained in:
parent
51d617442d
commit
7a8c6f13f0
@ -118,6 +118,7 @@ This will scrape with **three** browser instance each having their own IP addres
|
||||
## Examples
|
||||
|
||||
* [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json)
|
||||
* [Simple example scraping baidu](examples/baidu.js) yields [these results](examples/results/baidu.json)
|
||||
* [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json)
|
||||
* [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json)
|
||||
* [Scrape two keywords on Amazon](examples/amazon.js) produces [this](examples/results/amazon.json)
|
||||
|
17
examples/baidu.js
Normal file
17
examples/baidu.js
Normal file
@ -0,0 +1,17 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
|
||||
// Scrape configuration for the Baidu example.
// The binding is never reassigned, hence `const`.
const config = {
    // search engine to scrape
    search_engine: 'baidu',
    // no debug or informational output
    debug: false,
    verbose: false,
    // keywords to search for
    keywords: ['cat', 'mouse'],
    // number of result pages to scrape per keyword
    num_pages: 2,
    // path the results are stored at (JSON)
    output_file: 'examples/results/baidu.json',
};
|
||||
|
||||
/**
 * Node-style handler for the scrape outcome.
 *
 * @param {*} err - truthy when the scrape failed; it is logged to stderr
 * @param {Object} response - scrape response, printed in full when there is no error
 */
function callback(err, response) {
    if (err) {
        console.error(err);
        // do not fall through and dump a response for a failed scrape
        return;
    }
    // depth: null prints the nested results object without truncation
    console.dir(response, {depth: null, colors: true});
}
|
||||
|
||||
// Kick off the scrape with the config above; `callback` is passed as the (err, response) handler.
se_scraper.scrape(config, callback);
|
286
examples/results/baidu.json
Normal file
286
examples/results/baidu.json
Normal file
@ -0,0 +1,286 @@
|
||||
{
|
||||
"cat": {
|
||||
"1": {
|
||||
"time": "Thu, 14 Mar 2019 22:31:20 GMT",
|
||||
"no_results": false,
|
||||
"num_results": "百度为您找到相关结果约31,900,000个",
|
||||
"results": [
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q",
|
||||
"title": "cat_百度百科",
|
||||
"snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...",
|
||||
"visible_link": "百度百科 - 百度快照",
|
||||
"rank": 1
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy",
|
||||
"title": "Cat | 亚太区 | Caterpillar",
|
||||
"snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...",
|
||||
"visible_link": "https://www.cat.com/zh_...html - 百度快照 - 36条评价",
|
||||
"rank": 2
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku",
|
||||
"title": "Cat | global-selector | Caterpillar",
|
||||
"snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...",
|
||||
"visible_link": "https://www.cat.com/ - 百度快照 - 36条评价 - 翻译此页",
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY",
|
||||
"title": "CAT - 京东",
|
||||
"snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息",
|
||||
"visible_link": "京东 - 百度快照",
|
||||
"rank": 4
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi",
|
||||
"title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客",
|
||||
"snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 5
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_",
|
||||
"title": "Linux cat命令 | 菜鸟教程",
|
||||
"snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...",
|
||||
"visible_link": "www.runoob.com/linux/l... - 百度快照",
|
||||
"rank": 6
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u",
|
||||
"title": "cat /proc/media-mem - 留点什么... - CSDN博客",
|
||||
"snippet": "2018年10月27日 - ",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 7
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB",
|
||||
"title": "CAT",
|
||||
"snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...",
|
||||
"visible_link": "www.neuro.uni-jena.de/... - 百度快照 - 翻译此页",
|
||||
"rank": 8
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e",
|
||||
"title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客",
|
||||
"snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 9
|
||||
}
|
||||
]
|
||||
},
|
||||
"2": {
|
||||
"time": "Thu, 14 Mar 2019 22:31:20 GMT",
|
||||
"no_results": false,
|
||||
"num_results": "百度为您找到相关结果约31,900,000个",
|
||||
"results": [
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q",
|
||||
"title": "cat_百度百科",
|
||||
"snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...",
|
||||
"visible_link": "百度百科 - 百度快照",
|
||||
"rank": 10
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy",
|
||||
"title": "Cat | 亚太区 | Caterpillar",
|
||||
"snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...",
|
||||
"visible_link": "https://www.cat.com/zh_...html - 百度快照 - 36条评价",
|
||||
"rank": 11
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku",
|
||||
"title": "Cat | global-selector | Caterpillar",
|
||||
"snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...",
|
||||
"visible_link": "https://www.cat.com/ - 百度快照 - 36条评价 - 翻译此页",
|
||||
"rank": 12
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY",
|
||||
"title": "CAT - 京东",
|
||||
"snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息",
|
||||
"visible_link": "京东 - 百度快照",
|
||||
"rank": 13
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi",
|
||||
"title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客",
|
||||
"snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 14
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_",
|
||||
"title": "Linux cat命令 | 菜鸟教程",
|
||||
"snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...",
|
||||
"visible_link": "www.runoob.com/linux/l... - 百度快照",
|
||||
"rank": 15
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u",
|
||||
"title": "cat /proc/media-mem - 留点什么... - CSDN博客",
|
||||
"snippet": "2018年10月27日 - ",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 16
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB",
|
||||
"title": "CAT",
|
||||
"snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...",
|
||||
"visible_link": "www.neuro.uni-jena.de/... - 百度快照 - 翻译此页",
|
||||
"rank": 17
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e",
|
||||
"title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客",
|
||||
"snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 18
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"mouse": {
|
||||
"1": {
|
||||
"time": "Thu, 14 Mar 2019 22:31:21 GMT",
|
||||
"no_results": false,
|
||||
"num_results": "百度为您找到相关结果约31,900,000个",
|
||||
"results": [
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q",
|
||||
"title": "cat_百度百科",
|
||||
"snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...",
|
||||
"visible_link": "百度百科 - 百度快照",
|
||||
"rank": 1
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy",
|
||||
"title": "Cat | 亚太区 | Caterpillar",
|
||||
"snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...",
|
||||
"visible_link": "https://www.cat.com/zh_...html - 百度快照 - 36条评价",
|
||||
"rank": 2
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku",
|
||||
"title": "Cat | global-selector | Caterpillar",
|
||||
"snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...",
|
||||
"visible_link": "https://www.cat.com/ - 百度快照 - 36条评价 - 翻译此页",
|
||||
"rank": 3
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY",
|
||||
"title": "CAT - 京东",
|
||||
"snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息",
|
||||
"visible_link": "京东 - 百度快照",
|
||||
"rank": 4
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi",
|
||||
"title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客",
|
||||
"snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 5
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_",
|
||||
"title": "Linux cat命令 | 菜鸟教程",
|
||||
"snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...",
|
||||
"visible_link": "www.runoob.com/linux/l... - 百度快照",
|
||||
"rank": 6
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u",
|
||||
"title": "cat /proc/media-mem - 留点什么... - CSDN博客",
|
||||
"snippet": "2018年10月27日 - ",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 7
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB",
|
||||
"title": "CAT",
|
||||
"snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...",
|
||||
"visible_link": "www.neuro.uni-jena.de/... - 百度快照 - 翻译此页",
|
||||
"rank": 8
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e",
|
||||
"title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客",
|
||||
"snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 9
|
||||
}
|
||||
]
|
||||
},
|
||||
"2": {
|
||||
"time": "Thu, 14 Mar 2019 22:31:21 GMT",
|
||||
"no_results": false,
|
||||
"num_results": "百度为您找到相关结果约31,900,000个",
|
||||
"results": [
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=1DLhRKEktA3-C9-w42iT8RFUwtEhrZNVdmrAdADKa4ZrPb2Q3blQieFN8-5olh5Pe5fJ6okkR2qP6FquPRTT1q",
|
||||
"title": "cat_百度百科",
|
||||
"snippet": "2017年7月30日 - CAT鞋也叫catfootwear。公司成立于1904年,出产工业制造工具和全世界闻名的CAT品牌各类休闲衣服与鞋业。CAT制造...",
|
||||
"visible_link": "百度百科 - 百度快照",
|
||||
"rank": 10
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=PTfE2yMxRXiCKsbuJoQXw9cMFGlyNsrNBNvUAnymH0SNCaANBl5lXN56yhp2sTzy",
|
||||
"title": "Cat | 亚太区 | Caterpillar",
|
||||
"snippet": "CAT 实干成就梦想。卡特彼勒,全球实干家的强大伙伴。欢迎访问Cat (卡特) 官网,产品和服务价格查询中心。Cat是...",
|
||||
"visible_link": "https://www.cat.com/zh_...html - 百度快照 - 36条评价",
|
||||
"rank": 11
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=cOW3sgfQXtGfCpcSaB2Gu7ln6wAoMfcmOP4oqDJM3Ku",
|
||||
"title": "Cat | global-selector | Caterpillar",
|
||||
"snippet": "global-selector Caterpillar Worldwide Genuine enabler of sustainable world progress and opportunity, ...",
|
||||
"visible_link": "https://www.cat.com/ - 百度快照 - 36条评价 - 翻译此页",
|
||||
"rank": 12
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=a81Bbgu0TGU9hKwb8RF20hF_kfEN6vdl0FRXwhA20kCfbvCj2wBhWXbnjJfCKUeY",
|
||||
"title": "CAT - 京东",
|
||||
"snippet": "京东JD.COM是国内专业的网上购物商城,为您提供CAT价格、CAT评论、CAT导购、CAT图片等相关信息",
|
||||
"visible_link": "京东 - 百度快照",
|
||||
"rank": 13
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=m1xeiaTFWm-RBFn2t5720iS7Jsn26kr88v2I_vqrVEABcOGGTU6lhANim8WbQrBwQoi7-2xVVh31RefgAHtgj--ANFzKtX6UwO2LqFQ2Gdi",
|
||||
"title": "大众点评CAT简介 - stone_tomcate的博客 - CSDN博客",
|
||||
"snippet": "2018年5月17日 - CAT简介 CAT(Central Application Tracking),是基于纯Java开发的分布式实时监控系统。开源代码托管在GitHub(搜索CAT即可),作者是吴其敏(qmwu2000,目前...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 14
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=5LCk_4r6s5pCnIXPgCo4tb2qMv6SIYpb5c_JPpgHsTUmaHf_fILjZBJNiylrWLqf360uyJClnizYA6huD9ayB_",
|
||||
"title": "Linux cat命令 | 菜鸟教程",
|
||||
"snippet": "2019年3月6日 - Linux cat命令 Linux 命令大全 命令:cat cat 命令用于连接文件并打印到标准输出设备上。 使用权限 所有使用者 语法格式 cat [-AbeEnstTuv] [--help] [...",
|
||||
"visible_link": "www.runoob.com/linux/l... - 百度快照",
|
||||
"rank": 15
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=fvGSBq5NeKyy7mgnEN4MdmJBkO0bMl7mtZQ8xkcBNWsmcBfbcCo0F_DquyXuP50isHBbkpUFFyT3Ta3q1eCkGCNoLKZchTxfx3Q9a-6zc8u",
|
||||
"title": "cat /proc/media-mem - 留点什么... - CSDN博客",
|
||||
"snippet": "2018年10月27日 - ",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 16
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=9QjsK2jX7fdw5RtwSwgOLE_UIP5DoeRyc-pkmgLA5BJglG_MWMsBtsVgM6USZymB",
|
||||
"title": "CAT",
|
||||
"snippet": "CAT12 additionally includes the estimation of the cortical thickness and central surface of the left and right hemispheres based on the projection-based thick...",
|
||||
"visible_link": "www.neuro.uni-jena.de/... - 百度快照 - 翻译此页",
|
||||
"rank": 17
|
||||
},
|
||||
{
|
||||
"link": "http://www.baidu.com/link?url=uOwWor1n93n5v5WJTorjdMIJ--R283kBnCAKD3eG9xre8uD5_P57LHhnNMZAXmPuMjTOnSX4Y8QE7md8zBSMoGOBT7dHKeAYCkE-OofEq0e",
|
||||
"title": "深入详解美团点评CAT跨语言服务监控(一) CAT简介与部署..._CSDN博客",
|
||||
"snippet": "2018年7月2日 - 前言: CAT是一个实时和接近全量的监控系统,它侧重于对Java应用的监控,除了与点评RPC组件融合的很好之外,他将会能与Spring、MyBatis、Dubbo 等框架以及...",
|
||||
"visible_link": "CSDN博客号 - 百度快照",
|
||||
"rank": 18
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.2.11",
|
||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||
"version": "1.2.13",
|
||||
"description": "A simple module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
|
8
run.js
8
run.js
@ -7,9 +7,9 @@ let config = {
|
||||
random_user_agent: true,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,2]',
|
||||
sleep_range: '',
|
||||
// which search engine to scrape
|
||||
search_engine: 'amazon',
|
||||
search_engine: 'baidu',
|
||||
// whether debug information should be printed
|
||||
// debug info is useful for developers when debugging
|
||||
debug: false,
|
||||
@ -17,7 +17,7 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: true,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['drone', 'smartphone'],
|
||||
keywords: ['cat', 'mouse'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// the number of pages to scrape for each keyword
|
||||
@ -25,7 +25,7 @@ let config = {
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'examples/results/amazon.json',
|
||||
output_file: 'examples/results/baidu.json',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: false,
|
||||
|
@ -2,80 +2,77 @@ const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
/**
 * Scraper implementation for Baidu result pages.
 *
 * Relies on members provided by the base Scraper that are not defined here:
 * `this.page`, `this.result_rank`, `this.build_start_url()`,
 * `this.set_input_value()` and `this.sleep()`.
 */
class BaiduScraper extends Scraper {

    /**
     * Extract the search results from the HTML of a Baidu result page.
     *
     * @param {string} html - page source
     * @returns {{time: string, no_results: boolean, num_results: string, results: Array<Object>}}
     *     `results` holds one object per result with `link`, `title`,
     *     `snippet`, `visible_link` and `rank`.
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        const cleaned = [];
        $('#content_left .result').each((i, element) => {
            const entry = $(element);
            const link = entry.find('h3 a').attr('href');

            // entries without a usable link are dropped and consume no rank
            if (!link || !link.trim()) {
                return;
            }

            cleaned.push({
                link,
                title: entry.find('h3').text(),
                snippet: entry.find('.c-abstract').text(),
                visible_link: entry.find('.f13').text(),
                // result_rank lives on the instance, so ranks continue across calls
                rank: this.result_rank++,
            });
        });

        return {
            time: (new Date()).toUTCString(),
            // NOTE(review): always false, even when no result was extracted — confirm intended
            no_results: false,
            num_results: $('.nums_text').text(),
            results: cleaned,
        };
    }

    /**
     * Open the start page and wait for the search input to appear.
     *
     * @returns {Promise<boolean>} false when navigation fails or the search
     *     input does not show up within 5 seconds, true otherwise
     */
    async load_start_page() {
        // fall back to the plain homepage when no start url is built
        const startUrl = this.build_start_url('https://www.baidu.com/s?') || 'https://www.baidu.com/';

        try {
            await this.page.goto(startUrl);
            await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
        } catch (e) {
            // failure is reported through the return value
            return false;
        }

        return true;
    }

    /**
     * Put the keyword into the search box and submit it with Enter.
     *
     * @param {string} keyword - search term
     * @throws {Error} when the search input is not present on the page
     */
    async search_keyword(keyword) {
        const selector = 'input[name="wd"]';
        const input = await this.page.$(selector);
        if (!input) {
            // page.$ resolves to null for a missing element; fail with a clear message
            throw new Error(`baidu search input ${selector} not found`);
        }

        await this.set_input_value(selector, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click the pagination link to move to the next result page.
     *
     * @returns {Promise<boolean>} false when no pagination link was found
     */
    async next_page() {
        // page.$ takes only a selector; the former `{timeout: 5000}` argument had no effect.
        // NOTE(review): presumably '#page .n' can also match a "previous page"
        // link on later pages — verify against the live markup.
        const next_page_link = await this.page.$('#page .n');
        if (!next_page_link) {
            return false;
        }

        await next_page_link.click();

        return true;
    }

    /**
     * Wait up to 5 seconds until a result entry is present.
     *
     * NOTE(review): the selector also matches entries left over from the
     * previous page or keyword, so this can resolve before new results have
     * arrived. The sample output in examples/results/baidu.json lists the
     * same results for every page and keyword — confirm and consider waiting
     * for a content change instead.
     */
    async wait_for_results() {
        await this.page.waitForSelector('#content_left .result', { timeout: 5000 });
    }

    /**
     * Detection check; not implemented for Baidu, resolves to undefined.
     */
    async detected() {
    }
}
|
||||
|
||||
// Public interface of this module: the Baidu scraper class.
module.exports = { BaiduScraper };
|
@ -206,7 +206,7 @@ module.exports = class Scraper {
|
||||
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
|
||||
|
||||
if (await this.detected() === true) {
|
||||
console.error(`${this.config.search_engine} DETECTED the scraping!`);
|
||||
console.error(`${this.config.search_engine} detected the scraping!`);
|
||||
|
||||
if (this.config.is_local === true) {
|
||||
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
||||
|
90
test/test_baidu.js
Normal file
90
test/test_baidu.js
Normal file
@ -0,0 +1,90 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
var assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
* Use chai and mocha for tests.
|
||||
* https://mochajs.org/#installation
|
||||
*/
|
||||
|
||||
// Keywords scraped by the test; the response must contain a results entry for each of them.
const normal_search_keywords = ['mouse', 'cat'];
|
||||
|
||||
/**
 * Scrape two result pages per test keyword from baidu and pass the outcome
 * to normal_search_test_case for validation.
 */
async function normal_search_test() {
    const scrape_config = {
        search_engine: 'baidu',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: true,
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    };

    console.log('normal_search_test()');
    await se_scraper.scrape(scrape_config, normal_search_test_case);
}
|
||||
|
||||
// we test with a callback function to our handler
|
||||
/**
 * Callback handed to se_scraper.scrape(): validates the structure of the
 * baidu scrape response with chai assertions.
 *
 * @param {*} err - truthy when the scrape failed; fails the test
 * @param {Object} response - scrape response; `headers`, `statusCode`,
 *     `metadata.num_requests` and `results` (keyword -> page -> SERP object)
 *     are checked
 */
function normal_search_test_case(err, response) {

    if (err) {
        console.error(err);
        // a failed scrape must fail the test instead of passing silently
        assert.fail(`scraping baidu failed: ${err}`);
    }

    assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
    assert.equal(response.statusCode, 200, 'status code must be 200');
    // 2 keywords with 2 pages each
    assert.equal(response.metadata.num_requests, 4);

    // checked once up front so that an empty results object cannot skip it
    assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

    for (const query in response.results) {
        // ranks are expected to count up across the pages of one keyword
        let total_rank = 1;

        for (const page_number in response.results[query]) {

            // NaN is of type number, so test for NaN explicitly
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');

            const obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'num_results', 'no_results'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
            // Date.parse yields NaN for an unparsable string
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');

            assert.equal(obj.no_results, false, 'no results should be false');

            for (const res of obj.results) {

                assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                assert.isOk(res.link, 'link must be ok');
                assert.typeOf(res.link, 'string', 'link must be string');
                assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                assert.isOk(res.visible_link, 'visible_link must be ok');
                assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                assert.isNumber(res.rank, 'rank must be integer');
                assert.equal(res.rank, total_rank++, 'rank ist wrong');
            }
        }
    }
}
|
||||
|
||||
// Entry point: run the baidu test. A rejection is logged and reflected in the
// exit code instead of being left as an unhandled promise rejection.
normal_search_test().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
Loading…
Reference in New Issue
Block a user