Compare commits

...

106 Commits

Author SHA1 Message Date
5a0eea201d Merge branch 'master' of github.com:NikolaiT/se-scraper
branchy
2020-05-17 22:06:57 +02:00
0278b24f0d ad 2020-05-17 22:06:33 +02:00
33fa371716 Merge pull request #62 from aularon/patch-1
Take screenshot before modifying HTML
2020-02-13 20:04:20 +01:00
6b806dedfe Merge pull request #61 from Monibrand/refactor/use-original-puppeteer-cluster
Refactor/use original puppeteer cluster
2020-02-13 20:03:39 +01:00
5633b10e50 Merge pull request #60 from Monibrand/fix/unusable-proxy-file-option
fix(scrape-manager): proxy_file options can't be used with proxies default value
2020-02-13 20:02:55 +01:00
c58d4fa74d fix(proxy): throw on use_proxies_only if no proxies given 2020-01-17 15:55:17 +01:00
4f467abf1e fix(scrape-manager): keywords propagated through a clone config for not being re-affected 2020-01-17 15:12:00 +01:00
89dc5c3ebb fix(scrape-manager): conflict between proxies and user_agent option 2020-01-17 12:07:12 +01:00
4b33ef9b19 fix(duckduckgo): extract correct amount of results, handle pagination 2020-01-15 16:35:16 +01:00
28332528ea test(duckduckgo): implement tests for duckduckgo module 2020-01-15 16:33:30 +01:00
b685fb4def test: working test for html_output 2020-01-10 09:51:54 +01:00
394b567db6 test: add user_agent tests, add html_output tests 2020-01-10 09:35:24 +01:00
cac6b87e92 test: Bing tests working, refactor proxy for tests 2020-01-08 14:40:28 +01:00
1c1db88545 test: add config proxy options tests 2020-01-07 16:50:09 +01:00
8f6317cea7 style: add debug trace on some file 2020-01-07 16:47:09 +01:00
f192e4ebb4 test: remove legacy tests 2020-01-07 16:43:17 +01:00
3ab8e46126 test: add bing module test 2020-01-07 09:48:46 +01:00
392c43390e test(google): add real integration/unit tests for google module 2020-01-03 19:21:34 +01:00
77c1bb8372 Take screenshot before modifying HTML
Otherwise the screenshot will be very messed up
2020-01-03 11:12:40 +02:00
8f40057534 refactor(cluster): use custom concurrency for puppeteer-cluster 2019-12-20 19:44:59 +01:00
301695cd2b fix(scrape-manager): proxy_file options can be used with proxies default value 2019-12-20 19:35:23 +01:00
d362e4ae2c Merge pull request #59 from TDenoncin/refactor/logging
Refactor logging
2019-12-20 14:42:09 +01:00
bcd181111b refactor(log): remove common.js, use winston and debug 2019-12-15 17:56:22 +01:00
b4a86fcc51 refactor(proxy): remove proxy option not working replace by proxies 2019-12-13 18:02:22 +01:00
9e6a555663 Merge pull request #52 from kujaomega/master
Added post install script to build the puppeteer-cluster, and also ad…
2019-12-01 22:15:39 +01:00
ca9f5f7f50 Added post install script to build the puppeteer-cluster, and also added the updated dependencies from puppeteer-cluster 2019-11-22 00:37:29 +01:00
1694ee92d0 updated to puppeteeer 2.0 2019-11-08 16:21:16 +01:00
da69913272 added detected status to metadata 2019-10-06 15:34:18 +02:00
4a3a0e6fd4 better pluggable api 2019-10-05 19:39:33 +02:00
4953d9da7a chaned version 2019-09-23 23:39:06 +02:00
5e47c27c70 too late to find a proper commit description 2019-09-23 23:38:38 +02:00
95a5ee56d8 remove cheerio from parsing 2019-09-23 21:57:13 +02:00
52a2ec7b33 changed README 2019-09-23 16:50:57 +02:00
07f3dceba1 fixed google SERP title, better docker support 2019-09-23 16:46:22 +02:00
b25f7a4285 added test to my working tree 2019-09-13 18:28:19 +02:00
4b581bd03f removed static tests because they are too larege 2019-09-13 18:21:17 +02:00
21378dab02 removed some search engines, added tests for existing, added yandex search engines 2019-09-13 16:15:33 +02:00
77d6c4f04a removed some stuff 2019-09-12 10:43:57 +02:00
b513bb0f5b Merge branch 'master' of github.com:NikolaiT/se-scraper
server in dockerfile was changed
2019-09-04 12:28:05 +02:00
855a874f9e some minor changes 2019-09-04 12:27:53 +02:00
dde1711d9d Merge pull request #45 from slotix/master
add process supervisor for starting server.js
2019-08-29 20:41:42 +02:00
7ba7ee9226 add process supervisor for starting server.js 2019-08-19 14:01:37 +02:00
e661241f6f added some parsing to google 2019-08-16 20:10:40 +02:00
98414259fe docker support added 2019-08-13 17:35:06 +02:00
19a172c654 better tests 2019-08-13 15:28:30 +02:00
0f7e89c272 added little bug in cleaning 2019-08-12 17:16:37 +02:00
ca941cee45 added static bing test, added html cleaning when exporting html 2019-08-12 16:05:17 +02:00
4c77aeba76 Merge pull request #42 from TDenoncin/error-management
Clean integration tests with mocha
2019-08-12 00:04:40 +02:00
0427d9f915 Merge branch 'master' into error-management 2019-08-12 00:04:27 +02:00
87fcdd35d5 readme in static tests 2019-08-12 00:01:02 +02:00
4ca50ab2b9 added new static test case that runs much faster and tests a lot of behavior 2019-08-11 23:58:10 +02:00
8e629f6266 Merge pull request #41 from victor9000/master
Fix broken Google News selectors, fixes #40
2019-08-08 21:57:14 +02:00
a369bd07f9 Add "use strict" to ensure quality code control 2019-08-06 12:18:51 +02:00
dde2b14fc0 Remove uneeded try catch block in Google Search module 2019-08-06 11:50:08 +02:00
0db6e068da Remove uneeded try catch block
Add proper error for ip matching test
2019-08-06 11:46:53 +02:00
50bda275a6 Clean integration tests for mocha 2019-08-05 17:01:48 +02:00
a61fade2c9 Fix broken Google News selectors, fixes #40 2019-08-04 14:43:02 -07:00
78fe12390b better user agents now, added option to include screenshots as base64 in results 2019-07-18 20:19:15 +02:00
fcbe66b56b using random user agents now from https://github.com/intoli/user-agents 2019-07-18 19:34:09 +02:00
59154694f2 fixed issue https://github.com/NikolaiT/se-scraper/issues/37 2019-07-18 19:14:33 +02:00
60a9d52924 add fucking google product information 2019-07-11 19:23:40 +02:00
1fc7f0d1c8 fixed a badboy 2019-07-11 16:54:32 +02:00
baaff5824e ... 2019-07-11 16:43:41 +02:00
dab25f9068 added google shopping results 2019-07-11 16:42:01 +02:00
a413cb54ef parsing ads works for duckduckgo, google, bing. tested. 2019-07-07 19:38:28 +02:00
bbebe3ce60 parsing ads is supported now for google, bing and duckduckgo 2019-07-06 21:42:13 +02:00
09c1255400 removed some superflous stuff 2019-07-02 18:04:01 +02:00
5e8ff1cb34 Merge branch 'master' of https://github.com/NikolaiT/se-scraper 2019-06-29 17:01:25 +02:00
c1a036e8da removed some stuff 2019-06-29 17:00:50 +02:00
d1e9b21269 added google maps scraper 2019-06-29 17:00:19 +02:00
593f3a95e5 Merge pull request #33 from TDenoncin/add-html-output-rework
Add html output option
2019-06-26 15:38:38 +02:00
d9ac9f4162 Add test for html_output, refactor the results return 2019-06-26 12:03:42 +02:00
a0e63aa4b0 Use bing_setting.bing_domain if defined for startUrl 2019-06-25 17:16:17 +02:00
a3ebe357a4 Add html_output fonctionality
Pagination support for html output
Change return value to keep it compliant to the current version of se-scrapper
2019-06-25 17:02:34 +02:00
0d7f6dcd11 worked on issue #31 2019-06-18 22:23:52 +02:00
80d23a9d57 users may pass their own user agents, different browsers have random user agents and not the same now 2019-06-17 21:25:45 +02:00
ebe9ba8ea9 added option to throw on detection 2019-06-17 15:02:44 +02:00
caa93df3b0 random user agent fixed 2019-06-17 12:01:13 +02:00
0c9f353cb2 remove hardcoded sleep() in Google Image 2019-06-17 00:03:13 +02:00
43d5732de7 resolved issue #30, custom scrapers now possible. new npm version 2019-06-13 12:34:39 +02:00
06d500f75c . 2019-06-12 21:25:40 +02:00
784e887787 fixed issue #22 2019-06-12 21:25:20 +02:00
db5fbb23d2 removed unnecessary sleeping times 2019-06-12 18:14:49 +02:00
5bf7c94b9a new version 2019-06-11 22:01:27 +02:00
d4d06f7d67 need to edit readme 2019-06-11 18:34:51 +02:00
35943e7449 minor stuff 2019-06-11 18:33:11 +02:00
7e06944fa1 updated README 2019-06-11 18:27:34 +02:00
6825c97790 changed api big time 2019-06-11 18:16:59 +02:00
3d69f4e249 added a proxy test script 2019-05-06 21:54:23 +02:00
1593759556 passing chrome flags directly now possible 2019-04-01 15:33:26 +02:00
775dcfa077 proxy mgmt better 2019-03-22 18:55:17 +01:00
b82c769bb1 google_news_old supports google_news_old_settings now 2019-03-20 15:28:04 +01:00
1bed9c5854 fixed issue 12 2019-03-20 11:50:43 +01:00
7a8c6f13f0 fixed #11 by improving baidu a lot in speed and quality 2019-03-14 23:33:46 +01:00
51d617442d added support for amazon 2019-03-10 20:02:42 +01:00
dd1f36076e can now parse args from string to json 2019-03-07 15:50:36 +01:00
62b3b688b4 minor fixes 2019-03-07 13:16:12 +01:00
7b52b4e62f added suport for custom query string parameters 2019-03-06 00:08:25 +01:00
7239e23cba fixed pluggable 2019-03-03 16:46:10 +01:00
8cbf37eaba minor improvements 2019-03-02 22:32:26 +01:00
abf4458e46 fixed quotes in user agent. this lead to cloudflare detecting the scraper. very bad. 2019-03-01 16:02:30 +01:00
79d32a315a fixed some errors and way better README 2019-02-28 15:34:25 +01:00
089e410ec6 support for multible browsers and proxies 2019-02-27 20:58:13 +01:00
393b9c0450 Merge pull request #8 from NikolaiT/add-license-1
Create LICENSE
2019-02-08 00:58:27 +01:00
fb3f2836e4 Create LICENSE 2019-02-08 00:58:15 +01:00
53c9ebf467 Merge pull request #7 from NikolaiT/add-code-of-conduct-1
Create CODE_OF_CONDUCT.md
2019-02-08 00:54:28 +01:00
71 changed files with 12955 additions and 2901 deletions

18
.gitignore vendored

@ -1,3 +1,19 @@
# ignore static tests
test/static_tests/html/
test/static_tests/html/*
.idea
# ignore data
examples/data/
examples/data/*
examples/results/
examples/results/*
# Logs
logs
*.log
@ -63,3 +79,5 @@ typings/
.idea/
GoogleScraperPup.iml
.http-mitm-proxy

0
.gitmodules vendored Normal file

73
Dockerfile Normal file

@ -0,0 +1,73 @@
FROM node:10-slim
# Application parameters and variables
# ENV NODE_ENV=production
ENV HOST=0.0.0.0
ENV PORT=3000
ENV application_directory=/se-scraper
ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster
# Create app directory
WORKDIR $application_directory
RUN apt-get update && \
apt-get install -y \
gconf-service \
libasound2 \
libatk1.0-0 \
libc6 \
libcairo2 \
libcups2 \
libdbus-1-3 \
libexpat1 \
libfontconfig1 \
libgcc1 \
libgconf-2-4 \
libgdk-pixbuf2.0-0 \
libglib2.0-0 \
libgtk-3-0 \
libnspr4 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
libstdc++6 \
libx11-6 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxi6 \
libxrandr2 \
libxrender1 \
libxss1 \
libxtst6 \
ca-certificates \
fonts-liberation \
libappindicator1 \
libnss3 \
lsb-release \
xdg-utils \
wget
# Bundle app source
COPY . .
WORKDIR $puppeteer_cluster_directory
RUN npm install \
&& npm run build
WORKDIR $application_directory
# skip installing scripts for puppeteer dependencies
# we've already installed puppeteer above.
RUN npm install --ignore-scripts
# Cleanup
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.2/dumb-init_1.2.2_amd64 /usr/local/bin/dumb-init
RUN chmod +x /usr/local/bin/dumb-init
EXPOSE $PORT
CMD ["dumb-init", "node", "server/server.js"]

201
LICENSE Normal file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Nikolai Tschacher
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

625
README.md

@ -1,78 +1,307 @@
# Search Engine Scraper
# [The maintained successor of se-scraper is the general purpose crawling infrastructure](https://github.com/NikolaiT/Crawling-Infrastructure)
This node module supports scraping several search engines.
## Search Engine Scraper - se-scraper
Right now scraping the search engines
[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
This node module allows you to scrape search engines concurrently with different proxies.
If you don't have extensive technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/).
#### Table of Contents
- [Installation](#installation)
- [Docker](#docker-support)
- [Minimal Example](#minimal-example)
- [Quickstart](#quickstart)
- [Contribute](#contribute)
- [Using Proxies](#proxies)
- [Custom Scrapers](#custom-scrapers)
- [Examples](#examples)
- [Scraping Model](#scraping-model)
- [Technical Notes](#technical-notes)
- [Advanced Usage](#advanced-usage)
- [Special Query String Parameters for Search Engines](#query-string-parameters)
Se-scraper supports the following search engines:
* Google
* Google News
* Google News App version (https://news.google.com)
* Google Image
* Bing
* Baidu
* Youtube
* Bing News
* Infospace
* Duckduckgo
* Yandex
* Webcrawler
is supported.
This module uses puppeteer and a modified version of [puppeteer-cluster](https://github.com/thomasdondorf/puppeteer-cluster/). It was created by the Developer of [GoogleScraper](https://github.com/NikolaiT/GoogleScraper), a module with 1800 Stars on Github.
Additionally **se-scraper** supports investment ticker search from the following sites:
## Installation
* Reuters
* cnbc
* Marketwatch
You need a working installation of **node** and the **npm** package manager.
This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.
### Quickstart
**Note**: If you don't want puppeteer to download a complete chromium browser, add this variable to your environments:
For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands:
```bash
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
sudo apt update;
sudo apt install nodejs;
# recent version of npm
curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh;
sudo bash nodesource_setup.sh;
sudo apt install npm;
```
Then install with
Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-).
This command will install dependencies:
```bash
# install all that is needed by chromium browser. Maybe not everything needed
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget;
```
Install **se-scraper** by entering the following command in your terminal
```bash
npm install se-scraper
```
then create a file with the following contents and start scraping.
If you **don't** want puppeteer to download a complete chromium browser, add this variable to your environment. Then this module is not guaranteed to run out of the box.
```bash
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
```
### Docker Support
I will maintain a public docker image of se-scraper. Pull the docker image with the command:
```bash
docker pull tschachn/se-scraper
```
Confirm that the docker image was correctly pulled:
```bash
docker image ls
```
It should show something like this:
```
tschachn/se-scraper latest 897e1aeeba78 21 minutes ago 1.29GB
```
You can check the [latest tag here](https://hub.docker.com/r/tschachn/se-scraper/tags). In the example below, the latest tag is **latest**. This will most likely remain **latest** in the future.
Run the docker image and map the internal port 3000 to the external
port 3000:
```bash
$ docker run -p 3000:3000 tschachn/se-scraper:latest
Running on http://0.0.0.0:3000
```
When the image is running, you may start scrape jobs via the HTTP API:
```bash
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
-d '{
"browser_config": {
"random_user_agent": true
},
"scrape_config": {
"search_engine": "google",
"keywords": ["test"],
"num_pages": 1
}
}'
```
Many thanks go to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image.
## Minimal Example
Create a file named `minimal.js` with the following contents
```js
const se_scraper = require('se-scraper');
let config = {
search_engine: 'google',
debug: false,
verbose: false,
keywords: ['news', 'scraping scrapeulous.com'],
num_pages: 3,
output_file: 'data.json',
};
(async () => {
let scrape_job = {
search_engine: 'google',
keywords: ['lets go boys'],
num_pages: 1,
};
function callback(err, response) {
if (err) { console.error(err) }
console.dir(response, {depth: null, colors: true});
}
var results = await se_scraper.scrape({}, scrape_job);
se_scraper.scrape(config, callback);
console.dir(results, {depth: null, colors: true});
})();
```
### Technical Notes
Start scraping by firing up the command `node minimal.js`
## Quickstart
Create a file named `run.js` with the following contents
```js
const se_scraper = require('se-scraper');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/data.json',
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
num_pages: 1,
// add some cool google search settings
google_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'en', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
```
Start scraping by firing up the command `node run.js`
## Contribute
I really need and love your help! However, scraping is a dirty business and it often takes me a lot of time to find failing selectors or missing JS logic. So if any search engine does not yield the results you expect, please create a **static test case** similar to [this static test of google](test/static_tests/google.js) that fails. I will then try to correct se-scraper.
Here is how you would proceed:
1. Copy the [static google test case](test/static_tests/google.js)
2. Remove all unnecessary testing code
3. Save a search to a file where se-scraper does not work correctly.
4. Implement the static test case using the saved search HTML where se-scraper currently fails.
5. Submit a new issue with the failing test case as a pull request.
6. I will fix it! (or better: submit a pull request directly)
## Proxies
**se-scraper** will create one browser instance per proxy. The maximum concurrency is therefore equal to the number of proxies plus one (your own IP).
```js
const se_scraper = require('se-scraper');
(async () => {
let browser_config = {
debug_level: 1,
output_file: 'examples/results/proxyresults.json',
proxy_file: '/home/nikolai/.proxies', // one proxy per line
log_ip_address: true,
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();
```
With a proxy file such as
```text
socks5://53.34.23.55:55523
socks4://51.11.23.22:22222
```
This will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to scrape with a different proxy per tab; Chromium does not support that.
## Custom Scrapers
You can define your own scraper class and use it within se-scraper.
[Check this example out](examples/custom_scraper.js) that defines a custom scraper for Ecosia.
## Examples
* [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json)
* [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json)
* [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json)
* [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json)
* [Inject your own scraping logic](examples/pluggable.js)
* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)
* [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json)
## Scraping Model
**se-scraper** scrapes search engines only. In order to introduce concurrency into this library, it is necessary to define the scraping model. Then we can decide how we divide and conquer.
#### Scraping Resources
What are common scraping resources?
1. **Memory and CPU**. Necessary to launch multiple browser instances.
2. **Network Bandwidth**. This is rarely the bottleneck.
3. **IP Addresses**. Websites often block an IP address after a certain number of requests from it. This can be circumvented by using proxies.
4. **Spoofable identifiers** such as the browser fingerprint or user agents. These are handled by **se-scraper**.
#### Concurrency Model
**se-scraper** should be able to run without any concurrency at all. This is the default case. No concurrency means that only one browser/tab is searching at a time.
For concurrent use, we will make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster).
One scrape job is properly defined by
* 1 search engine such as `google`
* `M` pages
* `N` keywords/queries
* `K` proxies and `K+1` browser instances (because when we have no proxies available, we will scrape with our dedicated IP)
Then **se-scraper** will create `K+1` dedicated browser instances, each with a unique IP address. Each browser will get `N/(K+1)` keywords and will issue `N/(K+1) * M` total requests to the search engine.
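As a concrete illustration of that split (a plain arithmetic sketch, not se-scraper's actual scheduling code; the proxy strings are reused from the proxy file example above):

```js
// Illustration only: how keywords and requests are divided across K+1 browsers.
const proxies = ['socks5://53.34.23.55:55523', 'socks4://51.11.23.22:22222']; // K = 2 proxies
const keywords = ['a', 'b', 'c', 'd', 'e', 'f'];                              // N = 6 keywords
const num_pages = 2;                                                          // M = 2 pages per keyword

const browsers = proxies.length + 1;                              // K+1 = 3 (two proxies + your own IP)
const keywordsPerBrowser = Math.ceil(keywords.length / browsers); // N/(K+1) = 2
const requestsPerBrowser = keywordsPerBrowser * num_pages;        // N/(K+1) * M = 4

console.log({ browsers, keywordsPerBrowser, requestsPerBrowser });
// -> { browsers: 3, keywordsPerBrowser: 2, requestsPerBrowser: 4 }
```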
The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequently launched browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings. Right now, every browser has the same options; it's not possible to set options on a per-browser basis.
Solution:
1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678).
2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and `pop()` from this list at every new call to `workerInstance()` in https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts. I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this.**
## Technical Notes
Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
No multithreading is supported for now. Only one scraping worker per `scrape()` call.
We will soon support parallelization. **se-scraper** will support an architecture similar to:
1. https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html
2. https://docs.browserless.io/blog/2018/06/04/puppeteer-best-practices.html
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at hire@incolumitas.com
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at **hire@incolumitas.com**
The chromium browser is started with the following flags to prevent
scraping detection.
```js
var ADDITIONAL_CHROME_FLAGS = [
    '--disable-gpu',
    '--window-size=1920x1080',
    '--hide-scrollbars',
    '--disable-notifications',
];
```
Furthermore, to avoid loading unnecessary resources and to speed up
scraping a great deal, we instruct chrome to not load images and css:
scraping a great deal, we instruct chrome to not load images and css and media:
```js
await page.setRequestInterception(true);
@ -109,10 +339,11 @@ page.on('request', (req) => {
});
```
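For reference, a complete minimal version of such an interception handler looks roughly like this (a sketch using the standard puppeteer request-interception API; the exact set of resource types blocked by se-scraper's `block_assets` option may differ):

```js
await page.setRequestInterception(true);

page.on('request', (req) => {
    // abort requests for images, stylesheets, fonts and media to speed up page loads
    if (['image', 'stylesheet', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
    } else {
        req.continue();
    }
});
```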
### Making puppeteer and headless chrome undetectable
#### Making puppeteer and headless chrome undetectable
Consider the following resources:
* https://antoinevastel.com/bot%20detection/2019/07/19/detecting-chrome-headless-v3.html
* https://intoli.com/blog/making-chrome-headless-undetectable/
* https://intoli.com/blog/not-possible-to-block-chrome-headless/
* https://news.ycombinator.com/item?id=16179602
@ -136,19 +367,20 @@ let config = {
It will create a screenshot named `headless-test-result.png` in the directory where the scraper was started that shows whether all tests have passed.
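A short sketch of how that check can be enabled (assuming only the `test_evasion` flag and the `ScrapeManager` API documented elsewhere in this README; treat it as an illustration rather than a canonical recipe):

```js
const se_scraper = require('se-scraper');

(async () => {
    let browser_config = {
        // run the headless-detection check and write headless-test-result.png
        test_evasion: true,
        headless: true,
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['headless detection test'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();
    await scraper.scrape(scrape_job);
    await scraper.quit();
})();
```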
### Advanced Usage
## Advanced Usage
Use se-scraper by calling it with a script such as the one below.
Use **se-scraper** by calling it with a script such as the one below.
```js
const se_scraper = require('se-scraper');
const resolve = require('path').resolve;
let config = {
// those options need to be provided on startup
// and cannot be given to se-scraper on scrape() calls
let browser_config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
random_user_agent: false,
// whether to select manual settings in visible mode
set_manual_settings: false,
// log ip address data
@ -157,19 +389,29 @@ let config = {
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
sleep_range: '',
// which search engine to scrape
search_engine: 'google',
compress: false, // compress
debug: false,
verbose: false,
keywords: ['scrapeulous.com'],
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,
// specify flags passed to chrome here
chrome_flags: [],
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to also passthru all the html output of the serp pages
html_output: false,
// whether to return a screenshot of serp pages as b64 data
screen_output: false,
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
@ -178,223 +420,92 @@ let config = {
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
throw_on_detection: false,
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
proxy_file: '',
// whether to use proxies only
// when this is set to true, se-scraper will not use
// your default IP address
use_proxies_only: false,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// settings for puppeteer-cluster
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 1,
}
};
function callback(err, response) {
if (err) { console.error(err) }
(async () => {
// scrape config can change on each scrape() call
let scrape_config = {
// which search engine to scrape
search_engine: 'google',
// an array of keywords to scrape
keywords: ['cat', 'mouse'],
// the number of pages to scrape for each keyword
num_pages: 2,
/* response object has the following properties:
// OPTIONAL PARAMS BELOW:
google_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'fr', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
// instead of keywords you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
// path to output file, data will be stored in JSON
output_file: 'output.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: false,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
};
response.results - json object with the scraping results
response.metadata - json object with metadata information
response.statusCode - status code of the scraping process
*/
let results = await se_scraper.scrape(browser_config, scrape_config);
console.dir(results, {depth: null, colors: true});
})();
```
console.dir(response.results, {depth: null, colors: true});
[Output for the above script on my machine.](examples/results/advanced.json)
### Query String Parameters
You can add your custom query string parameters to the configuration object by specifying a `google_settings` key. In general: `{{search engine}}_settings`.
For example you can customize your google search with the following config:
```js
let scrape_config = {
search_engine: 'google',
// use specific search engine parameters for various search engines
google_settings: {
google_domain: 'google.com',
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'us', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
}
se_scraper.scrape(config, callback);
```
Supported options for the `search_engine` config key:
```javascript
'google'
'google_news_old'
'google_news'
'google_image'
'bing'
'bing_news'
'infospace'
'webcrawler'
'baidu'
'youtube'
'duckduckgo_news'
'reuters'
'cnbc'
'marketwatch'
```
Output for the above script on my machine:
```text
{ 'scraping scrapeulous.com':
{ '1':
{ time: 'Tue, 29 Jan 2019 21:39:22 GMT',
num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link: 'https://scrapeulous.com/',
title:
'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
snippet:
'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
visible_link: 'https://scrapeulous.com/',
date: '',
rank: 1 },
{ link: 'https://scrapeulous.com/about/',
title:
'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
snippet:
'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
visible_link: 'https://scrapeulous.com/about/',
date: '',
rank: 2 },
{ link: 'https://scrapeulous.com/howto/',
title:
'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
snippet:
'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
visible_link: 'https://scrapeulous.com/howto/',
date: '',
rank: 3 },
{ link: 'https://github.com/NikolaiT/se-scraper',
title:
'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
snippet:
'24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
visible_link: 'https://github.com/NikolaiT/se-scraper',
date: '24.12.2018 - ',
rank: 4 },
{ link:
'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
title:
'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
snippet:
'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
visible_link:
'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
date: '',
rank: 5 },
{ link: 'https://googlescraper.readthedocs.io/',
title:
'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
snippet:
'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
visible_link: 'https://googlescraper.readthedocs.io/',
date: '',
rank: 6 },
{ link: 'https://incolumitas.com/pages/scrapeulous/',
title:
'Coding, Learning and Business Ideas Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
snippet:
'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
visible_link: 'https://incolumitas.com/pages/scrapeulous/',
date: '',
rank: 7 },
{ link: 'https://incolumitas.com/',
title:
'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
snippet:
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
visible_link: 'https://incolumitas.com/',
date: '',
rank: 8 },
{ link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
title:
'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
snippet:
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
date: '',
rank: 9 },
{ link:
'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
title:
'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
snippet:
'23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
visible_link:
'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
date: '23.12.2018 - ',
rank: 10 } ] },
'2':
{ time: 'Tue, 29 Jan 2019 21:39:24 GMT',
num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link: 'https://pypi.org/project/CountryGoogleScraper/',
title:
'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
snippet:
'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
date: '',
rank: 1 },
{ link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
title:
'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
snippet:
'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
date: '',
rank: 3 },
{ link:
'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
title:
'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
snippet:
'24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
visible_link:
'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
date: '24.01.2015 - ',
rank: 4 },
{ link: 'https://twitter.com/incolumitas_?lang=de',
title:
'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
snippet:
'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
visible_link: 'https://twitter.com/incolumitas_?lang=de',
date: '',
rank: 5 },
{ link:
'http://blog.shodan.io/hostility-in-the-python-package-index/',
title:
'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
snippet:
'22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
date: '22.02.2015 - ',
rank: 6 },
{ link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
title:
'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
snippet:
'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
date: '',
rank: 7 },
{ link: 'https://pydigger.com/pypi/CountryGoogleScraper',
title:
'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
snippet:
'19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
date: '19.10.2016 - ',
rank: 8 },
{ link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
title:
'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
snippet:
'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
date: '',
rank: 9 },
{ link: 'https://www.revolvy.com/page/Search-engine-scraping',
title:
'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
snippet:
'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
date: '',
rank: 10 } ] } } }
```

88
TODO.md Normal file

@ -0,0 +1,88 @@
### 24.12.2018
- fix interface to scrape() [DONE]
- add to Github
### 24.1.2018
- fix issue #3: add functionality to add keyword file
### 27.1.2019
- Add functionality to block images and CSS from loading as described here:
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
### 29.1.2019
- implement proxy support functionality
- implement proxy check
- implement scraping more than 1 page
- do it for google
- and bing
- implement duckduckgo scraping
### 30.1.2019
- modify all scrapers to use the generic class where it makes sense
- Bing, Baidu, Google, Duckduckgo
### 7.2.2019
- add num_requests to test cases [done]
### 25.2.2019
- https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html
- add support for browsing with multiple browsers, use this neat library:
- https://github.com/thomasdondorf/puppeteer-cluster [done]
### 28.2.2019
- write test case for multiple browsers/proxies
- write test case and example for multiple tabs with bing
- make README.md nicer. https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template
### 11.6.2019
- TODO: fix amazon scraping
- change api of remaining test cases [done]
- TODO: implement custom search engine parameters on scrape()
### 12.6.2019
- remove unnecessary sleep() calls and replace with waitFor selectors
### 16.7.2019
- resolve issues
- fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]
- use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
- we will need to look at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now]
- user random user agents plugin: https://github.com/intoli/user-agents [done]
- add screenshot capability (take the screenshot after parsing)
- store as b64 [done]
### 12.8.2019
- add static test case for bing [done]
- add options that minimize `html_output` flag:
`clean_html_output` will remove all JS and CSS from the html
`clean_data_images` removes all data images from the html
[done]
### 13.8.2019
- Write test case for clean html output [done]
- Consider better compression algorithm. [done] There is the brotli algorithm, but this is only supported
in very recent versions of nodejs
- what else can we remove from the dom [done] Removing comment nodes now! They are large in BING.
- remove all whitespace and \n and \t from html
### TODO:
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
2. when using multiple browsers and random user agents, pass a random user agent to each entry in perBrowserOptions (see the sketch after this list)
3. don't create a new tab when opening a new scraper
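A hypothetical sketch for item 2, using the `user-agents` package already referenced above (the exact shape of `perBrowserOptions` in the modified puppeteer-cluster is an assumption here):

```js
// Hypothetical: build one random desktop user agent per proxy/browser.
const UserAgent = require('user-agents');

const proxies = ['socks5://53.34.23.55:55523', 'socks4://51.11.23.22:22222'];

const perBrowserOptions = proxies.map((proxy) => ({
    proxy,
    userAgent: new UserAgent({ deviceCategory: 'desktop' }).toString(),
}));

console.log(perBrowserOptions);
```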


@ -1,45 +0,0 @@
24.12.2018
- fix interface to scrape() [DONE]
- add to Github
24.1.2018
- fix issue #3: add functionality to add keyword file
27.1.2019
- Add functionality to block images and CSS from loading as described here:
https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
29.1.2019
- implement proxy support functionality
- implement proxy check
- implement scraping more than 1 page
- do it for google
- and bing
- implement duckduckgo scraping
30.1.2019
- modify all scrapers to use the generic class where it makes sense
- Bing, Baidu, Google, Duckduckgo
7.2.2019
- add num_requests to test cases [done]
TODO:
- add captcha service solving support
- check if news instances run the same browser and if we can have one proxy per tab worker
- write test case for:
- pluggable
- full metadata (log http headers, log ip address)

4645
examples/bing_de.json Normal file

File diff suppressed because it is too large


@ -0,0 +1,85 @@
var fs = require('fs');
var path = require('path');
var os = require("os");
const se_scraper = require('./../index.js');
var filepath_de = path.join(__dirname, '/data/keywords_de.txt');
function read_keywords_from_file(fpath) {
let kws = fs.readFileSync(fpath).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
});
return kws;
}
let keywords_de = read_keywords_from_file(filepath_de);
const Cluster = {
CONCURRENCY_PAGE: 1, // shares cookies, etc.
CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts)
CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts)
};
// those options need to be provided on startup
// and cannot be given to se-scraper on scrape() calls
let browser_config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
verbose: true,
// whether to start the browser in headless mode
headless: true,
is_local: false,
throw_on_detection: false,
puppeteer_cluster_config: {
headless: true,
timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
monitor: false,
concurrency: 3, // one scraper per tab
maxConcurrency: 3, // scrape with 3 tabs
}
};
(async () => {
// scrape config can change on each scrape() call
let scrape_config_bing_de = {
// which search engine to scrape
search_engine: 'bing',
// an array of keywords to scrape
keywords: keywords_de,
// the number of pages to scrape for each keyword
num_pages: 10,
// OPTIONAL PARAMS BELOW:
// https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters
bing_settings: {
cc: 'DE', // The cc parameter determines the country to use for the query.
mkt: 'de-DE', // The mkt parameter determines the UI language to return results.
offset: 0, // Determines the results offset to use, defaults to 0.
count: 20, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
// path to output file, data will be stored in JSON
output_file: 'examples/bing_de.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
};
let results = await se_scraper.scrape(browser_config, scrape_config_bing_de);
console.dir(results.metadata, {depth: null, colors: true});
})();
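The keyword file read at the top of this example is plain text with one keyword per line, for instance a hypothetical `data/keywords_de.txt` such as:

```text
erstes keyword
zweites keyword
drittes keyword
```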

25
examples/cleaned_html.js Normal file

@ -0,0 +1,25 @@
const se_scraper = require('./../index.js');
const fs = require('fs');
(async () => {
let kw = 'news iran'
let scrape_job = {
search_engine: 'baidu',
keywords: [kw],
num_pages: 1,
html_output: true,
// whether to strip JS and CSS from the html_output
// has only an effect if `html_output` is true
clean_html_output: true,
// remove all data images from the html
clean_data_images: true,
};
var response = await se_scraper.scrape({}, scrape_job);
console.dir(response, {depth: null, colors: true});
fs.writeFileSync('example_cleaned.html', response.results[kw]['1']['html']);
})();

119
examples/custom_scraper.js Normal file

@ -0,0 +1,119 @@
const se_scraper = require('./../index.js');
/*
* This example shows how you can define your custom scraper class and use it
* within se-scraper.
*/
class EcosiaScraper extends se_scraper.Scraper {
constructor(...args) {
super(...args);
}
async parse_async(html) {
// In this example we use vanilla JavaScript to parse the
// interesting information out of the search engine results.
// You may also use an external library such as cheerio.
return await this.page.evaluate(() => {
var results = {
num_results: '',
no_results: false,
effective_query: '',
results: [],
};
document.querySelectorAll('.results .result').forEach((result) => {
var serp = {};
var title = result.querySelector('.result-title');
if (title) {
serp.title = title.innerText;
serp.link = title.getAttribute('href');
}
var green = result.querySelector('.result-url');
if (green) {
serp.green = green.getAttribute('href');
}
var snippet = result.querySelector('.result-snippet');
if (snippet) {
serp.snippet = snippet.innerText;
}
results.results.push(serp);
});
var num_res = document.querySelector('.card-title-result-count');
if (num_res) {
results.num_results = num_res.innerText;
}
results.no_results = document.querySelector('.empty-result') != null;
var effective = document.querySelector('.query-context-text .result-title');
if (effective) {
results.effective_query = effective.innerText;
}
return results;
});
}
async load_start_page() {
let startUrl = 'https://www.ecosia.org/';
await this.page.goto(startUrl);
try {
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('.pagination-next', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {
// check whether scraping was detected.
}
}
(async () => {
let scrape_job = {
search_engine: EcosiaScraper,
keywords: ['lets go boys'],
num_pages: 2,
};
var results = await se_scraper.scrape({headless: true}, scrape_job);
console.dir(results, {depth: null, colors: true});
})();


@ -0,0 +1,11 @@
var nodeIterator = document.createNodeIterator(
document.body,
NodeFilter.SHOW_COMMENT,
{ acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } }
);
// Remove all comment nodes
while(nodeIterator.nextNode()){
var commentNode = nodeIterator.referenceNode;
commentNode.remove();
}

97
examples/for_the_lulz.js Normal file

@ -0,0 +1,97 @@
/*
* Do not run this, this is probably illegal in your country ;)
*/
const se_scraper = require('./../index.js');
const got = require('got');
// generate some google dorks
function genGoogleDorks(iter=4) {
let lulz_keywords = [];
['seite', 'inicio', 'index'].forEach((x) => {
for (var i = 0; i < iter; i++) {
lulz_keywords.push(
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
)
}
});
return lulz_keywords;
}
const lulz_keywords = genGoogleDorks();
console.log(lulz_keywords);
// those options need to be provided on startup
// and cannot be passed to se-scraper on scrape() calls
let browser_config = {
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
headless: true,
is_local: false,
throw_on_detection: false,
puppeteer_cluster_config: {
headless: true,
timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
monitor: false,
concurrency: 3, // 3 == CONCURRENCY_BROWSER
maxConcurrency: 4, // 4 browsers will scrape in parallel
}
};
(async () => {
// scrape config can change on each scrape() call
let lulz_config = {
// which search engine to scrape
search_engine: 'google',
// an array of keywords to scrape
keywords: lulz_keywords,
// the number of pages to scrape for each keyword
num_pages: 3,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
// path to output file, data will be stored in JSON
output_file: 'goodboys.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
};
let results = await se_scraper.scrape(browser_config, lulz_config);
const all_links = [];
for (var kw in results) {
for (var page in results[kw]) {
for (var res of results[kw][page]['results']) {
all_links.push(res.link);
}
}
}
console.log(all_links);
for (var link of all_links) {
try {
const response = await got(link.replace(/(id=\d+)/g, "$1'"));
let html = response.body;
if (html.includes('error') || html.includes('mysql')) {
console.log('Got a mysql injection in ' + link);
}
} catch (error) {
console.log(error.response.statusCode);
}
}
})();

23
examples/gimage.js Normal file

@ -0,0 +1,23 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
output_file: '',
};
let scrape_job = {
search_engine: 'google_image',
keywords: ['manaslu', 'everest', 'pitcairn'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();

26
examples/gnold.js Normal file

@ -0,0 +1,26 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
output_file: 'examples/results/gnold.json',
google_news_old_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'fr', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
};
let scrape_job = {
search_engine: 'google_news_old',
keywords: ['news world'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();

30
examples/google_maps.js Normal file

@ -0,0 +1,30 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
output_file: 'examples/results/maps.json',
test_evasion: false,
block_assets: false,
headless: false,
google_maps_settings: {
scrape_in_detail: false,
}
};
let scrape_job = {
search_engine: 'google_maps',
keywords: ['Berlin Zahnarzt'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();

21
examples/minimal.js Normal file

@ -0,0 +1,21 @@
const se_scraper = require('./../index.js');
(async () => {
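// keywords may be arbitrary strings; here full LinkedIn profile
// URLs are simply searched as Google queries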
let kws = [
'https://www.linkedin.com/in/aakanksha-majhi-b24a8449',
'https://www.linkedin.com/in/aakash-srivastava-7374a830',
'https://www.linkedin.com/in/aakash-tiwari-019b8569',
];
let scrape_job = {
search_engine: 'google',
keywords: kws,
num_pages: 1,
};
var results = await se_scraper.scrape({}, scrape_job);
console.dir(results, {depth: null, colors: true});
})();


@ -0,0 +1,35 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
search_engine: 'google',
random_user_agent: true,
is_local: false,
html_output: false,
throw_on_detection: false,
headless: true,
puppeteer_cluster_config: {
headless: true,
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: 3, // 3 == CONCURRENCY_BROWSER
maxConcurrency: 3, // 3 browsers will scrape
},
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();


@ -1,35 +1,29 @@
const se_scraper = require('../index.js');
const se_scraper = require('./../src/node_scraper.js');
async function multiple_search_engines() {
(async () => {
let browser_config = {
random_user_agent: true,
write_meta_data: true,
sleep_range: '[1,1]',
headless: true,
output_file: `examples/results/multiple_search_engines.json`
};
var searchEnginesList = ['google', 'bing'];
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
num_pages: 1,
};
for (let index = 0; index < searchEnginesList.length; index++) {
const searchEngine = searchEnginesList[index];
let config = {
random_user_agent: true,
write_meta_data: true,
sleep_range: '[1,1]',
search_engine: searchEngine,
debug: false,
verbose: false,
// the list of keywords to scrape
keywords: ['scrapeulous.com',],
// whether to start the browser in headless mode
headless: true,
output_file: `${searchEngine}.json`
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
await se_scraper.scrape(config, (err, response) => {
if (err) {
console.error(err)
}
console.dir(response.results, {
depth: null,
colors: true
});
});
for (var se of ['google', 'bing']) {
scrape_job.search_engine = se;
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
}
}
multiple_search_engines();
await scraper.quit();
})();

134
examples/multiple_tabs.js Normal file

@ -0,0 +1,134 @@
const se_scraper = require('./../index.js');
const Cluster = {
CONCURRENCY_PAGE: 1, // shares cookies, etc.
CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts)
CONCURRENCY_BROWSER: 3, // no cookie sharing, one browser process per worker
};
let keywords = ['New York',
'Los Angeles',
'Chicago',
'Houston',
'Philadelphia',
'Phoenix',
'San Antonio',
'San Diego',
'Dallas',
'San Jose',
'Austin',
'Indianapolis',
'Jacksonville',
'San Francisco',
'Columbus',
'Charlotte',
'Fort Worth',
'Detroit',
'El Paso',
'Memphis',
'Seattle',
'Denver',
'Washington',
'Boston',
'Nashville-Davidson',
'Baltimore',
'Oklahoma City',
'Louisville/Jefferson County',
'Portland',
'Las Vegas',
'Milwaukee',
'Albuquerque',
'Tucson',
'Fresno',
'Sacramento',
'Long Beach',
'Kansas City',
'Mesa',
'Virginia Beach',
'Atlanta',
'Colorado Springs',
'Omaha',
'Raleigh',
'Miami',
'Oakland',
'Minneapolis',
'Tulsa',
'Cleveland',
'Wichita',
'Arlington',
'New Orleans',
'Bakersfield',
'Tampa',
'Honolulu',
'Aurora',
'Anaheim',
'Santa Ana',
'St. Louis',
'Riverside',
'Corpus Christi',
'Lexington-Fayette',
'Pittsburgh',
'Anchorage',
'Stockton',
'Cincinnati',
'St. Paul',
'Toledo',
'Greensboro',
'Newark',
'Plano',
'Henderson',
'Lincoln',
'Buffalo',
'Jersey City',
'Chula Vista',
'Fort Wayne',
'Orlando',
'St. Petersburg',
'Chandler',
'Laredo',
'Norfolk',
'Durham',
'Madison',
'Lubbock',
'Irvine',
'Winston-Salem',
'Glendale',
'Garland',
'Hialeah',
'Reno',
'Chesapeake',
'Gilbert',
'Baton Rouge',
'Irving',
'Scottsdale',
'North Las Vegas',
'Fremont',
'Boise City',
'Richmond',
'San Bernardino'];
let config = {
search_engine: 'bing',
debug: false,
verbose: true,
keywords: keywords,
num_pages: 1, // how many pages per keyword
output_file: 'examples/results/bing.json',
log_ip_address: false,
headless: true,
puppeteer_cluster_config: {
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
monitor: false,
concurrency: Cluster.CONCURRENCY_PAGE, // one scraper per tab
maxConcurrency: 7, // scrape with 7 tabs
}
};
function callback(err, response) {
if (err) {
console.error(err)
}
console.dir(response, {depth: null, colors: true});
}
se_scraper.scrape(config, callback);


@ -0,0 +1,76 @@
const puppeteer = require('puppeteer');
const ProxyChain = require('proxy-chain');
const ROUTER_PROXY = 'http://127.0.0.1:8000';
// SEE: https://github.com/GoogleChrome/puppeteer/issues/678
// Idea: set up a local router proxy that assigns requests, identified by unique user-agent
// strings, to distinct upstream proxies. This way it is possible to use one proxy per Chromium tab.
// Downside: neither fast nor efficient.
const uas = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
];
const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181'];
(async () => {
const browser = await puppeteer.launch({
headless: false,
args: [`--proxy-server=${ROUTER_PROXY}`],
});
const page1 = await browser.newPage();
const page2 = await browser.newPage();
try {
await page1.setUserAgent(uas[0]);
await page1.goto('https://www.whatsmyip.org/');
} catch (e) {
console.log(e);
}
try {
await page2.setUserAgent(uas[1]);
await page2.goto('https://www.whatsmyip.org/');
} catch (e) {
console.log(e);
}
//await browser.close();
})();
const server = new ProxyChain.Server({
// Port where the server will listen. Defaults to 8000.
port: 8000,
// Enables verbose logging
verbose: true,
prepareRequestFunction: ({
request,
username,
password,
hostname,
port,
isHttp,
}) => {
var upstreamProxyUrl;
if (request.headers['user-agent'] === uas[0]) {
upstreamProxyUrl = proxies[0];
}
if (request.headers['user-agent'] === uas[1]) {
upstreamProxyUrl = proxies[1];
}
console.log('Using proxy: ' + upstreamProxyUrl);
return { upstreamProxyUrl };
},
});
server.listen(() => {
console.log(`Router Proxy server is listening on port ${8000}`);
});


@ -9,17 +9,13 @@ module.exports = class Pluggable {
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--user-agent=Chrome',
'--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
],
userAgent = 'Chrome',
headless = true,
} = options;
this.chromeFlags = chromeFlags;
this.userAgent = userAgent;
this.headless = headless;
this.chromeFlags.push(this.userAgent);
}
async close_browser() {
@ -65,4 +61,9 @@ module.exports = class Pluggable {
return this.browser;
}
async do_work(page) {
// do some scraping work and return results and num_requests
}
};


@ -0,0 +1,31 @@
const se_scraper = require('./../src/node_scraper.js');
const resolve = require('path').resolve;
(async () => {
let browser_config = {
test_evasion: false,
log_http_headers: true,
log_ip_address: true,
random_user_agent: false,
apply_evasion_techniques: false,
screen_output: false,
custom_func: resolve('./examples/pluggable.js'),
headless: false,
};
let scrape_job = {
search_engine: 'google',
keywords: ['news usa'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();

29
examples/proxies.js Normal file

@ -0,0 +1,29 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
output_file: 'examples/results/proxyresults.json',
log_ip_address: true,
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
proxy_file: '/home/nikolai/.proxies', // one proxy per line
// whether to use proxies only
// when this is set to true, se-scraper will not use
// your default IP address in a browser
use_proxies_only: true,
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();


@ -1,17 +1,36 @@
const se_scraper = require('./../index.js');
const se_scraper = require('./../src/node_scraper.js');
let config = {
search_engine: 'duckduckgo',
debug: false,
verbose: false,
keywords: ['news'],
num_pages: 2,
output_file: 'data.json',
};
(async () => {
let browser_config = {
test_evasion: false,
log_http_headers: false,
log_ip_address: false,
random_user_agent: false,
apply_evasion_techniques: true,
screen_output: false,
html_output: false,
clean_html_output: true,
};
function callback(err, response) {
if (err) { console.error(err) }
console.dir(response, {depth: null, colors: true});
}
let scrape_job = {
search_engine: 'google',
keywords: ['buy a nice car'],
num_pages: 1,
google_settings: {
"gl": "us",
"hl": "en",
"start": 0,
"num": 10
}
};
se_scraper.scrape(config, callback);
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
await scraper.quit();
})();

30
examples/reusing.js Normal file

@ -0,0 +1,30 @@
const se_scraper = require('./../src/node_scraper.js');
(async () => {
let browser_config = {
output_file: 'examples/results/data.json',
};
let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
num_pages: 1,
};
let scrape_job2 = {
search_engine: 'bing',
keywords: ['test', 'what a wonderful world'],
num_pages: 1,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
await scraper.start();
var results = await scraper.scrape(scrape_job);
console.dir(results, {depth: null, colors: true});
var results2 = await scraper.scrape(scrape_job2);
console.dir(results2, {depth: null, colors: true});
await scraper.quit();
})();

87
examples/test_cluster.js Normal file

@ -0,0 +1,87 @@
const { Cluster } = require('../../puppeteer-cluster/dist/index.js');
var fs = require('fs');
var os = require("os");
const PROXY_FILE = '/home/nikolai/.proxies';
function read_items_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
});
return kws;
}
(async () => {
let browserArgs = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certificate-errors',
'--ignore-certificate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
let proxies = read_items_from_file(PROXY_FILE);
console.dir(proxies);
// each new call to workerInstance() will shift() one element
// from this list, so maxConcurrency should be equal to
// perBrowserOptions.length
// the first browser config uses the home IP
let perBrowserOptions = [{
headless: false,
ignoreHTTPSErrors: true,
args: browserArgs
}];
for (var proxy of proxies) {
perBrowserOptions.push({
headless: false,
ignoreHTTPSErrors: true,
args: browserArgs.concat(`--proxy-server=${proxy}`)
})
}
const cluster = await Cluster.launch({
monitor: true,
timeout: 12 * 60 * 60 * 1000, // 12 hours in ms
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: perBrowserOptions.length,
puppeteerOptions: {
headless: false,
args: browserArgs,
ignoreHTTPSErrors: true,
},
perBrowserOptions: perBrowserOptions
});
// Event handler to be called in case of problems
cluster.on('taskerror', (err, data) => {
console.log(`Error crawling ${data}: ${err.message}`);
});
await cluster.task(async ({ page, data: url }) => {
await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000});
const pageTitle = await page.evaluate(() => document.title);
console.log(`Page title of ${url} is ${pageTitle}`);
console.log(await page.content());
});
for(var i = 0; i < perBrowserOptions.length; i++) {
await cluster.queue('http://ipinfo.io/json');
}
await cluster.idle();
await cluster.close();
})();

40
examples/test_promise.js Normal file

@ -0,0 +1,40 @@
class Test {
constructor(options = {}) {
const {
config = {},
} = options;
this.config = config;
}
run(vars) {
console.log(this.config)
}
}
let o1 = new Test({config: {a: Math.random()}});
let o2 = new Test({config: {a: Math.random()}});
o1.run()
o2.run()
// (async () => {
//
// let prom = [];
//
// for (var i = 0; i < 3; i++) {
// var obj = new Test({
// config: {a: Math.random()},
// });
// prom.push(new Promise(resolve => {
// setTimeout(() => { new Test({
// config: {a: Math.random()},
// }).run(); resolve() }, 1000);
// }));
// }
//
// let res = await Promise.all(prom);
// console.log(res);
//
// })();


@ -0,0 +1,29 @@
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
args: [
// SET PROXY HERE
'--proxy-server=socks5://IP:PORT',
'--disable-infobars',
'--window-position=0,0',
'--ignore-certificate-errors',
'--ignore-certificate-errors-spki-list',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--disable-notifications',
'--no-sandbox',
'--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
],
headless: true
});
var page = await browser.newPage();
await page.setViewport({width: 1920, height: 926});
await page.goto('http://ipinfo.io/json');
console.log(await page.content());
await browser.close();
})();

Binary file not shown.



@ -1,81 +1,23 @@
const handler = require('./src/node_scraper.js');
var fs = require('fs');
var os = require("os");
const se_scraper = require('./src/node_scraper.js');
var Scraper = require('./src/modules/se_scraper');
exports.scrape = async function(config, callback) {
async function scrape(browser_config, scrape_config) {
// scrape config overwrites the browser_config
Object.assign(browser_config, scrape_config);
// options for scraping
event = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
// whether to select manual settings in visible mode
set_manual_settings: false,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]',
// which search engine to scrape
search_engine: 'google',
compress: false, // compress
debug: false,
verbose: false,
keywords: ['scrapeulous.com'],
// whether to start the browser in headless mode
headless: true,
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
// path to js module that extends functionality
// this module should export the functions:
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: '',
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
};
var scraper = new se_scraper.ScrapeManager(browser_config);
// overwrite default config
for (var key in config) {
event[key] = config[key];
}
await scraper.start();
if (fs.existsSync(event.keyword_file)) {
event.keywords = read_keywords_from_file(event.keyword_file);
}
var results = await scraper.scrape(scrape_config);
if (!callback) {
// called when results are ready
callback = function (err, response) {
if (err) {
console.error(err)
}
await scraper.quit();
console.dir(response.results, {depth: null, colors: true});
}
}
await handler.handler(event, undefined, callback );
};
function read_keywords_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
});
return kws;
return results;
}
module.exports = {
scrape: scrape,
ScrapeManager: se_scraper.ScrapeManager,
Scraper: Scraper,
};

2044
package-lock.json generated

File diff suppressed because it is too large.


@ -1,16 +1,17 @@
{
"name": "se-scraper",
"version": "1.1.13",
"description": "A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
"version": "1.5.7",
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
"homepage": "https://scrapeulous.com/",
"main": "index.js",
"scripts": {
"test": "mocha"
"test": "mocha test test/modules"
},
"keywords": [
"scraping",
"search-engines",
"google",
"bing",
"web-scraping"
],
"author": "Nikolai Tschacher <hire@incolumitas.com> (https://incolumitas.com/)",
@ -20,9 +21,25 @@
},
"license": "ISC",
"dependencies": {
"chai": "^4.2.0",
"cheerio": "^1.0.0-rc.2",
"cheerio": "^1.0.0-rc.3",
"debug": "^4.1.1",
"got": "^9.6.0",
"puppeteer": "^1.12.2"
"lodash": "^4.17.14",
"puppeteer": "^2.0.0",
"puppeteer-cluster": "^0.18.0",
"puppeteer-extra": "^2.1.3",
"puppeteer-extra-plugin-stealth": "^2.2.2",
"user-agents": "^1.0.378",
"winston": "^3.2.1"
},
"devDependencies": {
"bluebird": "^3.7.2",
"chai": "^4.2.0",
"chai-string": "^1.5.0",
"express": "^4.17.1",
"http-mitm-proxy": "^0.8.2",
"key-cert": "^1.0.1",
"mocha": "^6.1.4",
"ua-parser-js": "^0.7.21"
}
}

105
run.js

@ -1,35 +1,22 @@
const se_scraper = require('./index.js');
const resolve = require('path').resolve;
let config = {
// those options need to be provided on startup
// and cannot be passed to se-scraper on scrape() calls
let browser_config = {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: true,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google',
// whether debug information should be printed
// debug info is useful for developers when debugging
debug: false,
// whether verbose program output should be printed
// this output is informational
verbose: true,
// an array of keywords to scrape
keywords: ['news'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 1,
random_user_agent: false,
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
output_file: 'data.json',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true,
headless: false,
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
// ...
// level 4: print all shit nobody wants to know
debug_level: 1,
// specify flags passed to chrome here
chrome_flags: [],
// path to js module that extends functionality
// this module should export the functions:
// get_browser, handle_metadata, close_browser
@ -40,26 +27,56 @@ let config = {
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
proxy: '',
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
// log ip address data
log_ip_address: true,
// log http headers
log_http_headers: true,
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
proxy_file: '',
puppeteer_cluster_config: {
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
monitor: false,
concurrency: 1, // one scraper per tab
maxConcurrency: 1, // scrape with 1 tab
}
};
function callback(err, response) {
if (err) { console.error(err) }
(async () => {
// scrape config can change on each scrape() call
let scrape_config = {
// which search engine to scrape
search_engine: 'duckduckgo',
// an array of keywords to scrape
keywords: ['cloud service'],
// the number of pages to scrape for each keyword
num_pages: 1,
/* response object has the following properties:
// OPTIONAL PARAMS BELOW:
// google_settings: {
// gl: 'us', // The gl parameter determines the Google country to use for the query.
// hl: 'fr', // The hl parameter determines the Google UI language to return results.
// start: 0, // Determines the results offset to use, defaults to 0.
// num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
// },
// instead of keywords you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: '',
// path to output file, data will be stored in JSON
output_file: '',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: false,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
};
response.results - json object with the scraping results
response.metadata - json object with metadata information
response.statusCode - status code of the scraping process
*/
let results = await se_scraper.scrape(browser_config, scrape_config);
console.dir(results, {depth: null, colors: true});
})();
// console.dir(response.results, {depth: null, colors: true});
}
se_scraper.scrape(config, callback);


@ -2,7 +2,9 @@
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/test/static_tests/html" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>


@ -0,0 +1,55 @@
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
const debug = require('debug')('se-scraper:CustomConcurrency');
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
const BROWSER_TIMEOUT = 5000;
class CustomConcurrency extends Browser {
async init() {}
async close() {}
async workerInstance() {
const options = this.options.perBrowserOptions.shift();
debug('Launch puppeteer instance with options=%o', options);
let chrome = await this.puppeteer.launch(options);
let page;
let context;
return {
jobInstance: async () => {
await timeoutExecute(BROWSER_TIMEOUT, (async () => {
context = await chrome.createIncognitoBrowserContext();
page = await context.newPage();
})());
return {
resources: {
page,
},
close: async () => {
await timeoutExecute(BROWSER_TIMEOUT, context.close());
},
};
},
close: async () => {
await chrome.close();
},
repair: async () => {
debug('Starting repair');
try {
// will probably fail, but just in case the repair was not necessary
await chrome.close();
} catch (e) {}
// just relaunch as there is only one page per browser
chrome = await this.puppeteer.launch(options);
},
};
}
};
module.exports = CustomConcurrency;


@ -1,78 +0,0 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
class BaiduScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#content_left .result').each((i, link) => {
results.push({
link: $(link).find('h3 a').attr('href'),
title: $(link).find('h3').text(),
snippet: $(link).find('.c-abstract').text(),
visible_link: $(link).find('.f13').text(),
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
no_results: false,
num_results: $('.nums_text').text(),
results: cleaned,
}
}
async load_start_page() {
try {
await this.page.goto('https://www.baidu.com/');
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="wd"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
// TODO: very very bad, but nobody uses baidu, or does someone?
await this.sleep(2000);
}
async detected() {
}
}
module.exports = {
BaiduScraper: BaiduScraper,
};


@ -3,163 +3,238 @@ const Scraper = require('./se_scraper');
class BingScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
async parse_async(html) {
// perform queries
const results = [];
$('#b_content #b_results .b_algo').each((i, link) => {
results.push({
link: $(link).find('h2 a').attr('href'),
title: $(link).find('h2').text(),
snippet: $(link).find('.b_caption p').text(),
visible_link: $(link).find('cite').text(),
})
});
let results = await this.page.evaluate(() => {
// 'Including results for', 'Einschließlich Ergebnisse'
let no_results = this.no_results(
['There are no results', 'Es gibt keine Ergebnisse'],
$('#b_results').text()
);
let _text = (el, s) => {
let n = el.querySelector(s);
let effective_query = $('#sp_requery a').first().text() || '';
if (n) {
return n.innerText;
} else {
return '';
}
};
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
let _attr = (el, s, attr) => {
let n = el.querySelector(s);
return {
time: (new Date()).toUTCString(),
no_results: no_results,
effective_query: effective_query,
num_results: $('#b_content .sb_count').text(),
results: cleaned,
}
}
if (n) {
return n.getAttribute(attr);
} else {
return null;
}
};
async load_start_page() {
try {
await this.page.goto('https://www.bing.com/');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
let results = {
num_results: '',
no_results: false,
effective_query: '',
results: [],
ads: [],
right_side_ads: [],
};
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
let num_results_el = document.querySelector('#b_content .sb_count');
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
if (num_results_el) {
results.num_results = num_results_el.innerText;
}
return true;
}
let organic_results = document.querySelectorAll('#b_content #b_results .b_algo');
async wait_for_results() {
await this.page.waitForSelector('#b_content', { timeout: 5000 });
await this.sleep(500);
}
organic_results.forEach((el) => {
async detected() {
// TODO: I was actually never detected by bing. those are good guys.
}
let serp_obj = {
link: _attr(el, 'h2 a', 'href'),
title: _text(el, 'h2'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, 'cite'),
};
results.results.push(serp_obj);
});
// check if no results
results.no_results = (results.results.length === 0);
// parse bing ads
let ads = document.querySelectorAll('#b_results .b_ad .sb_add');
ads.forEach((el) => {
let ad_obj = {
title: _text(el, 'h2 a'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, '.b_adurl cite'),
tracking_link: _attr(el, 'h2 a', 'href'),
};
results.ads.push(ad_obj);
});
// right side ads
let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add');
right_side_ads.forEach((el) => {
let ad_obj = {
title: _text(el, 'h2 a'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, '.b_adurl cite'),
tracking_link: _attr(el, 'h2 a', 'href'),
};
results.right_side_ads.push(ad_obj);
});
let effective_query_el = document.querySelector('#sp_requery a');
if (effective_query_el) {
results.effective_query = effective_query_el.innerText;
}
return results;
});
results.results = this.clean_results(results.results, ['title', 'link']);
results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']);
results.time = (new Date()).toUTCString();
return results;
}
async load_start_page() {
let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
if (this.config.bing_settings) {
startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`;
if (this.config.bing_settings.bing_domain) {
startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`;
} else {
startUrl = `https://www.bing.com/search?`;
}
for (var key in this.config.bing_settings) {
if (key !== 'bing_domain') {
startUrl += `${key}=${this.config.bing_settings[key]}&`
}
}
}
await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
this.last_response = await Promise.all([
next_page_link.click(), // The promise resolves after navigation has finished
this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
]);
return true;
}
async wait_for_results() {
await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {
// TODO: I was actually never detected by bing. those are good boys.
}
}
class BingNewsScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#algocore .newsitem').each((i, link) => {
results.push({
link: $(link).attr('url'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.snippet').text(),
date: $(link).find('.source span').last().text(),
})
});
// perform queries
const results = [];
$('#algocore .newsitem').each((i, link) => {
results.push({
link: $(link).attr('url'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.snippet').text(),
date: $(link).find('.source span').last().text(),
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),
results: cleaned,
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned,
}
}
async load_start_page() {
try {
await this.page.goto('https://www.bing.com/news/search?');
if (this.config.set_manual_settings === true) {
console.log('Sleeping 30 seconds. Set your settings now.');
await this.sleep(30000);
}
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async load_start_page() {
let startUrl = 'https://www.bing.com/news/search?';
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
try {
await this.page.goto(startUrl);
if (this.config.set_manual_settings === true) {
console.log('Sleeping 30 seconds. Set your settings now.');
await this.sleep(30000);
}
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async wait_for_results() {
await this.page.waitForSelector('#news', { timeout: 5000 });
await this.sleep(2000);
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
async detected() {
// TODO: I was actually never detected by bing news.
}
this.last_response = await Promise.all([
next_page_link.click(), // The promise resolves after navigation has finished
this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
]);
return true;
}
async wait_for_results() {
await this.page.waitForSelector('#news', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {
// TODO: I was actually never detected by bing news.
}
}
module.exports = {
BingNewsScraper: BingNewsScraper,
BingScraper: BingScraper,
};
BingNewsScraper: BingNewsScraper,
BingScraper: BingScraper,
};


@ -1,15 +1,18 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
const debug = require('debug')('se-scraper:DuckduckgoScraper');
class DuckduckgoScraper extends Scraper {
parse(html) {
debug('parse');
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result__body').each((i, link) => {
const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
$(organicSelector).each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
@ -19,35 +22,40 @@ class DuckduckgoScraper extends Scraper {
});
});
const ads = [];
$('.results--ads .result').each((i, element) => {
ads.push({
visible_link: $(element).find('.result__url').text(),
tracking_link: $(element).find('.result__title .result__a').attr('href'),
title: $(element).find('.result__title .result__a').text(),
snippet: $(element).find('.result__snippet').text(),
})
});
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),
effective_query: effective_query,
results: cleaned
results: cleaned,
ads: ads,
}
}
async load_start_page() {
try {
await this.page.goto('https://duckduckgo.com/');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
debug('load_start_page');
let startUrl = 'https://duckduckgo.com/';
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
return true;
}
async search_keyword(keyword) {
debug('search_keyword');
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
@ -56,90 +64,20 @@ class DuckduckgoScraper extends Scraper {
}
async next_page() {
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
debug('next_page');
let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
if (!next_page_link) {
return false;
}
await next_page_link.click();
//await this.page.waitForNavigation();
await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
await this.sleep(250);
}
async detected() {
}
}
class DuckduckgoNewsScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result--news').each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
date: $(link).find('.result__timestamp').text(),
snippet: $(link).find('.result__snippet').text(),
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned
}
}
async load_start_page() {
try {
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
await this.sleep(1500);
debug('wait_for_results');
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {
@ -147,6 +85,5 @@ class DuckduckgoNewsScraper extends Scraper {
}
module.exports = {
DuckduckgoNewsScraper: DuckduckgoNewsScraper,
DuckduckgoScraper: DuckduckgoScraper,
};

File diff suppressed because it is too large.


@ -41,8 +41,11 @@ class InfospaceScraper extends Scraper {
}
async load_start_page() {
let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
try {
await this.page.goto('http://infospace.com/index.html');
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
@ -64,14 +67,13 @@ class InfospaceScraper extends Scraper {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
this.last_response = await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
await this.sleep(250);
}
async detected() {
@ -98,14 +100,7 @@ class WebcrawlerNewsScraper extends Scraper {
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);
return {
time: (new Date()).toUTCString(),
@ -115,7 +110,7 @@ class WebcrawlerNewsScraper extends Scraper {
async load_start_page() {
try {
await this.page.goto('https://www.webcrawler.com/?qc=news');
this.last_response = await this.page.goto('https://www.webcrawler.com/?qc=news');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
@ -144,7 +139,6 @@ class WebcrawlerNewsScraper extends Scraper {
async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 });
await this.sleep(150);
}
async detected() {


@ -1,33 +1,31 @@
const cheerio = require('cheerio');
module.exports = {
get_ip_data: get_ip_data,
get_http_headers: get_http_headers,
get_ip_data: get_ip_data,
get_http_headers: get_http_headers,
};
async function get_ip_data(browser) {
const page = await browser.newPage();
await page.goto('https://ipinfo.io/json', {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
let ipinfo_text = $('pre').text();
return JSON.parse(ipinfo_text);
async function get_ip_data(page) {
await page.goto('https://ipinfo.io/json', {
waitLoad: true,
waitNetworkIdle: true
});
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
let ipinfo_text = $('pre').text();
return JSON.parse(ipinfo_text);
}
async function get_http_headers(browser) {
const page = await browser.newPage();
await page.goto('https://httpbin.org/get', {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let headers = await page.content();
async function get_http_headers(page) {
await page.goto('https://httpbin.org/get', {
waitLoad: true,
waitNetworkIdle: true
});
let headers = await page.content();
const $ = cheerio.load(headers);
let headers_text = $('pre').text();
return JSON.parse(headers_text);
const $ = cheerio.load(headers);
let headers_text = $('pre').text();
return JSON.parse(headers_text);
}


@ -1,7 +1,6 @@
const start_url = {
'google': ''
};
'use strict';
const meta = require('./metadata.js');
const debug = require('debug')('se-scraper:Scraper');
/*
Get useful JS knowledge and get awesome...
@ -11,21 +10,28 @@ const start_url = {
module.exports = class Scraper {
constructor(options = {}) {
debug('constructor');
const {
browser = null,
config = {},
context = {},
pluggable = null,
page = null,
} = options;
this.page = page;
this.last_response = null; // the last response object
this.metadata = {
scraping_detected: false,
};
this.pluggable = pluggable;
this.browser = browser;
this.config = config;
this.logger = this.config.logger;
this.context = context;
this.STANDARD_TIMEOUT = 8000;
// longer timeout when using proxies
this.PROXY_TIMEOUT = 15000;
this.proxy = config.proxy;
this.keywords = config.keywords;
this.STANDARD_TIMEOUT = 10000;
this.SOLVE_CAPTCHA_TIME = 45000;
this.results = {};
@ -34,20 +40,42 @@ module.exports = class Scraper {
this.num_requests = 0;
// keep track of the keywords searched
this.num_keywords = 0;
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
if (typeof settings === 'string') {
settings = JSON.parse(settings);
this.config[`${this.config.search_engine}_settings`] = settings;
}
}
}
async run() {
async run({page, data, worker}) {
let do_continue = await this.load_search_engine();
debug('worker=%o', worker, this.config.keywords);
if (page) {
this.page = page;
}
await this.page.setViewport({ width: 1920, height: 1040 });
let do_continue = true;
if (this.config.scrape_from_file.length <= 0) {
do_continue = await this.load_search_engine();
}
if (!do_continue) {
console.error('Failed to load the search engine: load_search_engine()');
return this.results;
} else {
await this.scraping_loop();
}
await this.scraping_loop();
return this.results;
return {
results: this.results,
metadata: this.metadata,
num_requests: this.num_requests,
}
}
/**
@ -58,10 +86,10 @@ module.exports = class Scraper {
*/
async load_search_engine() {
this.page = await this.browser.newPage();
// prevent detection by evading common detection techniques
await evadeChromeHeadlessDetection(this.page);
if (this.config.apply_evasion_techniques === true) {
// prevent detection by evading common detection techniques
await evadeChromeHeadlessDetection(this.page);
}
// block some assets to speed up scraping
if (this.config.block_assets === true) {
@ -79,12 +107,35 @@ module.exports = class Scraper {
if (this.config.test_evasion === true) {
// Navigate to the page that will perform the tests.
const testUrl = 'https://intoli.com/blog/' +
'not-possible-to-block-chrome-headless/chrome-headless-test.html';
const testUrl = 'https://bot.sannysoft.com';
await this.page.goto(testUrl);
// Save a screenshot of the results.
await this.page.screenshot({path: 'headless-test-result.png'});
await this.page.screenshot({path: 'headless-evasion-result.png'});
}
if (this.config.log_http_headers === true) {
this.metadata.http_headers = await meta.get_http_headers(this.page);
debug('this.metadata.http_headers=%O', this.metadata.http_headers);
}
if (this.config.log_ip_address === true) {
let ipinfo = await meta.get_ip_data(this.page);
this.metadata.ipinfo = ipinfo;
debug('this.metadata.ipinfo', this.metadata.ipinfo);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (this.proxy && this.config.log_ip_address === true) {
debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
throw new Error(`Proxy check failed: ipinfo.io reports ${this.metadata.ipinfo.ip}, which does not match the configured proxy ${this.proxy}`);
} else {
this.logger.info(`Using valid Proxy: ${this.proxy}`);
}
}
return await this.load_start_page();
@ -98,37 +149,39 @@ module.exports = class Scraper {
* @returns {Promise<void>}
*/
async scraping_loop() {
for (let keyword of this.config.keywords) {
for (var keyword of this.keywords) {
this.num_keywords++;
this.keyword = keyword;
this.results[keyword] = {};
this.result_rank = 1;
if (this.pluggable.before_keyword_scraped) {
await this.pluggable.before_keyword_scraped({
num_keywords: this.num_keywords,
num_requests: this.num_requests,
keyword: keyword,
page: this.page,
config: this.config,
context: this.context,
});
}
let page_num = 1;
try {
await this.search_keyword(keyword);
if (this.pluggable && this.pluggable.before_keyword_scraped) {
await this.pluggable.before_keyword_scraped({
results: this.results,
num_keywords: this.num_keywords,
num_requests: this.num_requests,
keyword: keyword,
});
}
this.page_num = 1;
// load scraped page from file if `scrape_from_file` is given
if (this.config.scrape_from_file.length <= 0) {
await this.search_keyword(keyword);
} else {
this.last_response = await this.page.goto(this.config.scrape_from_file);
}
// when searching the keyword fails, num_requests will not
// be incremented.
this.num_requests++;
do {
if (this.config.verbose === true) {
console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
}
this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
await this.wait_for_results();
@ -138,13 +191,66 @@ module.exports = class Scraper {
let html = await this.page.content();
let parsed = this.parse(html);
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
page_num += 1;
if (this.config.screen_output) {
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
encoding: 'base64',
fullPage: false,
});
}
if (this.config.html_output) {
if (this.config.clean_html_output) {
await this.page.evaluate(() => {
// remove script and style tags
Array.prototype.slice.call(document.getElementsByTagName('script')).forEach(
function(item) {
item.remove();
});
Array.prototype.slice.call(document.getElementsByTagName('style')).forEach(
function(item) {
item.remove();
});
// remove all comment nodes
var nodeIterator = document.createNodeIterator(
document.body,
NodeFilter.SHOW_COMMENT,
{ acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } }
);
while(nodeIterator.nextNode()){
var commentNode = nodeIterator.referenceNode;
commentNode.remove();
}
});
}
if (this.config.clean_data_images) {
await this.page.evaluate(() => {
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
function(item) {
let src = item.getAttribute('src');
if (src && src.startsWith('data:')) {
item.setAttribute('src', '');
}
});
});
}
let html_contents = await this.page.content();
// https://stackoverflow.com/questions/27841112/how-to-remove-white-space-between-html-tags-using-javascript
// TODO: not sure if this is safe!
html_contents = html_contents.replace(/>\s+</g,'><');
this.results[keyword][this.page_num].html = html_contents;
}
this.page_num += 1;
// only load the next page when we will pass the next iteration
// step from the while loop
if (page_num <= this.config.num_pages) {
if (this.page_num <= this.config.num_pages) {
let next_page_loaded = await this.next_page();
@@ -155,36 +261,66 @@ module.exports = class Scraper {
}
}
} while (page_num <= this.config.num_pages);
} while (this.page_num <= this.config.num_pages);
} catch (e) {
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
debug('this.last_response=%O', this.last_response);
if (await this.detected() === true) {
console.error(`${this.config.search_engine} DETECTED the scraping!`);
if (this.config.take_screenshot_on_error) {
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}
this.metadata.scraping_detected = await this.detected();
if (this.metadata.scraping_detected === true) {
this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);
if (this.config.is_local === true) {
await this.sleep(this.SOLVE_CAPTCHA_TIME);
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
// expect that user filled out necessary captcha
} else {
break;
if (this.config.throw_on_detection === true) {
throw( e );
} else {
return;
}
}
} else {
// some other error, quit scraping process if stuff is broken
if (this.config.is_local === true) {
console.error('You have 30 seconds to fix this.');
await this.sleep(30000);
if (this.config.throw_on_detection === true) {
throw( e );
} else {
break;
return;
}
}
}
}
}
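With html_output enabled, the loop above optionally strips script tags, style tags and comment nodes (clean_html_output), blanks out data: image URIs (clean_data_images) and finally collapses whitespace between tags before storing the page source. A condensed sketch of that cleanup, assuming a Puppeteer page object; cleanedHtml is just an illustrative name:

// Sketch: shrink a SERP page before storing its HTML (same idea as above).
async function cleanedHtml(page) {
    await page.evaluate(() => {
        // drop script and style tags
        document.querySelectorAll('script, style').forEach((el) => el.remove());
        // drop all comment nodes below body
        const it = document.createNodeIterator(document.body, NodeFilter.SHOW_COMMENT);
        let node;
        while ((node = it.nextNode())) {
            node.remove();
        }
    });
    // collapsing inter-tag whitespace can alter whitespace-sensitive markup,
    // which is why the original code carries a TODO next to this regex
    return (await page.content()).replace(/>\s+</g, '><');
}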
/**
* Generic function to append queryArgs to a search engine url.
*
* @param: The baseUrl to use for the build process.
*/
build_start_url(baseUrl) {
let settings = this.config[`${this.config.search_engine}_settings`];
if (settings) {
for (var key in settings) {
baseUrl += `${key}=${settings[key]}&`
}
this.logger.info('Using startUrl: ' + baseUrl);
return baseUrl;
}
return false;
}
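build_start_url appends every key/value pair from the engine-specific settings object ("<search_engine>_settings") as a query argument and returns false when no such settings exist. An illustrative trace with made-up values:

// Illustrative only: the keys of google_settings are passed through verbatim.
const config = {
    search_engine: 'google',
    google_settings: { hl: 'en', num: 100 },
};
// build_start_url('https://www.google.com/search?') would then return
// 'https://www.google.com/search?hl=en&num=100&' (note the trailing '&')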
sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
@@ -194,9 +330,7 @@ module.exports = class Scraper {
async random_sleep() {
const [min, max] = this.config.sleep_range;
let rand = Math.floor(Math.random() * (max - min + 1) + min); // random number of seconds in [min, max]
if (this.config.debug === true) {
console.log(`Sleeping for ${rand}s`);
}
this.logger.info(`Sleeping for ${rand}s`);
await this.sleep(rand * 1000);
}
@@ -210,15 +344,35 @@ module.exports = class Scraper {
no_results(needles, html) {
for (let needle of needles) {
if (html.includes(needle)) {
if (this.config.debug) {
console.log(`HTML contains needle ${needle}. no_results=true`);
}
this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
return true;
}
}
return false;
}
/*
Throw away all result elements that lack data in the
specified attributes. Each required attribute must be a non-empty string.
*/
clean_results(results, attributes) {
const cleaned = [];
for (var res of results) {
let goodboy = true;
for (var attr of attributes) {
if (!res[attr] || !res[attr].trim()) {
goodboy = false;
break;
}
}
if (goodboy) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return cleaned;
}
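clean_results keeps only results whose required attributes are non-empty strings and numbers the survivors via result_rank. A quick illustration with made-up data:

// Hypothetical input: the second entry has an empty link and is discarded.
const raw = [
    { title: 'A', link: 'https://a.example', snippet: 'first' },
    { title: 'B', link: '', snippet: 'second' },
    { title: 'C', link: 'https://c.example', snippet: 'third' },
];
// this.clean_results(raw, ['title', 'link', 'snippet']) returns the first and
// third entries with rank 1 and rank 2 respectively.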
parse(html) {
}
@@ -265,127 +419,131 @@ module.exports = class Scraper {
// This is where we'll put the code to get around the tests.
async function evadeChromeHeadlessDetection(page) {
// Pass the Webdriver Test.
await page.evaluateOnNewDocument(() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
});
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
const mockObj = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
});
// Pass the Permissions Test.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
}
Function.prototype.call = call;
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
const oldToString = Function.prototype.toString;
function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
// Pass the Webdriver Test.
await page.evaluateOnNewDocument(() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
});
});
// Pass the Languages Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
// Pass the Chrome Test.
await page.evaluateOnNewDocument(() => {
// We can mock this in as much depth as we need for the test.
const mockObj = {
app: {
isInstalled: false,
},
webstore: {
onInstallStageChanged: {},
onDownloadProgress: {},
},
runtime: {
PlatformOs: {
MAC: 'mac',
WIN: 'win',
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
OPENBSD: 'openbsd',
},
PlatformArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
PlatformNaclArch: {
ARM: 'arm',
X86_32: 'x86-32',
X86_64: 'x86-64',
},
RequestUpdateCheckStatus: {
THROTTLED: 'throttled',
NO_UPDATE: 'no_update',
UPDATE_AVAILABLE: 'update_available',
},
OnInstalledReason: {
INSTALL: 'install',
UPDATE: 'update',
CHROME_UPDATE: 'chrome_update',
SHARED_MODULE_UPDATE: 'shared_module_update',
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic',
},
},
};
window.navigator.chrome = mockObj;
window.chrome = mockObj;
});
});
// Pass the iframe Test
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
get: function() {
return window;
// Pass the Permissions Test.
await page.evaluateOnNewDocument(() => {
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({state: Notification.permission})
: originalQuery(parameters);
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
const oldCall = Function.prototype.call;
function call() {
return oldCall.apply(this, arguments);
}
});
});
// Pass toString test, though it breaks console.debug() from working
await page.evaluateOnNewDocument(() => {
window.console.debug = () => {
return null;
};
});
}
Function.prototype.call = call;
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
const oldToString = Function.prototype.toString;
function functionToString() {
if (this === window.navigator.permissions.query) {
return "function query() { [native code] }";
}
if (this === functionToString) {
return nativeToStringFunctionString;
}
return oldCall.call(oldToString, this);
}
Function.prototype.toString = functionToString;
});
// Pass the Plugins Length Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5]
});
});
// Pass the Languages Test.
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
});
// Pass the iframe Test
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
get: function () {
return window;
}
});
});
// Pass toString test, though it breaks console.debug() from working
await page.evaluateOnNewDocument(() => {
window.console.debug = () => {
return null;
};
});
}
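These evasions only apply to documents created after evaluateOnNewDocument has been registered, so the function has to run before the first navigation. A hedged usage sketch, assuming evadeChromeHeadlessDetection from above is in scope; the sannysoft page is the same one the test_evasion option visits:

// Sketch: register the evasions on a fresh page, then inspect the detector page.
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await evadeChromeHeadlessDetection(page);      // must happen before any goto()
    await page.goto('https://bot.sannysoft.com');  // headless-detection test page
    await page.screenshot({ path: 'headless-evasion-result.png' });
    await browser.close();
})();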

src/modules/ticker_search.js (deleted, 215 lines)

@@ -1,215 +0,0 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
class YahooFinanceScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
const results = [];
$('.js-stream-content .Cf').each((i, link) => {
results.push({
link: $(link).find('h3 a').attr('href'),
title: $(link).find('h3').text(),
snippet: $(link).find('p').text(),
})
});
return {
time: (new Date()).toUTCString(),
results: results,
}
}
async load_start_page() {
try {
await this.page.goto('https://finance.yahoo.com/');
for (var i = 0; i < 3; i++) {
let consent = await this.page.waitForSelector('[type="submit"]');
await consent.click();
}
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
await this.page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
await this.page.waitForSelector('#quote-header-info', { timeout: 8000 });
await this.sleep(1000);
}
async next_page() {
return false;
}
async wait_for_results() {
await this.page.waitForSelector('#b_content', { timeout: 5000 });
await this.sleep(500);
}
async detected() {
}
}
class MarketwatchFinanceScraper extends Scraper {
async parse_async(html) {
let res = await this.page.evaluate(() => {
let results = [];
// get the news article elements
let items = document.querySelectorAll('.article__content');
// extract the article data
items.forEach((newsitem) => {
let data = {};
try {
data.link = newsitem.querySelector('.article__headline a').getAttribute('href');
data.title = newsitem.querySelector('.article__headline a').innerText;
data.date = newsitem.querySelector('.article__timestamp').innerText;
data.author = newsitem.querySelector('.article__author').innerText;
}
catch (exception) {
console.error('Error parsing marketwatch data: ', exception);
}
results.push(data);
});
return results;
});
return {
time: (new Date()).toUTCString(),
results: res,
}
}
async load_start_page() {
return true;
}
async search_keyword(keyword) {
await this.page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
}
async next_page() {
return false;
}
async wait_for_results() {
await this.page.waitForSelector('.intraday__data', { timeout: 8000 });
await this.sleep(500);
}
async detected() {
}
}
class ReutersFinanceScraper extends Scraper {
async parse_async(html) {
let newsData = await this.page.evaluate(() => {
let results = [];
// get the news article elements
let items = document.querySelectorAll('div.feature');
// extract the article data
items.forEach((newsitem) => {
let data = {};
try {
data.link = newsitem.querySelector('h2 a').getAttribute('href');
data.link = 'https://www.reuters.com' + data.link;
data.title = newsitem.querySelector('h2 a').innerText;
data.snippet = newsitem.querySelector('p').innerText;
data.date = newsitem.querySelector('.timestamp').innerText;
}
catch (exception) {
console.error('Error parsing reuters data: ', exception);
}
results.push(data);
});
return results;
});
return {
time: (new Date()).toUTCString(),
results: newsData,
}
}
async load_start_page() {
return true;
}
async search_keyword(keyword) {
await this.page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
}
async next_page() {
return false;
}
async wait_for_results() {
await this.page.waitForSelector('#sectionHeader', { timeout: 8000 });
await this.sleep(500);
}
async detected() {
}
}
class CnbcFinanceScraper extends Scraper {
async parse_async(html) {
let newsData = await this.page.evaluate(() => {
let results = [];
// get the news article elements
let items = document.querySelectorAll('div.headline');
// extract the article data
items.forEach((newsitem) => {
let data = {};
try {
data.link = newsitem.querySelector('a').getAttribute('href');
data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText;
data.date = newsitem.querySelector('span.note').innerText;
}
catch (exception) {
console.error('Error parsing cnbc data: ', exception);
}
results.push(data);
});
return results;
});
return {
time: (new Date()).toUTCString(),
results: newsData,
}
}
async load_start_page() {
return true;
}
async search_keyword(keyword) {
await this.page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
}
async next_page() {
return false;
}
async wait_for_results() {
await this.page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });
await this.sleep(500);
}
async detected() {
}
}
module.exports = {
YahooFinanceScraper: YahooFinanceScraper,
ReutersFinanceScraper: ReutersFinanceScraper,
CnbcFinanceScraper: CnbcFinanceScraper,
MarketwatchFinanceScraper: MarketwatchFinanceScraper,
};
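All four finance scrapers follow the same module contract as the search-engine modules: subclass Scraper and implement load_start_page, search_keyword, next_page, wait_for_results, detected and either parse (static HTML via cheerio) or parse_async (evaluated inside the page). A minimal skeleton with a hypothetical site and selectors:

// Minimal sketch of a custom scraper module; example.com and '.result' are placeholders.
const Scraper = require('./se_scraper');

class ExampleScraper extends Scraper {
    async load_start_page() {
        this.last_response = await this.page.goto('https://example.com/');
        return true;
    }
    async search_keyword(keyword) {
        await this.page.goto(`https://example.com/search?q=${encodeURIComponent(keyword)}`);
    }
    async parse_async(html) {
        const results = await this.page.evaluate(() => {
            return Array.from(document.querySelectorAll('.result')).map((el) => ({
                title: el.querySelector('h3') ? el.querySelector('h3').innerText : '',
                link: el.querySelector('a') ? el.querySelector('a').href : '',
            }));
        });
        return {
            time: (new Date()).toUTCString(),
            results: this.clean_results(results, ['title', 'link']),
        };
    }
    async next_page() { return false; }
    async wait_for_results() { await this.page.waitForSelector('body', { timeout: 5000 }); }
    async detected() { }
}

module.exports = { ExampleScraper: ExampleScraper };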

src/modules/user_agents.js (deleted, 81 lines)

@@ -1,81 +0,0 @@
module.exports = {
random_user_agent: random_user_agent,
};
function random_user_agent() {
return user_agents[Math.floor(Math.random() * user_agents.length)];
}
// updated: 29 Jan 2019
const user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
];

src/modules/yandex.js (new file, 114 lines)

@@ -0,0 +1,114 @@
'use strict';
const Scraper = require('./se_scraper');
class YandexScraper extends Scraper {
constructor(...args) {
super(...args);
}
async parse_async(html) {
let results = await this.page.evaluate(() => {
let serp_items = document.querySelectorAll('.serp-item');
const data = [];
serp_items.forEach((item) => {
let obj = {
is_ad: false,
};
try {
if (item) {
let linkElement = item.querySelector('h2 a.link');
if (linkElement) {
obj.link = linkElement.getAttribute('href');
obj.title = linkElement.innerText;
}
let label = item.querySelector('.organic__subtitle .label');
if (label) {
let labelText = label.innerText;
if (labelText) {
labelText = labelText.trim().toLowerCase();
console.log(labelText);
let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio'];
obj.is_ad = ad_labels.includes(labelText);
}
}
obj.snippet = item.querySelector('.text-container.typo').innerText;
obj.visible_link = item.querySelector('.typo_type_greenurl').innerText;
if (obj.title) {
data.push(obj);
}
}
} catch (e) {
}
});
return data;
});
let num_results = await this.page.evaluate(() => {
let num_results = document.querySelector('.serp-adv__found');
if (num_results) {
return num_results.innerText;
}
});
const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']);
return {
time: (new Date()).toUTCString(),
num_results: num_results,
results: cleaned,
};
}
async load_start_page() {
let startUrl = 'https://yandex.com';
this.logger.info('Using startUrl: ' + startUrl);
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="text"]');
await this.set_input_value(`input[name="text"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('.pager .pager__item_kind_next');
if (!next_page_link) {
return false;
}
await next_page_link.click();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.main__content', { timeout: this.STANDARD_TIMEOUT });
}
async detected() {
}
}
module.exports = {
YandexScraper: YandexScraper,
};
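num_results above is the raw text of the '.serp-adv__found' element (something like '2 million results'); turning it into a number is left to the caller. A small hypothetical helper for that:

// Hypothetical helper: extract the leading integer from the result-count text.
function parseNumResults(text) {
    if (!text) return null;
    const match = text.replace(/[\s,.]/g, '').match(/\d+/);
    return match ? parseInt(match[0], 10) : null;
}
// parseNumResults('2 million results') === 2; the magnitude word is not interpreted.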

src/modules/youtube.js (deleted, 105 lines)

@@ -1,105 +0,0 @@
const cheerio = require('cheerio');
const Scraper = require('./se_scraper');
class YoutubeScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
results.push({
link: $(link).find('#video-title').attr('href'),
title: $(link).find('#video-title').text(),
snippet: $(link).find('#description-text').text(),
channel: $(link).find('#byline a').text(),
channel_link: $(link).find('#byline a').attr('href'),
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
})
});
let no_results = this.no_results(
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
$('yt-showing-results-for-renderer').text()
);
let effective_query = $('#corrected-link').text() || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.title = res.title.trim();
res.snippet = res.snippet.trim();
res.rank = this.result_rank++;
// check if this result has been used before
if (this.all_videos.has(res.title) === false) {
cleaned.push(res);
}
this.all_videos.add(res.title);
}
}
return {
time: (new Date()).toUTCString(),
no_results: no_results,
effective_query: effective_query,
num_results: '',
results: cleaned,
}
}
async load_start_page() {
try {
this.all_videos = new Set();
await this.page.goto('https://www.youtube.com', {
referer: 'https://google.com'
});
await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
// before we do anything, parse the results of the front page of youtube
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
await this.sleep(500);
let html = await this.page.content();
this.results['frontpage'] = this.parse(html);
this.result_rank = 1;
} catch(e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[id="search"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
// youtube needs scrolling
// TODO: implement scrolling, no priority right now
return false;
}
async wait_for_results() {
await this.page.waitForFunction(`document.title.indexOf('${this.keyword}') !== -1`, { timeout: 5000 });
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
await this.sleep(500);
}
async detected() {
const title = await this.page.title();
let html = await this.page.content();
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
}
module.exports = {
YoutubeScraper: YoutubeScraper,
};
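Because the YouTube front page is parsed before the first search and later result pages can repeat the same videos, the scraper tracks titles in the all_videos Set and skips anything it has already seen. The same de-duplication pattern in isolation:

// Sketch: drop results whose chosen key (title here) was already seen.
const seen = new Set();

function dedupe(results) {
    return results.filter((res) => {
        if (seen.has(res.title)) {
            return false;
        }
        seen.add(res.title);
        return true;
    });
}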

src/node_scraper.js (modified)

@@ -1,291 +1,411 @@
const puppeteer = require('puppeteer');
const zlib = require('zlib');
var fs = require('fs');
'use strict';
// local module imports
const fs = require('fs');
const os = require('os');
const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const { Cluster } = require('puppeteer-cluster');
const UserAgent = require('user-agents');
const google = require('./modules/google.js');
const bing = require('./modules/bing.js');
const baidu = require('./modules/baidu.js');
const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js');
const youtube = require('./modules/youtube.js');
const ua = require('./modules/user_agents.js');
const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js');
const CustomConcurrencyImpl = require('./concurrency-implementation');
const MAX_ALLOWED_BROWSERS = 6;
function write_results(fname, data) {
fs.writeFileSync(fname, data, (err) => {
if (err) throw err;
console.log(`Results written to file ${fname}`);
});
fs.writeFileSync(fname, data, (err) => {
if (err) throw err;
console.log(`Results written to file ${fname}`);
});
}
module.exports.handler = async function handler (event, context, callback) {
config = event;
pluggable = {};
if (config.custom_func) {
if (fs.existsSync(config.custom_func)) {
try {
Pluggable = require(config.custom_func);
pluggable = new Pluggable({config: config});
} catch (exception) {
console.error(exception);
}
} else {
console.error(`File "${config.custom_func}" does not exist...`);
}
}
function read_keywords_from_file(fname) {
let kws = fs.readFileSync(fname).toString().split(os.EOL);
// clean keywords
kws = kws.filter((kw) => {
return kw.trim().length > 0;
});
return kws;
}
try {
const startTime = Date.now();
config = parseEventData(config);
if (config.debug === true) {
console.log(config);
}
var ADDITIONAL_CHROME_FLAGS = [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
function getScraper(search_engine, args) {
if (typeof search_engine === 'string') {
return new {
google: google.GoogleScraper,
google_news_old: google.GoogleNewsOldScraper,
google_news: google.GoogleNewsScraper,
google_image: google.GoogleImageScraper,
bing: bing.BingScraper,
yandex: yandex.YandexScraper,
bing_news: bing.BingNewsScraper,
duckduckgo: duckduckgo.DuckduckgoScraper,
infospace: infospace.InfospaceScraper,
webcrawler: infospace.WebcrawlerNewsScraper,
}[search_engine](args);
} else if (typeof search_engine === 'function') {
return new search_engine(args);
} else {
throw new Error(`search_engine must be either a string or a class (constructor function)`);
}
}
let USER_AGENT = '';
if (config.user_agent) {
USER_AGENT = config.user_agent;
}
class ScrapeManager {
if (config.random_user_agent === true) {
USER_AGENT = ua.random_user_agent();
}
constructor(config, context={}) {
if (USER_AGENT) {
ADDITIONAL_CHROME_FLAGS.push(
`--user-agent="${USER_AGENT}"`
)
}
this.cluster = null;
this.pluggable = null;
this.scraper = null;
this.context = context;
if (config.proxy) {
// check this out bubbles
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
// "http", "socks", "socks4", "socks5".
ADDITIONAL_CHROME_FLAGS.push(
'--proxy-server=' + config.proxy,
)
}
this.config = _.defaults(config, {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false,
// whether to select manual settings in visible mode
set_manual_settings: false,
// log ip address data
log_ip_address: false,
// log http headers
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: null,
// which search engine to scrape
search_engine: 'google',
search_engine_name: 'google',
logger: createLogger({
level: 'info',
format: combine(
timestamp(),
printf(({ level, message, timestamp }) => {
return `${timestamp} [${level}] ${message}`;
})
),
transports: [
new transports.Console()
]
}),
keywords: ['nodejs rocks',],
// whether to start the browser in headless mode
headless: true,
// specify flags passed to chrome here
// About our defaults values https://peter.sh/experiments/chromium-command-line-switches/
chrome_flags: [
'--disable-infobars',
'--window-position=0,0',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1040',
'--start-fullscreen',
'--hide-scrollbars',
'--disable-notifications',
],
// the number of pages to scrape for each keyword
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to also passthru all the html output of the serp pages
html_output: false,
// whether to strip JS and CSS from the html_output
// has only an effect if `html_output` is true
clean_html_output: true,
// remove all data images from the html
clean_data_images: true,
// whether to return a screenshot of serp pages as b64 data
screen_output: false,
// Scrape url from local file. Mainly used for testing.
scrape_from_file: '',
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
// path to js module that extends functionality
// this module should export the functions:
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:
// socks5://78.94.172.42:1080
// http://118.174.233.10:48400
proxy_file: '',
// whether to use proxies only
// when this is set to true, se-scraper will not use
// your default IP address
use_proxies_only: false,
// check if headless chrome escapes common detection techniques
// this is a quick test and should be used for debugging
test_evasion: false,
apply_evasion_techniques: true,
// settings for puppeteer-cluster
puppeteer_cluster_config: {
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
monitor: false,
concurrency: Cluster.CONCURRENCY_BROWSER,
maxConcurrency: 1,
}
});
let launch_args = {
args: ADDITIONAL_CHROME_FLAGS,
headless: config.headless,
ignoreHTTPSErrors: true,
};
this.logger = this.config.logger;
if (config.debug === true) {
console.log("Chrome Args: ", launch_args);
}
if (config.sleep_range) {
// parse an array
config.sleep_range = eval(config.sleep_range);
if (pluggable.start_browser) {
launch_args.config = config;
browser = await pluggable.start_browser(launch_args);
} else {
browser = await puppeteer.launch(launch_args);
}
if (config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
throw "sleep_range is not a valid array of two integers.";
}
}
let metadata = {};
if (fs.existsSync(this.config.keyword_file)) {
this.config.keywords = read_keywords_from_file(this.config.keyword_file);
}
if (config.log_http_headers === true) {
metadata.http_headers = await meta.get_http_headers(browser);
}
if (this.config.proxies && this.config.proxy_file) {
throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
}
if (config.log_ip_address === true) {
metadata.ipinfo = await meta.get_ip_data(browser);
}
if (this.config.proxy_file) {
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
this.logger.info(`${this.config.proxies.length} proxies read from file.`);
}
// check that our proxy is working by confirming
// that ipinfo.io sees the proxy IP address
if (config.proxy && config.log_ip_address === true) {
console.log(`${metadata.ipinfo.ip} vs ${config.proxy}`);
if (!this.config.proxies && this.config.use_proxies_only) {
throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
}
try {
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
if (!config.proxy.includes(metadata.ipinfo.ip)) {
console.error('Proxy not working properly.');
await browser.close();
return;
}
} catch (exception) {
debug('this.config=%O', this.config);
}
}
}
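Since the constructor merges the caller's config into the defaults above with _.defaults, only the options that differ need to be supplied. A minimal, hedged example, assuming the package is required as se-scraper (the tests use require('../') instead):

// Only the overrides are passed; everything else keeps the documented defaults.
const { ScrapeManager } = require('se-scraper');

const manager = new ScrapeManager({
    search_engine: 'bing',
    num_pages: 2,
    random_user_agent: true,
    output_file: 'results.json',
});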
/*
* Launches the puppeteer cluster or browser.
*
* Returns true if the browser was successfully launched. Otherwise will return false.
*/
async start() {
var results = {};
if (this.config.custom_func) {
if (fs.existsSync(this.config.custom_func)) {
try {
const PluggableClass = require(this.config.custom_func);
this.pluggable = new PluggableClass({
config: this.config,
context: this.context
});
} catch (exception) {
console.error(exception);
return false;
}
} else {
console.error(`File "${this.config.custom_func}" does not exist!`);
return false;
}
}
Scraper = {
google: google.GoogleScraper,
google_news_old: google.GoogleNewsOldScraper,
google_news: google.GoogleNewsScraper,
google_image: google.GoogleImageScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
duckduckgo: duckduckgo.DuckduckgoScraper,
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
infospace: infospace.InfospaceScraper,
webcrawler: infospace.WebcrawlerNewsScraper,
baidu: baidu.BaiduScraper,
youtube: youtube.YoutubeScraper,
yahoo_news: tickersearch.YahooFinanceScraper,
reuters: tickersearch.ReutersFinanceScraper,
cnbc: tickersearch.CnbcFinanceScraper,
marketwatch: tickersearch.MarketwatchFinanceScraper,
}[config.search_engine];
const chrome_flags = _.clone(this.config.chrome_flags);
if (Scraper === undefined) {
console.info('Currently not implemented search_engine: ', config.search_engine);
} else {
scraperObj = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});
results = await scraperObj.run();
}
if (this.pluggable && this.pluggable.start_browser) {
launch_args.config = this.config;
this.browser = await this.pluggable.start_browser({
config: this.config,
});
this.page = await this.browser.newPage();
} else {
// if no custom start_browser functionality was given
// use puppeteer-cluster for scraping
if (pluggable.close_browser) {
await pluggable.close_browser();
} else {
await browser.close();
}
let proxies;
// if we have at least one proxy, always use CONCURRENCY_BROWSER
// and set maxConcurrency to this.config.proxies.length + 1
// else use whatever this.configuration was passed
if (this.config.proxies && this.config.proxies.length > 0) {
let num_requests = scraperObj.num_requests;
let timeDelta = Date.now() - startTime;
let ms_per_request = timeDelta/num_requests;
// because we use real browsers, we ran out of memory on normal laptops
// when using more than maybe 5 or 6 browsers.
// therefore hardcode a limit here
// TODO not sure this is what we want
this.numClusters = Math.min(
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
MAX_ALLOWED_BROWSERS
);
proxies = _.clone(this.config.proxies);
if (config.verbose === true) {
console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
console.log(`On average ms/request: ${ms_per_request}ms/request`);
console.dir(results, {depth: null, colors: true});
}
// Insert a first config without proxy if use_proxy_only is false
if (this.config.use_proxies_only === false) {
proxies.unshift(null);
}
if (config.compress === true) {
results = JSON.stringify(results);
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
results = zlib.deflateSync(results).toString('base64');
}
} else {
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
proxies = _.times(this.numClusters, () => null);
}
if (pluggable.handle_results) {
await pluggable.handle_results({
config: config,
results: results,
});
}
this.logger.info(`Using ${this.numClusters} clusters.`);
metadata.id = `${config.job_name} ${config.chunk_lines}`;
metadata.chunk_lines = config.chunk_lines;
metadata.elapsed_time = timeDelta.toString();
metadata.ms_per_keyword = ms_per_request.toString();
metadata.num_requests = num_requests;
// Give the per browser options
const perBrowserOptions = _.map(proxies, (proxy) => {
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
if (config.verbose === true) {
console.log(metadata);
}
if (proxy) {
args = args.concat([`--proxy-server=${proxy}`]);
}
if (pluggable.handle_metadata) {
await pluggable.handle_metadata({metadata: metadata, config: config});
}
return {
headless: this.config.headless,
ignoreHTTPSErrors: true,
args
};
});
if (config.output_file) {
write_results(config.output_file, JSON.stringify(results));
}
debug('perBrowserOptions=%O', perBrowserOptions)
let response = {
headers: {
'Content-Type': 'text/json',
},
results: results,
metadata: metadata || {},
statusCode: 200
};
this.cluster = await Cluster.launch({
monitor: this.config.puppeteer_cluster_config.monitor,
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
concurrency: CustomConcurrencyImpl,
maxConcurrency: this.numClusters,
puppeteerOptions: {
perBrowserOptions: perBrowserOptions
}
});
}
}
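The net effect of start() is one Chromium instance per proxy, plus an optional direct-connection instance when use_proxies_only is false, each launched with its own --proxy-server and --user-agent arguments and capped at MAX_ALLOWED_BROWSERS. A condensed sketch of how those per-browser options are derived; the standalone function and its parameter names are illustrative:

// Sketch: one puppeteer launch-option object per proxy slot (null = direct connection).
function buildPerBrowserOptions(proxies, { useProxiesOnly, chromeFlags, userAgent }) {
    const slots = useProxiesOnly ? [...proxies] : [null, ...proxies];
    return slots.map((proxy) => ({
        headless: true,
        ignoreHTTPSErrors: true,
        args: chromeFlags
            .concat([`--user-agent=${userAgent}`])
            .concat(proxy ? [`--proxy-server=${proxy}`] : []),
    }));
}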
callback(null, response);
/*
* Scrapes the keywords specified by the config.
*/
async scrape(scrape_config = {}) {
} catch (e) {
callback(e, null);
}
if (!scrape_config.keywords && !scrape_config.keyword_file) {
throw new Error('Either keywords or keyword_file must be supplied to scrape()');
}
Object.assign(this.config, scrape_config);
var results = {};
var num_requests = 0;
var metadata = {};
var startTime = Date.now();
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
if (this.pluggable && this.pluggable.start_browser) {
this.scraper = getScraper(this.config.search_engine, {
config: this.config,
context: this.context,
pluggable: this.pluggable,
page: this.page,
});
var {results, metadata, num_requests} = await this.scraper.run(this.page);
} else {
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
// https://github.com/GoogleChrome/puppeteer/issues/678
// The question is: Is it possible to set proxies per Page? Per Browser?
// as far as I can see, puppeteer cluster uses the same puppeteerOptions
// for every browser instance. We will use our custom puppeteer-cluster version.
// https://www.npmjs.com/package/proxy-chain
// this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077
let chunks = [];
for (var n = 0; n < this.numClusters; n++) {
chunks.push([]);
}
for (var k = 0; k < this.config.keywords.length; k++) {
chunks[k % this.numClusters].push(this.config.keywords[k]);
}
debug('chunks=%o', chunks);
let execPromises = [];
for (var c = 0; c < chunks.length; c++) {
const config = _.clone(this.config);
config.keywords = chunks[c];
var obj = getScraper(this.config.search_engine, {
config: config,
context: {},
pluggable: this.pluggable,
});
var boundMethod = obj.run.bind(obj);
execPromises.push(this.cluster.execute({}, boundMethod));
}
let promiseReturns = await Promise.all(execPromises);
// Merge results and metadata per keyword
for (let promiseReturn of promiseReturns) {
Object.assign(results, promiseReturn.results);
Object.assign(metadata, promiseReturn.metadata);
num_requests += promiseReturn.num_requests;
}
}
let timeDelta = Date.now() - startTime;
let ms_per_request = timeDelta/num_requests;
this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
if (this.pluggable && this.pluggable.handle_results) {
await this.pluggable.handle_results(results);
}
metadata.elapsed_time = timeDelta.toString();
metadata.ms_per_keyword = ms_per_request.toString();
metadata.num_requests = num_requests;
debug('metadata=%O', metadata);
if (this.pluggable && this.pluggable.handle_metadata) {
await this.pluggable.handle_metadata(metadata);
}
if (this.config.output_file) {
this.logger.info(`Writing results to ${this.config.output_file}`);
write_results(this.config.output_file, JSON.stringify(results, null, 4));
}
return {
results: results,
metadata: metadata || {},
};
}
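Keywords are distributed round-robin over the clusters, so with K keywords and C browsers each browser scrapes roughly K/C keywords and issues about K/C * num_pages requests. The chunking step in isolation; chunkKeywords is an illustrative name:

// Sketch: round-robin split of the keyword list over numClusters browsers.
function chunkKeywords(keywords, numClusters) {
    const chunks = Array.from({ length: numClusters }, () => []);
    keywords.forEach((kw, i) => chunks[i % numClusters].push(kw));
    return chunks;
}
// chunkKeywords(['a', 'b', 'c', 'd', 'e'], 2) => [['a', 'c', 'e'], ['b', 'd']]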
/*
* Quit the puppeteer cluster/browser.
*/
async quit() {
if (this.pluggable && this.pluggable.close_browser) {
await this.pluggable.close_browser();
} else {
await this.cluster.idle();
await this.cluster.close();
}
}
}
module.exports = {
ScrapeManager: ScrapeManager,
};
function parseEventData(config) {
function _bool(e) {
e = String(e);
if (typeof e.trim === "function") {
return e.trim().toLowerCase() == 'true';
} else {
return e.toLowerCase() == 'true';
}
}
if (config.debug) {
config.debug = _bool(config.debug);
}
if (config.verbose) {
config.verbose = _bool(config.verbose);
}
if (config.upload_to_s3) {
config.upload_to_s3 = _bool(config.upload_to_s3);
}
if (config.log_ip_address) {
config.log_ip_address = _bool(config.log_ip_address);
}
if (config.log_http_headers) {
config.log_http_headers = _bool(config.log_http_headers);
}
if (config.random_user_agent) {
config.random_user_agent = _bool(config.random_user_agent);
}
if (config.compress) {
config.compress = _bool(config.compress);
}
if (config.is_local) {
config.is_local = _bool(config.is_local);
}
if (config.max_results) {
config.max_results = parseInt(config.max_results);
}
if (config.set_manual_settings) {
config.set_manual_settings = _bool(config.set_manual_settings);
}
if (config.block_assets) {
config.block_assets = _bool(config.block_assets);
}
if (config.sleep_range) {
// parse an array
config.sleep_range = eval(config.sleep_range);
if (config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
throw "sleep_range is not a valid array of two integers.";
}
}
return config;
}
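Taken together, the new API is: construct a ScrapeManager, start() it once, scrape() one or more jobs, then quit(). A hedged end-to-end sketch of that lifecycle, again assuming the package is required as se-scraper; the keyword is a placeholder:

// End-to-end sketch of the ScrapeManager lifecycle implemented above.
const { ScrapeManager } = require('se-scraper');

(async () => {
    const manager = new ScrapeManager({
        search_engine: 'google',
        num_pages: 1,
    });

    await manager.start();                         // launch the puppeteer cluster
    const { results, metadata } = await manager.scrape({
        keywords: ['nodejs rocks'],
    });
    console.dir(results, { depth: null });
    console.log('requests made:', metadata.num_requests);
    await manager.quit();                          // shut the cluster/browser down
})();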

test/html_output.js (new file, 101 lines)

@@ -0,0 +1,101 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('html_output', function(){
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Test html_output option
*/
it('html_output single page single keyword', async function () {
const scrape_job = {
search_engine: 'google',
/* TODO refactor start_url
google_settings: {
start_url: 'http://localhost:' + httpPort
},
*/
keywords: ['test keyword'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
html_output: true,
//clean_html_output: false,
//clean_data_images: false,
// TODO refactor start_url so we can use it instead of depending on the proxy for this test
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();
assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');
});
});
});

4 file diffs suppressed because one or more lines are too long


@@ -0,0 +1,148 @@
<!DOCTYPE html>
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
<meta name="HandheldFriendly" content="true"/>
<link rel="canonical" href="https://duckduckgo.com/">
<link rel="stylesheet" href="/s1847.css" type="text/css">
<link rel="stylesheet" href="/o1847.css" type="text/css">
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
<link rel="manifest" href="/manifest.json"/>
<meta name="twitter:card" content="summary">
<meta name="twitter:site" value="@duckduckgo">
<meta property="og:url" content="https://duckduckgo.com/" />
<meta property="og:site_name" content="DuckDuckGo" />
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">
<title>DuckDuckGo — Privacy, simplified.</title>
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />
<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
</head>
<body id="pg-index" class="page-index body--home">
<script type="text/javascript">
var settings_js_version = "/s2475.js",
locale = "en_US";
</script>
<script type="text/javascript" src="/lib/l113.js"></script>
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
<script type="text/javascript" src="/util/u418.js"></script>
<script type="text/javascript" src="/d2727.js"></script>
<script type="text/javascript">
DDG.page = new DDG.Pages.Home();
</script>
<div class="site-wrapper site-wrapper--home js-site-wrapper">
<div class="header-wrap--home js-header-wrap">
<div class="header--aside js-header-aside"></div>
<div class="js-header-home-search header-wrap--home__search">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>
<form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
<input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div class="search__hidden js-search-hidden"></div>
</form>
</div>
</div>
<div id="" class="content-wrap--home">
<div id="content_homepage" class="content--home">
<div class="cw--c">
<div class="logo-wrap--home">
<a id="logo_homepage_link" class="logo_homepage" href="/about">
About DuckDuckGo
<span class="logo_homepage__tt">Duck it!</span>
</a>
</div>
<div class="search-wrap--home">
<form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
<input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
<input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
<input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
<div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
</form>
</div>
<!-- en_US All Settings -->
<noscript>
<div class="tag-home">
<div class="tag-home__wrapper">
<div class="tag-home__item">
The search engine that doesn't track you.
<span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
</div>
</div>
</div>
</noscript>
<div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
<div id="error_homepage"></div>
</div> <!-- cw -->
</div> <!-- content_homepage //-->
</div> <!-- content_wrapper_homepage //-->
<div id="footer_homepage" class="foot-home js-foot-home"></div>
<script type="text/javascript">
{function seterr(str) {
var error=document.getElementById('error_homepage');
error.innerHTML=str;
$(error).css('display','block');
}
var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' &nbsp;Please try again');};
if (kurl) {
document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
}
</script>
</div> <!-- site-wrapper -->
</body>
</html>

7 file diffs suppressed because one or more lines are too long

test/modules/bing.js (new file, 123 lines)

@@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { BingScraper } = require('../../src/modules/bing');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res, next) => {
debug('q=%s', req.query.q);
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
describe('Module Bing', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
});
});
it('one keyword 3 pages', function () {
const bingScraper = new BingScraper({
config: {
search_engine_name: 'bing',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
bingScraper.STANDARD_TIMEOUT = 500;
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
});
});
});
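
All module tests in this change share the same harness: an Express app that serves stored SERP fixtures, and an http-mitm-proxy instance that rewrites every request made by Chromium (launched with --proxy-server) so it lands on the local fixture server instead of the real search engine. A minimal sketch of that rerouting step, assuming the same ports as in the tests (3012 for HTTP, 3013 for HTTPS, 3014 for the proxy):

const Proxy = require('http-mitm-proxy');

const proxy = Proxy();
proxy.onRequest((ctx, callback) => {
    // Only the connection target is rewritten; path, query and headers pass through
    // unchanged, so the Express fixture routes (e.g. /search) still match.
    ctx.proxyToServerRequestOptions.host = 'localhost';
    ctx.proxyToServerRequestOptions.port = ctx.isSSL ? 3013 : 3012; // httpsPort : httpPort
    return callback();
});
proxy.listen({ port: 3014 }, () => console.log('MITM proxy listening on :3014'));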

test/modules/duckduckgo.js Normal file

@@ -0,0 +1,140 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.use(express.urlencoded({ extended: true }));
fakeSearchEngine.get('/', (req, res, next) => {
if(!req.query.q){
return next();
}
debug('q=%s page=%d', req.query.q, req.query.page);
const pageNumber = req.query.page;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.post('/html', (req, res) => {
debug('body=%o', req.body);
const pageNumber = 1;
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']}));
describe('Module DuckDuckGo', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('proxy askedHost=%s method=%s url=%s toPort=%s',
ctx.clientToProxyRequest.headers.host,
ctx.clientToProxyRequest.method,
ctx.clientToProxyRequest.url,
ctx.proxyToServerRequestOptions.port
);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'duckduckgo',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});
it('one keyword 3 pages', function () {
this.timeout(4000);
const duckduckgoScraper = new DuckduckgoScraper({
config: {
search_engine_name: 'duckduckgo',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
debug('results page 1 %O',results['test keyword']['1'].results);
debug('results page 2 %O', results['test keyword']['2'].results);
assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
});
});
});
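
The DuckDuckGo fixture server above covers two entry points: the JavaScript SERP (GET / with q and page query parameters) and the no-JS HTML endpoint (POST /html, which always returns the first page). A small hypothetical helper, not part of the test suite, mirroring how those two routes resolve a request to a fixture file:

const path = require('path');

// Hypothetical helper: maps an incoming request to the fixture file the mock would serve.
function duckduckgoFixturePath(req) {
    if (req.method === 'POST' && req.path === '/html') {
        // no-JS endpoint: the posted query always gets page 1
        return path.join('test/mocks/duckduckgo', `${req.body.q}_page1.html`);
    }
    // JS endpoint: the page number is taken directly from the "page" query parameter
    return path.join('test/mocks/duckduckgo', `${req.query.q}_page${req.query.page}.html`);
}

module.exports = duckduckgoFixturePath;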

test/modules/google.js Normal file

@@ -0,0 +1,123 @@
'use strict';
const express = require('express');
const puppeteer = require('puppeteer');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const { GoogleScraper } = require('../../src/modules/google');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
describe('Module Google', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
return callback();
});
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
proxy.close();
httpsServer.close();
httpServer.close();
});
let browser;
let page;
beforeEach(async function(){
debug('Start a new browser');
browser = await puppeteer.launch({
//dumpio: true,
//headless: false,
ignoreHTTPSErrors: true,
args: [ '--proxy-server=http://localhost:' + proxyPort ]
});
debug('Open a fresh page');
page = await browser.newPage();
});
afterEach(async function(){
await browser.close();
});
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
it('one keyword one page', function(){
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 1, 'Must do one request');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
});
});
it('one keyword 3 pages', function () {
const googleScraper = new GoogleScraper({
config: {
search_engine_name: 'google',
throw_on_detection: true,
keywords: ['test keyword'],
logger: testLogger,
scrape_from_file: '',
num_pages: 3,
}
});
googleScraper.STANDARD_TIMEOUT = 500;
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
assert.strictEqual(num_requests, 3, 'Must do three requests');
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
});
});
});
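
The Google fixture route above derives the page number from Google's start offset (?start=0,10,20,..., ten organic results per page). Worked examples of that mapping, using the same expression as the route; note that start arrives as a string from req.query and the arithmetic relies on JavaScript's coercion:

const pageFromStart = (start) => ((start / 10) || 0) + 1;

console.log(pageFromStart(undefined)); // 1  (NaN || 0 -> 0, i.e. the first page)
console.log(pageFromStart('10'));      // 2  ('10' / 10 coerces to 1)
console.log(pageFromStart('20'));      // 3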

test/proxy.js Normal file

@@ -0,0 +1,161 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-proxy', (req, res) => {
debug('fake-search-engine req.hostname=%s', req.hostname);
//debug('req to', req.socket.localAddress, req.socket.localPort);
res.send(req.hostname);
});
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('proxies', function(){
class MockScraperTestProxy extends Scraper {
async load_start_page(){
return true;
}
async search_keyword(){
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
}
async parse_async(){
const bodyHandle = await this.page.$('body');
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
}
}
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Jobs will be executed two by two, alternating between the proxy and the direct connection.
* THIS TEST NEEDS the entry "127.0.0.1 test.local" in /etc/hosts because Chrome bypasses the proxy for localhost even when one is configured.
*/
it('one proxy given, use_proxies_only=false', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
// default is use_proxies_only: false,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'test.local');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'test.local');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
await scraper.quit();
});
/**
* Jobs will be executed 1 by 1 through the proxy
*/
it('one proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
await scraper.quit();
});
it('zero proxy given, use_proxies_only=true', async function () {
const scrape_job = {
search_engine: MockScraperTestProxy,
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
};
await assert.rejects(async () => {
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
use_proxies_only: true,
logger: testLogger,
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
await scraper.quit();
}, /Must provide at least one proxy in proxies if you enable use_proxies_only/);
});
});
});
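
The proxy tests above depend on a hosts entry because Chromium always bypasses a configured proxy for plain localhost: mapping test.local to 127.0.0.1 gives the direct-connection jobs a resolvable non-localhost name. A sketch of that setup and of the alternation the first test expects with a single proxy and use_proxies_only left at its default of false:

// Required /etc/hosts line (a setup assumption; the tests do not create it):
//   127.0.0.1   test.local

// With one proxy and use_proxies_only=false, jobs alternate between the direct
// connection (body "test.local") and the proxied one ("ProxiedThroughFakeEngine").
const expected = ['test.local', 'ProxiedThroughFakeEngine'];
['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard']
    .forEach((keyword, i) => console.log(keyword, '->', expected[i % 2]));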


@@ -1,203 +0,0 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
search_engine: 'bing',
compress: false,
debug: false,
verbose: false,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 3,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('normal_search_test()');
await se_scraper.scrape(config, normal_search_test_case);
}
// we test with a callback function to our handler
function normal_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 6);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'rank'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');
}
}
}
}
}
const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];
async function no_results_test() {
let config = {
search_engine: 'bing',
compress: false,
debug: false,
verbose: false,
keywords: keywords_no_results,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('no_results_test()');
await se_scraper.scrape(config, test_case_no_results);
}
// we test with a callback function to our handler
function test_case_no_results(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert(obj.results.length === 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'no results should be a empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
}
const effective_query_keywords = ['mount everrest'];
async function effective_query_test() {
let config = {
search_engine: 'bing',
compress: false,
debug: false,
verbose: false,
keywords: effective_query_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('effective_query_test()');
await se_scraper.scrape(config, test_case_effective_query);
}
// we test with a callback function to our handler
function test_case_effective_query(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
}
(async () => {
await normal_search_test();
await no_results_test();
await effective_query_test();
})();


@@ -1,145 +0,0 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
search_engine: 'duckduckgo',
compress: false,
debug: false,
verbose: false,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 2,
headless: false,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('normal_search_test()');
await se_scraper.scrape(config, normal_search_test_case);
}
// we test with a callback function to our handler
function normal_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 4);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');
}
}
}
}
}
const effective_query_keywords = ['mount everrest'];
async function effective_query_test() {
let config = {
search_engine: 'duckduckgo',
compress: false,
debug: false,
verbose: false,
keywords: effective_query_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('effective_query_test()');
await se_scraper.scrape(config, test_case_effective_query);
}
// we test with a callback function to our handler
function test_case_effective_query(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
}
(async () => {
await normal_search_test();
await effective_query_test();
})();


@@ -1,204 +0,0 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple tree', 'weather tomorrow'];
async function normal_search_test() {
let config = {
search_engine: 'google',
compress: false,
debug: false,
verbose: false,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 3,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('normal_search_test()');
await se_scraper.scrape(config, normal_search_test_case);
}
// we test with a callback function to our handler
function normal_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 6);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'visible_link must be ok');
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');
}
}
}
}
}
const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk',];
async function no_results_test() {
let config = {
search_engine: 'google',
compress: false,
debug: false,
verbose: false,
keywords: keywords_no_results,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('no_results_test()');
await se_scraper.scrape(config, test_case_no_results);
}
// we test with a callback function to our handler
function test_case_no_results(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
assert(obj.results.length === 0, 'results must have 0 SERP objects');
assert.equal(obj.no_results, true, 'no results should be true');
assert.isEmpty(obj.num_results, 'no results should be a empty string');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
}
const effective_query_keywords = ['mount evverrest'];
async function effective_query_test() {
let config = {
search_engine: 'google',
compress: false,
debug: false,
verbose: false,
keywords: effective_query_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('effective_query_test()');
await se_scraper.scrape(config, test_case_effective_query);
}
// we test with a callback function to our handler
function test_case_effective_query(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 1);
results = response.results;
for (let query in response.results) {
assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');
// effective query must be different to the original keyword
assert.isOk(obj.effective_query, 'effective query must be ok');
assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
assert(obj.effective_query !== query, 'effective query must be different from keyword');
assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
}
}
}
}
(async () => {
await normal_search_test();
await no_results_test();
await effective_query_test();
})();


@@ -1,85 +0,0 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const normal_search_keywords = ['apple', 'rain'];
async function normal_image_search_test() {
let config = {
search_engine: 'google_image',
compress: false,
debug: false,
verbose: false,
keywords: normal_search_keywords,
keyword_file: '',
num_pages: 2,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('normal_image_search_test()');
await se_scraper.scrape(config, normal_image_search_test_case);
}
// we test with a callback function to our handler
function normal_image_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
assert.equal(response.metadata.num_requests, 2);
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.clean_link, 'clean_link must be ok');
assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');
}
}
}
}
}
(async () => {
await normal_image_search_test();
})();


@@ -1,221 +0,0 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
const quote_search_keywords = ['MSFT', 'AAPL'];
async function reuters_search_test() {
let config = {
search_engine: 'reuters',
compress: false,
debug: false,
verbose: false,
keywords: quote_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('reuters_search_test()');
await se_scraper.scrape(config, reuters_search_test_case);
}
// we test with a callback function to our handler
function reuters_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
async function cnbc_search_test() {
let config = {
search_engine: 'cnbc',
compress: false,
debug: false,
verbose: false,
keywords: quote_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('cnbc_search_test()');
await se_scraper.scrape(config, cnbc_search_test_case);
}
// we test with a callback function to our handler
function cnbc_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
const marketwatch_search_keywords = ['MSFT'];
async function marketwatch_search_test() {
let config = {
search_engine: 'marketwatch',
compress: false,
debug: false,
verbose: false,
keywords: marketwatch_search_keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
console.log('marketwatch_search_test()');
await se_scraper.scrape(config, marketwatch_search_test_case);
}
// we test with a callback function to our handler
function marketwatch_search_test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
for (let query in response.results) {
let total_rank = 1;
assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');
for (let page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
assert.isOk(res.author, 'author must be ok');
assert.typeOf(res.author, 'string', 'author must be string');
assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');
assert.isOk(res.date, 'date must be ok');
assert.typeOf(res.date, 'string', 'date must be string');
assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
}
}
}
}
}
(async () => {
await reuters_search_test();
await cnbc_search_test();
await marketwatch_search_test();
})();

test/user_agent.js Normal file

@@ -0,0 +1,144 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');
const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');
const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;
const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-user_agent', (req, res) => {
debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
res.send(req.headers['user-agent']);
});
describe('Config', function(){
let httpServer, httpsServer, proxy;
before(async function(){
// Mount our fake engine behind both an HTTP and an HTTPS server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});
after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});
describe('user_agent', function(){
class MockScraperTestUserAgent extends Scraper {
async load_start_page(){
return true;
}
async search_keyword(){
await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
}
async parse_async(){
const bodyHandle = await this.page.$('body');
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
}
}
const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});
/**
* Test user_agent option
*/
it('fixed user_agent', async function () {
const scrape_job = {
search_engine: MockScraperTestUserAgent,
keywords: ['javascript is hard'],
};
var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
user_agent: 'THIS IS A USERAGENT 42.0'
});
await scraper.start();
const { results } = await scraper.scrape(scrape_job);
assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');
await scraper.quit();
});
/**
* Test random_user_agent option
* TODO: the generated user_agent should be different for each keyword
* TODO: this test will sometimes fail because the generated user agents are not very random :-(
*/
it('random_user_agent', async function () {
const scrape_job = {
search_engine: MockScraperTestUserAgent,
keywords: ['news'],
};
const NUMBER_OF_EXEC = 10;
const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
const scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
logger: testLogger,
random_user_agent: true,
});
await scraper.start();
const { results: { news } } = await scraper.scrape(scrape_job);
await scraper.quit();
return news['1'];
});
uaList.forEach((userAgent) => {
const uaParsed = UAParser(userAgent);
assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
assert(uaParsed.os.name, 'UserAgent should have a os name detected');
});
assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );
});
});
});
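
The random_user_agent test above ends with a rough uniformity check: each of the ten generated user agents must parse to a known browser and OS, and no single user agent may account for 40% or more of the runs. A small sketch of that frequency check in isolation, using the same lodash chain as the assertion:

const _ = require('lodash');

// Share of the most frequent element in a list.
function mostFrequentShare(list) {
    const [, count] = _.chain(list).countBy().toPairs().sortBy(pair => pair[1]).last().value();
    return count / list.length;
}

console.log(mostFrequentShare(['ua-a', 'ua-b', 'ua-a', 'ua-c'])); // 0.5  -> would fail the 40% limit
console.log(mostFrequentShare(['ua-a', 'ua-b', 'ua-c', 'ua-d'])); // 0.25 -> passes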