forked from extern/se-scraper
Compare commits
106 Commits
add-code-o ... master
Commit SHA1 hashes:

5a0eea201d, 0278b24f0d, 33fa371716, 6b806dedfe, 5633b10e50, c58d4fa74d, 4f467abf1e, 89dc5c3ebb,
4b33ef9b19, 28332528ea, b685fb4def, 394b567db6, cac6b87e92, 1c1db88545, 8f6317cea7, f192e4ebb4,
3ab8e46126, 392c43390e, 77c1bb8372, 8f40057534, 301695cd2b, d362e4ae2c, bcd181111b, b4a86fcc51,
9e6a555663, ca9f5f7f50, 1694ee92d0, da69913272, 4a3a0e6fd4, 4953d9da7a, 5e47c27c70, 95a5ee56d8,
52a2ec7b33, 07f3dceba1, b25f7a4285, 4b581bd03f, 21378dab02, 77d6c4f04a, b513bb0f5b, 855a874f9e,
dde1711d9d, 7ba7ee9226, e661241f6f, 98414259fe, 19a172c654, 0f7e89c272, ca941cee45, 4c77aeba76,
0427d9f915, 87fcdd35d5, 4ca50ab2b9, 8e629f6266, a369bd07f9, dde2b14fc0, 0db6e068da, 50bda275a6,
a61fade2c9, 78fe12390b, fcbe66b56b, 59154694f2, 60a9d52924, 1fc7f0d1c8, baaff5824e, dab25f9068,
a413cb54ef, bbebe3ce60, 09c1255400, 5e8ff1cb34, c1a036e8da, d1e9b21269, 593f3a95e5, d9ac9f4162,
a0e63aa4b0, a3ebe357a4, 0d7f6dcd11, 80d23a9d57, ebe9ba8ea9, caa93df3b0, 0c9f353cb2, 43d5732de7,
06d500f75c, 784e887787, db5fbb23d2, 5bf7c94b9a, d4d06f7d67, 35943e7449, 7e06944fa1, 6825c97790,
3d69f4e249, 1593759556, 775dcfa077, b82c769bb1, 1bed9c5854, 7a8c6f13f0, 51d617442d, dd1f36076e,
62b3b688b4, 7b52b4e62f, 7239e23cba, 8cbf37eaba, abf4458e46, 79d32a315a, 089e410ec6, 393b9c0450,
fb3f2836e4, 53c9ebf467

.gitignore (vendored), 18 changed lines

```
@@ -1,3 +1,19 @@
# ignore static tests

test/static_tests/html/
test/static_tests/html/*

.idea

# ignore data

examples/data/
examples/data/*

examples/results/
examples/results/*


# Logs
logs
*.log

@@ -63,3 +79,5 @@ typings/

.idea/
GoogleScraperPup.iml

.http-mitm-proxy
```

.gitmodules (vendored), new file, 0 lines

Dockerfile, new file, 73 lines

```dockerfile
FROM node:10-slim

# Application parameters and variables
# ENV NODE_ENV=production
ENV HOST=0.0.0.0
ENV PORT=3000
ENV application_directory=/se-scraper
ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster

# Create app directory
WORKDIR $application_directory

RUN apt-get update && \
    apt-get install -y \
    gconf-service \
    libasound2 \
    libatk1.0-0 \
    libc6 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libexpat1 \
    libfontconfig1 \
    libgcc1 \
    libgconf-2-4 \
    libgdk-pixbuf2.0-0 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnspr4 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libstdc++6 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxcursor1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxrandr2 \
    libxrender1 \
    libxss1 \
    libxtst6 \
    ca-certificates \
    fonts-liberation \
    libappindicator1 \
    libnss3 \
    lsb-release \
    xdg-utils \
    wget

# Bundle app source
COPY . .
WORKDIR $puppeteer_cluster_directory
RUN npm install \
    && npm run build

WORKDIR $application_directory
# skip installing scripts for puppeteer dependencies
# we've already installed puppeteer above.
RUN npm install --ignore-scripts

# Cleanup
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.2/dumb-init_1.2.2_amd64 /usr/local/bin/dumb-init
RUN chmod +x /usr/local/bin/dumb-init

EXPOSE $PORT

CMD ["dumb-init", "node", "server/server.js"]
```
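
The container starts `server/server.js` on port 3000 (see `EXPOSE $PORT` and the `CMD` line). Besides the `curl` call shown in the README's Docker section further down, a job can also be submitted from Node. The following is only a sketch: it assumes a container listening on `localhost:3000` and reuses the JSON body format from that README section; host, port and payload values are illustrative.

```js
// Sketch: POST a scrape job to a running se-scraper container (assumed to listen on localhost:3000).
const http = require('http');

const payload = JSON.stringify({
    browser_config: { random_user_agent: true },
    scrape_config: { search_engine: 'google', keywords: ['test'], num_pages: 1 },
});

const req = http.request({
    host: 'localhost',
    port: 3000,
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
}, (res) => {
    let body = '';
    res.on('data', (chunk) => body += chunk);
    res.on('end', () => console.dir(JSON.parse(body), { depth: null, colors: true }));
});

req.on('error', console.error);
req.write(payload);
req.end();
```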

LICENSE, new file, 201 lines

```text
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2019 Nikolai Tschacher

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

README.md, 625 lines changed

@@ -1,78 +1,307 @@
# Search Engine Scraper
# [The maintained successor of se-scraper is the general purpose crawling infrastructure](https://github.com/NikolaiT/Crawling-Infrastructure)

This node module supports scraping several search engines.
## Search Engine Scraper - se-scraper

Right now scraping the search engines
[npm](https://www.npmjs.com/package/se-scraper)
[Donate](https://www.paypal.me/incolumitas)
[Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper)

This node module allows you to scrape search engines concurrently with different proxies.

If you don't have extensive technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/).

#### Table of Contents
- [Installation](#installation)
- [Docker](#docker-support)
- [Minimal Example](#minimal-example)
- [Quickstart](#quickstart)
- [Contribute](#contribute)
- [Using Proxies](#proxies)
- [Custom Scrapers](#custom-scrapers)
- [Examples](#examples)
- [Scraping Model](#scraping-model)
- [Technical Notes](#technical-notes)
- [Advanced Usage](#advanced-usage)
- [Special Query String Parameters for Search Engines](#query-string-parameters)

Se-scraper supports the following search engines:
* Google
* Google News
* Google News App version (https://news.google.com)
* Google Image
* Bing
* Baidu
* Youtube
* Bing News
* Infospace
* Duckduckgo
* Yandex
* Webcrawler

is supported.
This module uses puppeteer and a modified version of [puppeteer-cluster](https://github.com/thomasdondorf/puppeteer-cluster/). It was created by the developer of [GoogleScraper](https://github.com/NikolaiT/GoogleScraper), a module with 1800 stars on Github.

Additionally **se-scraper** supports investment ticker search from the following sites:
## Installation

* Reuters
* cnbc
* Marketwatch
You need a working installation of **node** and the **npm** package manager.

This module uses puppeteer. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 stars on Github.

### Quickstart

**Note**: If you don't want puppeteer to download a complete chromium browser, add this variable to your environment:
For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands:

```bash
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
sudo apt update;

sudo apt install nodejs;

# recent version of npm
curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh;
sudo bash nodesource_setup.sh;
sudo apt install npm;
```

Then install with
Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-).

This command will install dependencies:

```bash
# install all that is needed by the chromium browser. Maybe not everything is needed
sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget;
```

Install **se-scraper** by entering the following command in your terminal:

```bash
npm install se-scraper
```

then create a file with the following contents and start scraping.
If you **don't** want puppeteer to download a complete chromium browser, add this variable to your environment. Then this module is not guaranteed to run out of the box.

```bash
export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1
```

### Docker Support

I will maintain a public docker image of se-scraper. Pull the docker image with the command:

```bash
docker pull tschachn/se-scraper
```

Confirm that the docker image was correctly pulled:

```bash
docker image ls
```

It should show something like this:

```
tschachn/se-scraper   latest   897e1aeeba78   21 minutes ago   1.29GB
```

You can check the [latest tag here](https://hub.docker.com/r/tschachn/se-scraper/tags). In the example below, the latest tag is **latest**. This will most likely remain **latest** in the future.

Run the docker image and map the internal port 3000 to the external
port 3000:

```bash
$ docker run -p 3000:3000 tschachn/se-scraper:latest

Running on http://0.0.0.0:3000
```

When the image is running, you may start scrape jobs via the HTTP API:

```bash
curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \
-d '{
    "browser_config": {
        "random_user_agent": true
    },
    "scrape_config": {
        "search_engine": "google",
        "keywords": ["test"],
        "num_pages": 1
    }
}'
```

Many thanks go to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image.


## Minimal Example

Create a file named `minimal.js` with the following contents

```js
const se_scraper = require('se-scraper');

let config = {
    search_engine: 'google',
    debug: false,
    verbose: false,
    keywords: ['news', 'scraping scrapeulous.com'],
    num_pages: 3,
    output_file: 'data.json',
};
(async () => {
    let scrape_job = {
        search_engine: 'google',
        keywords: ['lets go boys'],
        num_pages: 1,
    };

    function callback(err, response) {
        if (err) { console.error(err) }
        console.dir(response, {depth: null, colors: true});
    }
    var results = await se_scraper.scrape({}, scrape_job);

    se_scraper.scrape(config, callback);
    console.dir(results, {depth: null, colors: true});
})();
```

### Technical Notes
Start scraping by firing up the command `node minimal.js`
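
Because the hunk above interleaves the old callback-based snippet (`se_scraper.scrape(config, callback)`) with the new promise-based one, the pasted code will not run as shown. A consolidated sketch of just the new minimal example, assembled from the added lines above, would be:

```js
// Sketch of the new, promise-based minimal example (assembled from the added lines in the hunk above).
const se_scraper = require('se-scraper');

(async () => {
    let scrape_job = {
        search_engine: 'google',
        keywords: ['lets go boys'],
        num_pages: 1,
    };

    // an empty browser_config uses the defaults
    var results = await se_scraper.scrape({}, scrape_job);
    console.dir(results, { depth: null, colors: true });
})();
```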

## Quickstart

Create a file named `run.js` with the following contents

```js
const se_scraper = require('se-scraper');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/data.json',
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'se-scraper'],
        num_pages: 1,
        // add some cool google search settings
        google_settings: {
            gl: 'us', // The gl parameter determines the Google country to use for the query.
            hl: 'en', // The hl parameter determines the Google UI language to return results.
            start: 0, // Determines the results offset to use, defaults to 0.
            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
```

Start scraping by firing up the command `node run.js`
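
The `ScrapeManager` started above can also be reused for several scrape jobs before it is quit, which is what the "Reuse existing browser" example listed further down (`examples/multiple_search_engines.js`) does. A rough sketch of that pattern, with illustrative engines and keywords:

```js
// Sketch: run several scrape jobs against the same ScrapeManager before quitting.
const se_scraper = require('se-scraper');

(async () => {
    var scraper = new se_scraper.ScrapeManager({ debug_level: 1 });
    await scraper.start();

    for (let search_engine of ['google', 'bing', 'baidu']) {
        let results = await scraper.scrape({
            search_engine: search_engine,
            keywords: ['news'], // illustrative keyword
            num_pages: 1,
        });
        console.dir(results, { depth: null, colors: true });
    }

    await scraper.quit();
})();
```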

## Contribute

I really need and love your help! However, scraping is a dirty business and it often takes me a lot of time to find failing selectors or missing JS logic. So if any search engine does not yield the results of your liking, please create a **static test case** similar to [this static test of google](test/static_tests/google.js) that fails. I will try to correct se-scraper then.

That's how you would proceed:

1. Copy the [static google test case](test/static_tests/google.js)
2. Remove all unnecessary testing code
3. Save a search to file where se-scraper does not work correctly.
4. Implement the static test case using the saved search html where se-scraper currently fails.
5. Submit a new issue with the failing test case as pull request
6. I will fix it! (or better: you submit a pull request directly)

## Proxies

**se-scraper** will create one browser instance per proxy. So the maximal amount of concurrency is equivalent to the number of proxies plus one (your own IP).

```js
const se_scraper = require('se-scraper');

(async () => {
    let browser_config = {
        debug_level: 1,
        output_file: 'examples/results/proxyresults.json',
        proxy_file: '/home/nikolai/.proxies', // one proxy per line
        log_ip_address: true,
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();

    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});
    await scraper.quit();
})();
```

With a proxy file such as

```text
socks5://53.34.23.55:55523
socks4://51.11.23.22:22222
```

This will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab. Chromium does not support that.
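
If the scraper should never fall back to your own IP address, the `use_proxies_only` flag that appears in the Advanced Usage configuration further down can be combined with `proxy_file`. A small sketch (the proxy file path is illustrative):

```js
// Sketch: scrape exclusively through proxies, never through the default IP.
let browser_config = {
    proxy_file: '/home/nikolai/.proxies', // one proxy per line, as above
    // when this is set to true, se-scraper will not use your default IP address
    use_proxies_only: true,
    log_ip_address: true,
};
```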


## Custom Scrapers

You can define your own scraper class and use it within se-scraper.

[Check out this example](examples/custom_scraper.js) that defines a custom scraper for Ecosia.


## Examples

* [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json)
* [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json)
* [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json)
* [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json)
* [Inject your own scraping logic](examples/pluggable.js)
* [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js)
* [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json)


## Scraping Model

**se-scraper** scrapes search engines only. In order to introduce concurrency into this library, it is necessary to define the scraping model. Then we can decide how we divide and conquer.

#### Scraping Resources

What are common scraping resources?

1. **Memory and CPU**. Necessary to launch multiple browser instances.
2. **Network Bandwidth**. It is not often the bottleneck.
3. **IP Addresses**. Websites often block IP addresses after a certain amount of requests from the same IP address. This can be circumvented by using proxies.
4. Spoofable identifiers such as browser fingerprint or user agents. Those will be handled by **se-scraper**.

#### Concurrency Model

**se-scraper** should be able to run without any concurrency at all. This is the default case. No concurrency means only one browser/tab is searching at a time.

For concurrent use, we will make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster).

One scrape job is properly defined by

* 1 search engine such as `google`
* `M` pages
* `N` keywords/queries
* `K` proxies and `K+1` browser instances (because when we have no proxies available, we will scrape with our dedicated IP)

Then **se-scraper** will create `K+1` dedicated browser instances, each with a unique IP address. Each browser will get `N/(K+1)` keywords and will issue `N/(K+1) * M` total requests to the search engine.
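
As a concrete, purely illustrative instance of this formula: with `N = 100` keywords, `K = 4` proxies and `M = 2` pages, se-scraper launches `K+1 = 5` browser instances, assigns `100/5 = 20` keywords to each, and each browser issues `20 * 2 = 40` requests to the search engine.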

The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequent new browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings. Right now, every browser has the same options. It's not possible to set options on a per-browser basis.

Solution:

1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678).
2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and then pop() from this list at every new call to `workerInstance()` in https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts. I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this**.


## Technical Notes

Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.

No multithreading is supported for now. Only one scraping worker per `scrape()` call.

We will soon support parallelization. **se-scraper** will support an architecture similar to:

1. https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html
2. https://docs.browserless.io/blog/2018/06/04/puppeteer-best-practices.html

If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at hire@incolumitas.com
If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at **hire@incolumitas.com**

The chromium browser is started with the following flags to prevent
scraping detection.

@@ -90,11 +319,12 @@ var ADDITIONAL_CHROME_FLAGS = [

```js
    '--disable-gpu',
    '--window-size=1920x1080',
    '--hide-scrollbars',
    '--disable-notifications',
];
```

Furthermore, to avoid loading unnecessary resources and to speed up
scraping a great deal, we instruct chrome to not load images and css:
scraping a great deal, we instruct chrome to not load images and css and media:

```js
await page.setRequestInterception(true);

@@ -109,10 +339,11 @@ page.on('request', (req) => {
});
```
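
The hunk above omits the middle of the `page.on('request', ...)` handler. For illustration only, and not copied from se-scraper's source, the usual puppeteer request-interception pattern the text describes looks roughly like this:

```js
// Illustrative sketch of the asset-blocking handler described above (not the exact code from src/).
await page.setRequestInterception(true);

page.on('request', (req) => {
    const blocked = ['image', 'stylesheet', 'font', 'media'];
    if (blocked.includes(req.resourceType())) {
        req.abort();
    } else {
        req.continue();
    }
});
```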

### Making puppeteer and headless chrome undetectable
#### Making puppeteer and headless chrome undetectable

Consider the following resources:

* https://antoinevastel.com/bot%20detection/2019/07/19/detecting-chrome-headless-v3.html
* https://intoli.com/blog/making-chrome-headless-undetectable/
* https://intoli.com/blog/not-possible-to-block-chrome-headless/
* https://news.ycombinator.com/item?id=16179602

@@ -136,19 +367,20 @@ let config = {

It will create a screenshot named `headless-test-result.png` in the directory where the scraper was started that shows whether all tests have passed.
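
The screenshot is produced by the `test_evasion` switch that appears in the configuration listings below. A minimal sketch of turning it on; the listings below show the flag in both the browser and the scrape configuration, so treat the placement here as illustrative:

```js
// Sketch: enable the quick headless-detection self-test described above.
let browser_config = {
    // check if headless chrome escapes common detection techniques
    // this is a quick test and should be used for debugging
    test_evasion: true,
    apply_evasion_techniques: true,
};
```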

### Advanced Usage
## Advanced Usage

Use se-scraper by calling it with a script such as the one below.
Use **se-scraper** by calling it with a script such as the one below.

```js
const se_scraper = require('se-scraper');
const resolve = require('path').resolve;

let config = {
// those options need to be provided on startup
// and cannot be given to se-scraper on scrape() calls
let browser_config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: true,
    random_user_agent: false,
    // whether to select manual settings in visible mode
    set_manual_settings: false,
    // log ip address data

@@ -157,19 +389,29 @@
    log_http_headers: false,
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,1]',
    sleep_range: '',
    // which search engine to scrape
    search_engine: 'google',
    compress: false, // compress
    debug: false,
    verbose: false,
    keywords: ['scrapeulous.com'],
    // whether debug information should be printed
    // level 0: print nothing
    // level 1: print most important info
    // ...
    // level 4: print all shit nobody wants to know
    debug_level: 1,
    keywords: ['nodejs rocks',],
    // whether to start the browser in headless mode
    headless: true,
    // specify flags passed to chrome here
    chrome_flags: [],
    // the number of pages to scrape for each keyword
    num_pages: 1,
    // path to output file, data will be stored in JSON
    output_file: '',
    // whether to also passthru all the html output of the serp pages
    html_output: false,
    // whether to return a screenshot of serp pages as b64 data
    screen_output: false,
    // whether to prevent images, css, fonts and media from being loaded
    // will speed up scraping a great deal
    block_assets: true,

@@ -178,223 +420,92 @@
    // get_browser, handle_metadata, close_browser
    //custom_func: resolve('examples/pluggable.js'),
    custom_func: '',
    throw_on_detection: false,
    // use a proxy for all connections
    // example: 'socks5://78.94.172.42:1080'
    // example: 'http://118.174.233.10:48400'
    proxy: '',
    // a file with one proxy per line. Example:
    // socks5://78.94.172.42:1080
    // http://118.174.233.10:48400
    proxy_file: '',
    // whether to use proxies only
    // when this is set to true, se-scraper will not use
    // your default IP address
    use_proxies_only: false,
    // check if headless chrome escapes common detection techniques
    // this is a quick test and should be used for debugging
    test_evasion: false,
    apply_evasion_techniques: true,
    // settings for puppeteer-cluster
    puppeteer_cluster_config: {
        timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
        monitor: false,
        concurrency: Cluster.CONCURRENCY_BROWSER,
        maxConcurrency: 1,
    }
};

function callback(err, response) {
    if (err) { console.error(err) }
(async () => {
    // scrape config can change on each scrape() call
    let scrape_config = {
        // which search engine to scrape
        search_engine: 'google',
        // an array of keywords to scrape
        keywords: ['cat', 'mouse'],
        // the number of pages to scrape for each keyword
        num_pages: 2,

    /* response object has the following properties:
        // OPTIONAL PARAMS BELOW:
        google_settings: {
            gl: 'us', // The gl parameter determines the Google country to use for the query.
            hl: 'fr', // The hl parameter determines the Google UI language to return results.
            start: 0, // Determines the results offset to use, defaults to 0.
            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
        // instead of keywords you can specify a keyword_file. this overwrites the keywords array
        keyword_file: '',
        // how long to sleep between requests. a random sleep interval within the range [a,b]
        // is drawn before every request. empty string for no sleeping.
        sleep_range: '',
        // path to output file, data will be stored in JSON
        output_file: 'output.json',
        // whether to prevent images, css, fonts from being loaded
        // will speed up scraping a great deal
        block_assets: false,
        // check if headless chrome escapes common detection techniques
        // this is a quick test and should be used for debugging
        test_evasion: false,
        apply_evasion_techniques: true,
        // log ip address data
        log_ip_address: false,
        // log http headers
        log_http_headers: false,
    };

    response.results - json object with the scraping results
    response.metadata - json object with metadata information
    response.statusCode - status code of the scraping process
    */
    let results = await se_scraper.scrape(browser_config, scrape_config);
    console.dir(results, {depth: null, colors: true});
})();
```

console.dir(response.results, {depth: null, colors: true});
[Output for the above script on my machine.](examples/results/advanced.json)
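
The returned object is keyed by keyword and then by page number, as the example output further down shows. A small sketch of walking over it, with field names taken from that output; depending on the API version the structure is either the return value of `scrape()` itself or its `.results` property:

```js
// Sketch: walk the keyword -> page -> SERP structure shown in the example output below.
let by_keyword = results.results || results;
for (let keyword in by_keyword) {
    for (let page in by_keyword[keyword]) {
        let serp = by_keyword[keyword][page];
        console.log(keyword, 'page', page, serp.num_results);
        for (let entry of serp.results) {
            console.log(`#${entry.rank}: ${entry.title} -> ${entry.link}`);
        }
    }
}
```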

### Query String Parameters

You can add your custom query string parameters to the configuration object by specifying a `google_settings` key. In general: `{{search engine}}_settings`.

For example you can customize your google search with the following config:

```js
let scrape_config = {
    search_engine: 'google',
    // use specific search engine parameters for various search engines
    google_settings: {
        google_domain: 'google.com',
        gl: 'us', // The gl parameter determines the Google country to use for the query.
        hl: 'us', // The hl parameter determines the Google UI language to return results.
        start: 0, // Determines the results offset to use, defaults to 0.
        num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
    },
}

se_scraper.scrape(config, callback);
```

Supported options for the `search_engine` config key:

```javascript
'google'
'google_news_old'
'google_news'
'google_image'
'bing'
'bing_news'
'infospace'
'webcrawler'
'baidu'
'youtube'
'duckduckgo_news'
'reuters'
'cnbc'
'marketwatch'
```
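
The same `{{search engine}}_settings` pattern works for the other engines. For Bing, the `examples/bing_multiple_browser_multiple_pages.js` file added later in this diff uses `bing_settings`; a trimmed sketch of that part (the keyword is illustrative):

```js
// Sketch: Bing-specific query string parameters, taken from the Bing example in this diff.
let scrape_config = {
    search_engine: 'bing',
    keywords: ['test'], // illustrative keyword
    num_pages: 1,
    // https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters
    bing_settings: {
        cc: 'DE',     // The cc parameter determines the country to use for the query.
        mkt: 'de-DE', // The mkt parameter determines the UI language to return results.
        offset: 0,    // Determines the results offset to use, defaults to 0.
        count: 20,    // Determines the number of results to show, defaults to 10. Maximum is 100.
    },
};
```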

Output for the above script on my machine:

```text
{ 'scraping scrapeulous.com':
   { '1':
      { time: 'Tue, 29 Jan 2019 21:39:22 GMT',
        num_results: 'Ungefähr 145 Ergebnisse (0,18 Sekunden) ',
        no_results: false,
        effective_query: '',
        results:
         [ { link: 'https://scrapeulous.com/',
             title: 'Scrapeuloushttps://scrapeulous.com/Im CacheDiese Seite übersetzen',
             snippet: 'Scrapeulous.com allows you to scrape various search engines automatically ... or to find hidden links, Scrapeulous.com enables you to scrape a ever increasing ...',
             visible_link: 'https://scrapeulous.com/',
             date: '',
             rank: 1 },
           { link: 'https://scrapeulous.com/about/',
             title: 'About - Scrapeuloushttps://scrapeulous.com/about/Im CacheDiese Seite übersetzen',
             snippet: 'Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...',
             visible_link: 'https://scrapeulous.com/about/',
             date: '',
             rank: 2 },
           { link: 'https://scrapeulous.com/howto/',
             title: 'Howto - Scrapeuloushttps://scrapeulous.com/howto/Im CacheDiese Seite übersetzen',
             snippet: 'We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...',
             visible_link: 'https://scrapeulous.com/howto/',
             date: '',
             rank: 3 },
           { link: 'https://github.com/NikolaiT/se-scraper',
             title: 'GitHub - NikolaiT/se-scraper: Javascript scraping module based on ...https://github.com/NikolaiT/se-scraperIm CacheDiese Seite übersetzen',
             snippet: '24.12.2018 - Javascript scraping module based on puppeteer for many different search ... for many different search engines... https://scrapeulous.com/.',
             visible_link: 'https://github.com/NikolaiT/se-scraper',
             date: '24.12.2018 - ',
             rank: 4 },
           { link: 'https://github.com/NikolaiT/GoogleScraper/blob/master/README.md',
             title: 'GoogleScraper/README.md at master · NikolaiT/GoogleScraper ...https://github.com/NikolaiT/GoogleScraper/blob/.../README.mdIm CacheÄhnliche SeitenDiese Seite übersetzen',
             snippet: 'GoogleScraper - Scraping search engines professionally. Scrapeulous.com - Scraping Service. GoogleScraper is a open source tool and will remain a open ...',
             visible_link: 'https://github.com/NikolaiT/GoogleScraper/blob/.../README.md',
             date: '',
             rank: 5 },
           { link: 'https://googlescraper.readthedocs.io/',
             title: 'Welcome to GoogleScraper\'s documentation! — GoogleScraper ...https://googlescraper.readthedocs.io/Im CacheDiese Seite übersetzen',
             snippet: 'Welcome to GoogleScraper\'s documentation!¶. Contents: GoogleScraper - Scraping search engines professionally · Scrapeulous.com - Scraping Service ...',
             visible_link: 'https://googlescraper.readthedocs.io/',
             date: '',
             rank: 6 },
           { link: 'https://incolumitas.com/pages/scrapeulous/',
             title: 'Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitashttps://incolumitas.com/pages/scrapeulous/Im CacheDiese Seite übersetzen',
             snippet: 'A scraping service for scientists, marketing professionals, analysts or SEO folk. In autumn 2018, I created a scraping service called scrapeulous.com. There you ...',
             visible_link: 'https://incolumitas.com/pages/scrapeulous/',
             date: '',
             rank: 7 },
           { link: 'https://incolumitas.com/',
             title: 'Coding, Learning and Business Ideashttps://incolumitas.com/Im CacheDiese Seite übersetzen',
             snippet: 'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
             visible_link: 'https://incolumitas.com/',
             date: '',
             rank: 8 },
           { link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
             title: 'Search engine scraping - Wikipediahttps://en.wikipedia.org/wiki/Search_engine_scrapingIm CacheDiese Seite übersetzen',
             snippet: 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
             visible_link: 'https://en.wikipedia.org/wiki/Search_engine_scraping',
             date: '',
             rank: 9 },
           { link: 'https://readthedocs.org/projects/googlescraper/downloads/pdf/latest/',
             title: 'GoogleScraper Documentation - Read the Docshttps://readthedocs.org/projects/googlescraper/downloads/.../latest...Im CacheDiese Seite übersetzen',
             snippet: '23.12.2018 - Contents: 1 GoogleScraper - Scraping search engines professionally. 1. 1.1 ... For this reason, I created the web service scrapeulous.com.',
             visible_link: 'https://readthedocs.org/projects/googlescraper/downloads/.../latest...',
             date: '23.12.2018 - ',
             rank: 10 } ] },
     '2':
      { time: 'Tue, 29 Jan 2019 21:39:24 GMT',
        num_results: 'Seite 2 von ungefähr 145 Ergebnissen (0,20 Sekunden) ',
        no_results: false,
        effective_query: '',
        results:
         [ { link: 'https://pypi.org/project/CountryGoogleScraper/',
             title: 'CountryGoogleScraper · PyPIhttps://pypi.org/project/CountryGoogleScraper/Im CacheDiese Seite übersetzen',
             snippet: 'A module to scrape and extract links, titles and descriptions from various search ... Look [here to get an idea how to use asynchronous mode](http://scrapeulous.',
             visible_link: 'https://pypi.org/project/CountryGoogleScraper/',
             date: '',
             rank: 1 },
           { link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
             title: 'scrapeulous intro - YouTubehttps://www.youtube.com/watch?v=a6xn6rc9GbIDiese Seite übersetzen',
             snippet: 'scrapeulous intro. Scrapeulous Scrapeulous. Loading... Unsubscribe from ... on Dec 16, 2018. Introduction ...',
             visible_link: 'https://www.youtube.com/watch?v=a6xn6rc9GbI',
             date: '',
             rank: 3 },
           { link: 'https://www.reddit.com/r/Python/comments/2tii3r/scraping_260_search_queries_in_bing_in_a_matter/',
             title: 'Scraping 260 search queries in Bing in a matter of seconds using ...https://www.reddit.com/.../scraping_260_search_queries_in_bing...Im CacheDiese Seite übersetzen',
             snippet: '24.01.2015 - Scraping 260 search queries in Bing in a matter of seconds using asyncio and aiohttp. (scrapeulous.com). submitted 3 years ago by ...',
             visible_link: 'https://www.reddit.com/.../scraping_260_search_queries_in_bing...',
             date: '24.01.2015 - ',
             rank: 4 },
           { link: 'https://twitter.com/incolumitas_?lang=de',
             title: 'Nikolai Tschacher (@incolumitas_) | Twitterhttps://twitter.com/incolumitas_?lang=deIm CacheÄhnliche SeitenDiese Seite übersetzen',
             snippet: 'Learn how to scrape millions of url from yandex and google or bing with: http://scrapeulous.com/googlescraper-market-analysis.html … 0 replies 0 retweets 0 ...',
             visible_link: 'https://twitter.com/incolumitas_?lang=de',
             date: '',
             rank: 5 },
           { link: 'http://blog.shodan.io/hostility-in-the-python-package-index/',
             title: 'Hostility in the Cheese Shop - Shodan Blogblog.shodan.io/hostility-in-the-python-package-index/Im CacheDiese Seite übersetzen',
             snippet: '22.02.2015 - https://zzz.scrapeulous.com/r? According to the author of the website, these hostile packages are used as honeypots. Honeypots are usually ...',
             visible_link: 'blog.shodan.io/hostility-in-the-python-package-index/',
             date: '22.02.2015 - ',
             rank: 6 },
           { link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
             title: 'NikolaiT/GoogleScraper - Libraries.iohttps://libraries.io/github/NikolaiT/GoogleScraperIm CacheDiese Seite übersetzen',
             snippet: 'A Python module to scrape several search engines (like Google, Yandex, Bing, ... https://scrapeulous.com/ ... You can install GoogleScraper comfortably with pip:',
             visible_link: 'https://libraries.io/github/NikolaiT/GoogleScraper',
             date: '',
             rank: 7 },
           { link: 'https://pydigger.com/pypi/CountryGoogleScraper',
             title: 'CountryGoogleScraper - PyDiggerhttps://pydigger.com/pypi/CountryGoogleScraperDiese Seite übersetzen',
             snippet: '19.10.2016 - Look [here to get an idea how to use asynchronous mode](http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html). ### Table ...',
             visible_link: 'https://pydigger.com/pypi/CountryGoogleScraper',
             date: '19.10.2016 - ',
             rank: 8 },
           { link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
             title: 'cimenx/data-mining-penandtest - Docker Hubhttps://hub.docker.com/r/cimenx/data-mining-penandtest/Im CacheDiese Seite übersetzen',
             snippet: 'Container. OverviewTagsDockerfileBuilds · http://scrapeulous.com/googlescraper-260-keywords-in-a-second.html. Docker Pull Command. Owner. profile ...',
             visible_link: 'https://hub.docker.com/r/cimenx/data-mining-penandtest/',
             date: '',
             rank: 9 },
           { link: 'https://www.revolvy.com/page/Search-engine-scraping',
             title: 'Search engine scraping | Revolvyhttps://www.revolvy.com/page/Search-engine-scrapingIm CacheDiese Seite übersetzen',
             snippet: 'Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines such as Google, Bing or Yahoo. This is a ...',
             visible_link: 'https://www.revolvy.com/page/Search-engine-scraping',
             date: '',
             rank: 10 } ] } } }
```

TODO.md, new file, 88 lines

### 24.12.2018
- fix interface to scrape() [DONE]
- add to Github


### 24.1.2018
- fix issue #3: add functionality to add keyword file

### 27.1.2019
- Add functionality to block images and CSS from loading as described here:
  https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
  https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/

### 29.1.2019
- implement proxy support functionality
- implement proxy check

- implement scraping more than 1 page
  - do it for google
  - and bing
- implement duckduckgo scraping


### 30.1.2019
- modify all scrapers to use the generic class where it makes sense
  - Bing, Baidu, Google, Duckduckgo

### 7.2.2019
- add num_requests to test cases [done]

### 25.2.2019
- https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html
- add support for browsing with multiple browsers, use this neat library:
  - https://github.com/thomasdondorf/puppeteer-cluster [done]


### 28.2.2019
- write test case for multiple browsers/proxies
- write test case and example for multiple tabs with bing
- make README.md nicer. https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template


### 11.6.2019
- TODO: fix amazon scraping
- change api of remaining test cases [done]
- TODO: implement custom search engine parameters on scrape()

### 12.6.2019
- remove unnecessary sleep() calls and replace with waitFor selectors


### 16.7.2019

- resolve issues
  - fix this https://github.com/NikolaiT/se-scraper/issues/37 [done]

- use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth

- we will need to look at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now]

- use random user agents plugin: https://github.com/intoli/user-agents [done]

- add screenshot capability (take the screenshot after parsing)
  - store as b64 [done]


### 12.8.2019

- add static test case for bing [done]
- add options that minimize the `html_output`:
  `clean_html_output` will remove all JS and CSS from the html
  `clean_data_images` removes all data images from the html
  [done]


### 13.8.2019
- Write test case for clean html output [done]
- Consider better compression algorithm. [done] There is the brotli algorithm, but this is only supported
  in very recent versions of nodejs
- what else can we remove from the dom [done] Removing comment nodes now! They are large in BING.
- remove all whitespace and \n and \t from html

### TODO:
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions

3. dont create a new tab when opening a new scraper

TODO.txt, deleted file, 45 lines

```text
24.12.2018
- fix interface to scrape() [DONE]
- add to Github


24.1.2018

- fix issue #3: add functionality to add keyword file

27.1.2019

- Add functionality to block images and CSS from loading as described here:

https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/

29.1.2019

- implement proxy support functionality
- implement proxy check

- implement scraping more than 1 page
- do it for google
- and bing

- implement duckduckgo scraping


30.1.2019

- modify all scrapers to use the generic class where it makes sense
- Bing, Baidu, Google, Duckduckgo

7.2.2019
- add num_requests to test cases [done]



TODO:
- add captcha service solving support
- check if news instances run the same browser and if we can have one proxy per tab workers

- write test case for:
- pluggable
- full metadata (log http headers, log ip address)
```

examples/bing_de.json, new file, 4645 lines (file diff suppressed because it is too large)

examples/bing_multiple_browser_multiple_pages.js, new file, 85 lines

```js
var fs = require('fs');
var path = require('path');
var os = require("os");

const se_scraper = require('./../index.js');
var filepath_de = path.join(__dirname, '/data/keywords_de.txt');

function read_keywords_from_file(fpath) {
    let kws = fs.readFileSync(fpath).toString().split(os.EOL);
    // clean keywords
    kws = kws.filter((kw) => {
        return kw.trim().length > 0;
    });
    return kws;
}

let keywords_de = read_keywords_from_file(filepath_de);

const Cluster = {
    CONCURRENCY_PAGE: 1, // shares cookies, etc.
    CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts)
    CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts)
};

// those options need to be provided on startup
// and cannot be given to se-scraper on scrape() calls
let browser_config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: true,
    verbose: true,
    // whether to start the browser in headless mode
    headless: true,
    is_local: false,
    throw_on_detection: false,
    puppeteer_cluster_config: {
        headless: true,
        timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
        monitor: false,
        concurrency: 3, // Cluster.CONCURRENCY_BROWSER: one browser per worker
        maxConcurrency: 3, // scrape with 3 browsers
    }
};

(async () => {
    // scrape config can change on each scrape() call
    let scrape_config_bing_de = {
        // which search engine to scrape
        search_engine: 'bing',
        // an array of keywords to scrape
        keywords: keywords_de,
        // the number of pages to scrape for each keyword
        num_pages: 10,

        // OPTIONAL PARAMS BELOW:
        // https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters
        bing_settings: {
            cc: 'DE', // The cc parameter determines the country to use for the query.
            mkt: 'de-DE', // The mkt parameter determines the UI language to return results.
            offset: 0, // Determines the results offset to use, defaults to 0.
            count: 20, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
        // how long to sleep between requests. a random sleep interval within the range [a,b]
        // is drawn before every request. empty string for no sleeping.
        sleep_range: '',
        // path to output file, data will be stored in JSON
        output_file: 'examples/bing_de.json',
        // whether to prevent images, css, fonts from being loaded
        // will speed up scraping a great deal
        block_assets: true,
        // check if headless chrome escapes common detection techniques
        // this is a quick test and should be used for debugging
        test_evasion: false,
        apply_evasion_techniques: true,
        // log ip address data
        log_ip_address: false,
        // log http headers
        log_http_headers: false,
    };

    let results = await se_scraper.scrape(browser_config, scrape_config_bing_de);
    console.dir(results.metadata, {depth: null, colors: true});

})();
```
25  examples/cleaned_html.js (new file)
@@ -0,0 +1,25 @@
const se_scraper = require('./../index.js');
const fs = require('fs');

(async () => {

    let kw = 'news iran';

    let scrape_job = {
        search_engine: 'baidu',
        keywords: [kw],
        num_pages: 1,
        html_output: true,
        // whether to strip JS and CSS from the html_output
        // has only an effect if `html_output` is true
        clean_html_output: true,
        // remove all data images from the html
        clean_data_images: true,
    };

    var response = await se_scraper.scrape({}, scrape_job);

    console.dir(response, {depth: null, colors: true});

    fs.writeFileSync('example_cleaned.html', response.results[kw]['1']['html']);
})();
119  examples/custom_scraper.js (new file)
@@ -0,0 +1,119 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
|
||||
/*
|
||||
* This example shows how you can define your custom scraper class and use it
|
||||
* within se-scraper.
|
||||
*/
|
||||
class EcosiaScraper extends se_scraper.Scraper {
|
||||
|
||||
constructor(...args) {
|
||||
super(...args);
|
||||
}
|
||||
|
||||
async parse_async(html) {
|
||||
// In this example we use vanilla javascript to parse out the
|
||||
// interesting information from the search engine
|
||||
|
||||
// you may also use a external library such as cheerio.
|
||||
|
||||
return await this.page.evaluate(() => {
|
||||
var results = {
|
||||
num_results: '',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results: [],
|
||||
};
|
||||
|
||||
document.querySelectorAll('.results .result').forEach((result) => {
|
||||
var serp = {};
|
||||
var title = result.querySelector('.result-title');
|
||||
if (title) {
|
||||
serp.title = title.innerText;
|
||||
serp.link = title.getAttribute('href');
|
||||
}
|
||||
|
||||
var green = result.querySelector('.result-url');
|
||||
if (green) {
|
||||
serp.green = green.getAttribute('href');
|
||||
}
|
||||
|
||||
var snippet = result.querySelector('.result-snippet');
|
||||
|
||||
if (snippet) {
|
||||
serp.snippet = snippet.innerText;
|
||||
}
|
||||
|
||||
results.results.push(serp);
|
||||
});
|
||||
|
||||
var num_res = document.querySelector('.card-title-result-count');
|
||||
if (num_res) {
|
||||
results.num_results = num_res.innerText;
|
||||
}
|
||||
|
||||
results.no_results = document.querySelector('.empty-result') != null;
|
||||
|
||||
var effective = document.querySelector('.query-context-text .result-title');
|
||||
|
||||
if (effective) {
|
||||
results.effective_query = effective.innerText;
|
||||
}
|
||||
|
||||
return results;
|
||||
});
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = 'https://www.ecosia.org/';
|
||||
|
||||
await this.page.goto(startUrl);
|
||||
|
||||
try {
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.pagination-next', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
// check whether scraping was detected.
|
||||
}
|
||||
}
|
||||
|
||||
(async () => {
|
||||
|
||||
let scrape_job = {
|
||||
search_engine: EcosiaScraper,
|
||||
keywords: ['lets go boys'],
|
||||
num_pages: 2,
|
||||
};
|
||||
|
||||
var results = await se_scraper.scrape({headless: true}, scrape_job);
|
||||
|
||||
console.dir(results, {depth: null, colors: true});
|
||||
|
||||
})();
|
11  examples/delete_comments.js (new file)
@@ -0,0 +1,11 @@
var nodeIterator = document.createNodeIterator(
    document.body,
    NodeFilter.SHOW_COMMENT,
    { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } }
);

// Remove all comment nodes
while (nodeIterator.nextNode()) {
    var commentNode = nodeIterator.referenceNode;
    commentNode.remove();
}
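This file is plain browser-side DOM code. A minimal sketch of running it from puppeteer (assuming a `page` object from an already launched browser, as in the other examples in this directory):

await page.evaluate(() => {
    const nodeIterator = document.createNodeIterator(
        document.body,
        NodeFilter.SHOW_COMMENT,
        { acceptNode: () => NodeFilter.FILTER_ACCEPT }
    );
    // remove every comment node from the live DOM
    while (nodeIterator.nextNode()) {
        nodeIterator.referenceNode.remove();
    }
});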
97  examples/for_the_lulz.js (new file)
@@ -0,0 +1,97 @@
|
||||
|
||||
/*
|
||||
* Do not run this, this is probably illegal in your country ;)
|
||||
*/
|
||||
|
||||
const se_scraper = require('./../index.js');
|
||||
|
||||
|
||||
// generate some google dorks
|
||||
|
||||
function genGoogleDorks(iter=4) {
|
||||
let lulz_keywords = [];
|
||||
['seite', 'inicio', 'index'].forEach((x) => {
|
||||
for (var i = 0; i < iter; i++) {
|
||||
lulz_keywords.push(
|
||||
'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"'
|
||||
)
|
||||
}
|
||||
});
|
||||
return lulz_keywords;
|
||||
}
|
||||
|
||||
const lulz_keywords = genGoogleDorks();
|
||||
console.log(lulz_keywords);
|
||||
|
||||
|
||||
// those options need to be provided on startup
|
||||
// and cannot give to se-scraper on scrape() calls
|
||||
let browser_config = {
|
||||
// if random_user_agent is set to True, a random user agent is chosen
|
||||
random_user_agent: true,
|
||||
headless: true,
|
||||
is_local: false,
|
||||
throw_on_detection: false,
|
||||
puppeteer_cluster_config: {
|
||||
headless: true,
|
||||
timeout: 2 * 60 * 1000, // max timeout set to 2 minutes
|
||||
monitor: false,
|
||||
concurrency: 3, // one scraper per tab
|
||||
maxConcurrency: 4, // scrape with 4 tabs
|
||||
}
|
||||
};
|
||||
|
||||
(async () => {
|
||||
// scrape config can change on each scrape() call
|
||||
let lulz_config = {
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
// an array of keywords to scrape
|
||||
keywords: lulz_keywords,
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 3,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '',
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'goodboys.json',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
// check if headless chrome escapes common detection techniques
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
apply_evasion_techniques: true,
|
||||
// log ip address data
|
||||
log_ip_address: false,
|
||||
// log http headers
|
||||
log_http_headers: false,
|
||||
};
|
||||
|
||||
let results = await se_scraper.scrape(browser_config, lulz_config);
|
||||
|
||||
const all_links = [];
|
||||
|
||||
for (var kw in results) {
|
||||
for (var page in results[kw]) {
|
||||
for (var res of results[kw][page]['results']) {
|
||||
all_links.push(res.link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(all_links);
|
||||
|
||||
for (var link of all_links) {
|
||||
try {
|
||||
const response = await got(link.replace(/(id=\d+)/g, "$1'"));
|
||||
let html = response.body;
|
||||
if (html.includes('error') || html.includes('mysql')) {
|
||||
console.log('Got a mysql injection in ' + url);
|
||||
}
|
||||
} catch (error) {
|
||||
console.log(error.response.statusCode);
|
||||
}
|
||||
}
|
||||
|
||||
})();
|
23  examples/gimage.js (new file)
@@ -0,0 +1,23 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        output_file: '',
    };

    let scrape_job = {
        search_engine: 'google_image',
        keywords: ['manaslu', 'everest', 'pitcairn'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
26  examples/gnold.js (new file)
@@ -0,0 +1,26 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        output_file: 'examples/results/gnold.json',
        google_news_old_settings: {
            gl: 'us', // The gl parameter determines the Google country to use for the query.
            hl: 'fr', // The hl parameter determines the Google UI language to return results.
            start: 0, // Determines the results offset to use, defaults to 0.
            num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
        },
    };

    let scrape_job = {
        search_engine: 'google_news_old',
        keywords: ['news world'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();

    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});
    await scraper.quit();
})();
30  examples/google_maps.js (new file)
@@ -0,0 +1,30 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        output_file: 'examples/results/maps.json',
        test_evasion: false,
        block_assets: false,
        headless: false,

        google_maps_settings: {
            scrape_in_detail: false,
        }
    };

    let scrape_job = {
        search_engine: 'google_maps',
        keywords: ['Berlin Zahnarzt'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
21  examples/minimal.js (new file)
@@ -0,0 +1,21 @@
const se_scraper = require('./../index.js');

(async () => {

    let kws = [
        'https://www.linkedin.com/in/aakanksha-majhi-b24a8449',
        'https://www.linkedin.com/in/aakash-srivastava-7374a830',
        'https://www.linkedin.com/in/aakash-tiwari-019b8569',
    ];

    let scrape_job = {
        search_engine: 'google',
        keywords: kws,
        num_pages: 1,
    };

    var results = await se_scraper.scrape({}, scrape_job);

    console.dir(results, {depth: null, colors: true});

})();
35  examples/multiple_browsers.js (new file)
@@ -0,0 +1,35 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        search_engine: 'google',
        random_user_agent: true,
        is_local: false,
        html_output: false,
        throw_on_detection: false,
        headless: true,
        puppeteer_cluster_config: {
            headless: true,
            timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
            monitor: false,
            concurrency: 3, // 3 == CONCURRENCY_BROWSER
            maxConcurrency: 3, // 3 browsers will scrape
        },
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
@ -1,35 +1,29 @@
|
||||
const se_scraper = require('../index.js');
|
||||
const se_scraper = require('./../src/node_scraper.js');
|
||||
|
||||
async function multiple_search_engines() {
|
||||
(async () => {
|
||||
let browser_config = {
|
||||
random_user_agent: true,
|
||||
write_meta_data: true,
|
||||
sleep_range: '[1,1]',
|
||||
headless: true,
|
||||
output_file: `examples/results/multiple_search_engines.json`
|
||||
};
|
||||
|
||||
var searchEnginesList = ['google', 'bing'];
|
||||
let scrape_job = {
|
||||
search_engine: 'google',
|
||||
keywords: ['news', 'se-scraper'],
|
||||
num_pages: 1,
|
||||
};
|
||||
|
||||
for (let index = 0; index < searchEnginesList.length; index++) {
|
||||
const searchEngine = searchEnginesList[index];
|
||||
let config = {
|
||||
random_user_agent: true,
|
||||
write_meta_data: true,
|
||||
sleep_range: '[1,1]',
|
||||
search_engine: searchEngine,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
// the list of keywords to scrape
|
||||
keywords: ['scrapeulous.com',],
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
output_file: `${searchEngine}.json`
|
||||
};
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
await scraper.start();
|
||||
|
||||
await se_scraper.scrape(config, (err, response) => {
|
||||
if (err) {
|
||||
console.error(err)
|
||||
}
|
||||
console.dir(response.results, {
|
||||
depth: null,
|
||||
colors: true
|
||||
});
|
||||
});
|
||||
for (var se of ['google', 'bing']) {
|
||||
scrape_job.search_engine = se;
|
||||
var results = await scraper.scrape(scrape_job);
|
||||
console.dir(results, {depth: null, colors: true});
|
||||
}
|
||||
}
|
||||
|
||||
multiple_search_engines();
|
||||
await scraper.quit();
|
||||
})();
|
||||
|
||||
|
134  examples/multiple_tabs.js (new file)
@@ -0,0 +1,134 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
|
||||
const Cluster = {
|
||||
CONCURRENCY_PAGE: 1, // shares cookies, etc.
|
||||
CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts)
|
||||
CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts)
|
||||
};
|
||||
|
||||
let keywords = ['New York',
|
||||
'Los Angeles',
|
||||
'Chicago',
|
||||
'Houston',
|
||||
'Philadelphia',
|
||||
'Phoenix',
|
||||
'San Antonio',
|
||||
'San Diego',
|
||||
'Dallas',
|
||||
'San Jose',
|
||||
'Austin',
|
||||
'Indianapolis',
|
||||
'Jacksonville',
|
||||
'San Francisco',
|
||||
'Columbus',
|
||||
'Charlotte',
|
||||
'Fort Worth',
|
||||
'Detroit',
|
||||
'El Paso',
|
||||
'Memphis',
|
||||
'Seattle',
|
||||
'Denver',
|
||||
'Washington',
|
||||
'Boston',
|
||||
'Nashville-Davidson',
|
||||
'Baltimore',
|
||||
'Oklahoma City',
|
||||
'Louisville/Jefferson County',
|
||||
'Portland',
|
||||
'Las Vegas',
|
||||
'Milwaukee',
|
||||
'Albuquerque',
|
||||
'Tucson',
|
||||
'Fresno',
|
||||
'Sacramento',
|
||||
'Long Beach',
|
||||
'Kansas City',
|
||||
'Mesa',
|
||||
'Virginia Beach',
|
||||
'Atlanta',
|
||||
'Colorado Springs',
|
||||
'Omaha',
|
||||
'Raleigh',
|
||||
'Miami',
|
||||
'Oakland',
|
||||
'Minneapolis',
|
||||
'Tulsa',
|
||||
'Cleveland',
|
||||
'Wichita',
|
||||
'Arlington',
|
||||
'New Orleans',
|
||||
'Bakersfield',
|
||||
'Tampa',
|
||||
'Honolulu',
|
||||
'Aurora',
|
||||
'Anaheim',
|
||||
'Santa Ana',
|
||||
'St. Louis',
|
||||
'Riverside',
|
||||
'Corpus Christi',
|
||||
'Lexington-Fayette',
|
||||
'Pittsburgh',
|
||||
'Anchorage',
|
||||
'Stockton',
|
||||
'Cincinnati',
|
||||
'St. Paul',
|
||||
'Toledo',
|
||||
'Greensboro',
|
||||
'Newark',
|
||||
'Plano',
|
||||
'Henderson',
|
||||
'Lincoln',
|
||||
'Buffalo',
|
||||
'Jersey City',
|
||||
'Chula Vista',
|
||||
'Fort Wayne',
|
||||
'Orlando',
|
||||
'St. Petersburg',
|
||||
'Chandler',
|
||||
'Laredo',
|
||||
'Norfolk',
|
||||
'Durham',
|
||||
'Madison',
|
||||
'Lubbock',
|
||||
'Irvine',
|
||||
'Winston-Salem',
|
||||
'Glendale',
|
||||
'Garland',
|
||||
'Hialeah',
|
||||
'Reno',
|
||||
'Chesapeake',
|
||||
'Gilbert',
|
||||
'Baton Rouge',
|
||||
'Irving',
|
||||
'Scottsdale',
|
||||
'North Las Vegas',
|
||||
'Fremont',
|
||||
'Boise City',
|
||||
'Richmond',
|
||||
'San Bernardino'];
|
||||
|
||||
let config = {
|
||||
search_engine: 'bing',
|
||||
debug: false,
|
||||
verbose: true,
|
||||
keywords: keywords,
|
||||
num_pages: 1, // how many pages per keyword
|
||||
output_file: 'examples/results/bing.json',
|
||||
log_ip_address: false,
|
||||
headless: true,
|
||||
puppeteer_cluster_config: {
|
||||
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
|
||||
monitor: false,
|
||||
concurrency: Cluster.CONCURRENCY_PAGE, // one scraper per tab
|
||||
maxConcurrency: 7, // scrape with 7 tabs
|
||||
}
|
||||
};
|
||||
|
||||
function callback(err, response) {
|
||||
if (err) {
|
||||
console.error(err)
|
||||
}
|
||||
console.dir(response, {depth: null, colors: true});
|
||||
}
|
||||
|
||||
se_scraper.scrape(config, callback);
|
76  examples/per_page_proxy.js (new file)
@@ -0,0 +1,76 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
const ProxyChain = require('proxy-chain');
|
||||
|
||||
const ROUTER_PROXY = 'http://127.0.0.1:8000';
|
||||
|
||||
// SEE: https://github.com/GoogleChrome/puppeteer/issues/678
|
||||
// Idea is: Setup a local router proxy that assigns requests identified by unique user-agent strings
|
||||
// distinct upstream proxies. With this way it is possible to use one proxy per chromium tab.
|
||||
// downside: not fast and efficient
|
||||
|
||||
const uas = [
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
|
||||
];
|
||||
|
||||
const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181'];
|
||||
|
||||
(async () => {
|
||||
const browser = await puppeteer.launch({
|
||||
headless: false,
|
||||
args: [`--proxy-server=${ROUTER_PROXY}`],
|
||||
});
|
||||
const page1 = await browser.newPage();
|
||||
const page2 = await browser.newPage();
|
||||
|
||||
try {
|
||||
await page1.setUserAgent(uas[0]);
|
||||
await page1.goto('https://www.whatsmyip.org/');
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
try {
|
||||
await page2.setUserAgent(uas[1]);
|
||||
await page2.goto('https://www.whatsmyip.org/');
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
//await browser.close();
|
||||
})();
|
||||
|
||||
const server = new ProxyChain.Server({
|
||||
// Port where the server the server will listen. By default 8000.
|
||||
port: 8000,
|
||||
|
||||
// Enables verbose logging
|
||||
verbose: true,
|
||||
|
||||
prepareRequestFunction: ({
|
||||
request,
|
||||
username,
|
||||
password,
|
||||
hostname,
|
||||
port,
|
||||
isHttp,
|
||||
}) => {
|
||||
var upstreamProxyUrl;
|
||||
|
||||
if (request.headers['user-agent'] === uas[0]) {
|
||||
upstreamProxyUrl = proxies[0];
|
||||
}
|
||||
|
||||
if (request.headers['user-agent'] === uas[1]) {
|
||||
upstreamProxyUrl = proxies[1];
|
||||
}
|
||||
|
||||
console.log('Using proxy: ' + upstreamProxyUrl);
|
||||
|
||||
return { upstreamProxyUrl };
|
||||
},
|
||||
});
|
||||
|
||||
server.listen(() => {
|
||||
console.log(`Router Proxy server is listening on port ${8000}`);
|
||||
});
|
@ -9,17 +9,13 @@ module.exports = class Pluggable {
|
||||
'--disable-gpu',
|
||||
'--window-size=1920x1080',
|
||||
'--hide-scrollbars',
|
||||
'--user-agent=Chrome',
|
||||
'--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
|
||||
],
|
||||
userAgent = 'Chrome',
|
||||
headless = true,
|
||||
} = options;
|
||||
|
||||
this.chromeFlags = chromeFlags;
|
||||
this.userAgent = userAgent;
|
||||
this.headless = headless;
|
||||
|
||||
this.chromeFlags.push(this.userAgent);
|
||||
}
|
||||
|
||||
async close_browser() {
|
||||
@ -65,4 +61,9 @@ module.exports = class Pluggable {
|
||||
|
||||
return this.browser;
|
||||
}
|
||||
|
||||
async do_work(page) {
|
||||
// do some scraping work and return results and num_requests
|
||||
|
||||
}
|
||||
};
|
31  examples/pluggable_example.js (new file)
@@ -0,0 +1,31 @@
const se_scraper = require('./../src/node_scraper.js');
const resolve = require('path').resolve;

(async () => {
    let browser_config = {
        test_evasion: false,
        log_http_headers: true,
        log_ip_address: true,
        random_user_agent: false,
        apply_evasion_techniques: false,
        screen_output: false,
        custom_func: resolve('./examples/pluggable.js'),
        headless: false,
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news usa'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);

    await scraper.start();

    var results = await scraper.scrape(scrape_job);

    console.dir(results, {depth: null, colors: true});

    await scraper.quit();
})();
29  examples/proxies.js (new file)
@@ -0,0 +1,29 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        output_file: 'examples/results/proxyresults.json',
        log_ip_address: true,
        // a file with one proxy per line. Example:
        // socks5://78.94.172.42:1080
        // http://118.174.233.10:48400
        proxy_file: '/home/nikolai/.proxies', // one proxy per line
        // whether to use proxies only
        // when this is set to true, se-scraper will not use
        // your default IP address in a browser
        use_proxies_only: true,
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();

    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});
    await scraper.quit();
})();
@ -1,17 +1,36 @@
|
||||
const se_scraper = require('./../index.js');
|
||||
const se_scraper = require('./../src/node_scraper.js');
|
||||
|
||||
let config = {
|
||||
search_engine: 'duckduckgo',
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: ['news'],
|
||||
num_pages: 2,
|
||||
output_file: 'data.json',
|
||||
};
|
||||
(async () => {
|
||||
let browser_config = {
|
||||
test_evasion: false,
|
||||
log_http_headers: false,
|
||||
log_ip_address: false,
|
||||
random_user_agent: false,
|
||||
apply_evasion_techniques: true,
|
||||
screen_output: false,
|
||||
html_output: false,
|
||||
clean_html_output: true,
|
||||
};
|
||||
|
||||
function callback(err, response) {
|
||||
if (err) { console.error(err) }
|
||||
console.dir(response, {depth: null, colors: true});
|
||||
}
|
||||
let scrape_job = {
|
||||
search_engine: 'google',
|
||||
keywords: ['buy a nice car'],
|
||||
num_pages: 1,
|
||||
google_settings: {
|
||||
"gl": "us",
|
||||
"hl": "en",
|
||||
"start": 0,
|
||||
"num": 10
|
||||
}
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, callback);
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
|
||||
await scraper.start();
|
||||
|
||||
var results = await scraper.scrape(scrape_job);
|
||||
|
||||
console.dir(results, {depth: null, colors: true});
|
||||
|
||||
await scraper.quit();
|
||||
})();
|
||||
|
30  examples/reusing.js (new file)
@@ -0,0 +1,30 @@
const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        output_file: 'examples/results/data.json',
    };

    let scrape_job = {
        search_engine: 'google',
        keywords: ['news', 'se-scraper'],
        num_pages: 1,
    };

    let scrape_job2 = {
        search_engine: 'bing',
        keywords: ['test', 'what a wonderful world'],
        num_pages: 1,
    };

    var scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();

    var results = await scraper.scrape(scrape_job);
    console.dir(results, {depth: null, colors: true});

    var results2 = await scraper.scrape(scrape_job2);
    console.dir(results2, {depth: null, colors: true});

    await scraper.quit();
})();
87  examples/test_cluster.js (new file)
@@ -0,0 +1,87 @@
|
||||
const { Cluster } = require('../../puppeteer-cluster/dist/index.js');
|
||||
var fs = require('fs');
|
||||
var os = require("os");
|
||||
|
||||
const PROXY_FILE = '/home/nikolai/.proxies';
|
||||
|
||||
function read_items_from_file(fname) {
|
||||
let kws = fs.readFileSync(fname).toString().split(os.EOL);
|
||||
// clean keywords
|
||||
kws = kws.filter((kw) => {
|
||||
return kw.trim().length > 0;
|
||||
});
|
||||
return kws;
|
||||
}
|
||||
|
||||
(async () => {
|
||||
|
||||
let browserArgs = [
|
||||
'--disable-infobars',
|
||||
'--window-position=0,0',
|
||||
'--ignore-certifcate-errors',
|
||||
'--ignore-certifcate-errors-spki-list',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-accelerated-2d-canvas',
|
||||
'--disable-gpu',
|
||||
'--window-size=1920x1080',
|
||||
'--hide-scrollbars',
|
||||
];
|
||||
|
||||
let proxies = read_items_from_file(PROXY_FILE);
|
||||
|
||||
console.dir(proxies);
|
||||
|
||||
// each new call to workerInstance() will
|
||||
// left pop() one element from this list
|
||||
// maxConcurrency should be equal to perBrowserOptions.length
|
||||
|
||||
// the first browser config with home IP
|
||||
let perBrowserOptions = [{
|
||||
headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: browserArgs
|
||||
}];
|
||||
|
||||
for (var proxy of proxies) {
|
||||
perBrowserOptions.push({
|
||||
headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: browserArgs.concat(`--proxy-server=${proxy}`)
|
||||
})
|
||||
}
|
||||
|
||||
const cluster = await Cluster.launch({
|
||||
monitor: true,
|
||||
timeout: 12 * 60 * 60 * 1000, // 12 hours in ms
|
||||
concurrency: Cluster.CONCURRENCY_BROWSER,
|
||||
maxConcurrency: perBrowserOptions.length,
|
||||
puppeteerOptions: {
|
||||
headless: false,
|
||||
args: browserArgs,
|
||||
ignoreHTTPSErrors: true,
|
||||
},
|
||||
perBrowserOptions: perBrowserOptions
|
||||
});
|
||||
|
||||
// Event handler to be called in case of problems
|
||||
cluster.on('taskerror', (err, data) => {
|
||||
console.log(`Error crawling ${data}: ${err.message}`);
|
||||
});
|
||||
|
||||
|
||||
await cluster.task(async ({ page, data: url }) => {
|
||||
await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000});
|
||||
const pageTitle = await page.evaluate(() => document.title);
|
||||
console.log(`Page title of ${url} is ${pageTitle}`);
|
||||
console.log(await page.content());
|
||||
});
|
||||
|
||||
for(var i = 0; i < perBrowserOptions.length; i++) {
|
||||
await cluster.queue('http://ipinfo.io/json');
|
||||
}
|
||||
|
||||
await cluster.idle();
|
||||
await cluster.close();
|
||||
})();
|
40  examples/test_promise.js (new file)
@@ -0,0 +1,40 @@
|
||||
class Test {
|
||||
constructor(options = {}) {
|
||||
const {
|
||||
config = {},
|
||||
} = options;
|
||||
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
run(vars) {
|
||||
|
||||
console.log(this.config)
|
||||
}
|
||||
}
|
||||
|
||||
let o1 = new Test({config: {a: Math.random()}});
|
||||
let o2 = new Test({config: {a: Math.random()}});
|
||||
|
||||
o1.run()
|
||||
o2.run()
|
||||
|
||||
// (async () => {
|
||||
//
|
||||
// let prom = [];
|
||||
//
|
||||
// for (var i = 0; i < 3; i++) {
|
||||
// var obj = new Test({
|
||||
// config: {a: Math.random()},
|
||||
// });
|
||||
// prom.push(new Promise(resolve => {
|
||||
// setTimeout(() => { new Test({
|
||||
// config: {a: Math.random()},
|
||||
// }).run(); resolve() }, 1000);
|
||||
// }));
|
||||
// }
|
||||
//
|
||||
// let res = await Promise.all(prom);
|
||||
// console.log(res);
|
||||
//
|
||||
// })();
|
29  examples/test_proxyflag.js (new file)
@@ -0,0 +1,29 @@
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({
        args: [
            // SET PROXY HERE
            '--proxy-server=socks5://IP:PORT',
            '--disable-infobars',
            '--window-position=0,0',
            '--ignore-certificate-errors',
            '--ignore-certificate-errors-spki-list',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920x1080',
            '--hide-scrollbars',
            '--disable-notifications',
            '--no-sandbox',
            '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
        ],
        headless: true
    });
    var page = await browser.newPage();
    await page.setViewport({width: 1920, height: 926});
    await page.goto('http://ipinfo.io/json');
    console.log(await page.content());
    await browser.close();
})();
Binary file not shown. (Before size: 43 KiB)
90  index.js
@@ -1,81 +1,23 @@
|
||||
const handler = require('./src/node_scraper.js');
|
||||
var fs = require('fs');
|
||||
var os = require("os");
|
||||
const se_scraper = require('./src/node_scraper.js');
|
||||
var Scraper = require('./src/modules/se_scraper');
|
||||
|
||||
exports.scrape = async function(config, callback) {
|
||||
async function scrape(browser_config, scrape_config) {
|
||||
// scrape config overwrites the browser_config
|
||||
Object.assign(browser_config, scrape_config);
|
||||
|
||||
// options for scraping
|
||||
event = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
// if random_user_agent is set to True, a random user agent is chosen
|
||||
random_user_agent: true,
|
||||
// whether to select manual settings in visible mode
|
||||
set_manual_settings: false,
|
||||
// log ip address data
|
||||
log_ip_address: false,
|
||||
// log http headers
|
||||
log_http_headers: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
compress: false, // compress
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: ['scrapeulous.com'],
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 1,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to prevent images, css, fonts and media from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
// path to js module that extends functionality
|
||||
// this module should export the functions:
|
||||
// get_browser, handle_metadata, close_browser
|
||||
//custom_func: resolve('examples/pluggable.js'),
|
||||
custom_func: '',
|
||||
// use a proxy for all connections
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
proxy: '',
|
||||
// check if headless chrome escapes common detection techniques
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
};
|
||||
var scraper = new se_scraper.ScrapeManager(browser_config);
|
||||
|
||||
// overwrite default config
|
||||
for (var key in config) {
|
||||
event[key] = config[key];
|
||||
}
|
||||
await scraper.start();
|
||||
|
||||
if (fs.existsSync(event.keyword_file)) {
|
||||
event.keywords = read_keywords_from_file(event.keyword_file);
|
||||
}
|
||||
var results = await scraper.scrape(scrape_config);
|
||||
|
||||
if (!callback) {
|
||||
// called when results are ready
|
||||
callback = function (err, response) {
|
||||
if (err) {
|
||||
console.error(err)
|
||||
}
|
||||
await scraper.quit();
|
||||
|
||||
console.dir(response.results, {depth: null, colors: true});
|
||||
}
|
||||
}
|
||||
|
||||
await handler.handler(event, undefined, callback );
|
||||
};
|
||||
|
||||
function read_keywords_from_file(fname) {
|
||||
let kws = fs.readFileSync(fname).toString().split(os.EOL);
|
||||
// clean keywords
|
||||
kws = kws.filter((kw) => {
|
||||
return kw.trim().length > 0;
|
||||
});
|
||||
return kws;
|
||||
return results;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
scrape: scrape,
|
||||
ScrapeManager: se_scraper.ScrapeManager,
|
||||
Scraper: Scraper,
|
||||
};
|
||||
|
2044  package-lock.json (generated; diff suppressed because it is too large)
29  package.json
@@ -1,16 +1,17 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.13",
|
||||
"description": "A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.",
|
||||
"version": "1.5.7",
|
||||
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "mocha"
|
||||
"test": "mocha test test/modules"
|
||||
},
|
||||
"keywords": [
|
||||
"scraping",
|
||||
"search-engines",
|
||||
"google",
|
||||
"bing",
|
||||
"web-scraping"
|
||||
],
|
||||
"author": "Nikolai Tschacher <hire@incolumitas.com> (https://incolumitas.com/)",
|
||||
@ -20,9 +21,25 @@
|
||||
},
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"chai": "^4.2.0",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"debug": "^4.1.1",
|
||||
"got": "^9.6.0",
|
||||
"puppeteer": "^1.12.2"
|
||||
"lodash": "^4.17.14",
|
||||
"puppeteer": "^2.0.0",
|
||||
"puppeteer-cluster": "^0.18.0",
|
||||
"puppeteer-extra": "^2.1.3",
|
||||
"puppeteer-extra-plugin-stealth": "^2.2.2",
|
||||
"user-agents": "^1.0.378",
|
||||
"winston": "^3.2.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"bluebird": "^3.7.2",
|
||||
"chai": "^4.2.0",
|
||||
"chai-string": "^1.5.0",
|
||||
"express": "^4.17.1",
|
||||
"http-mitm-proxy": "^0.8.2",
|
||||
"key-cert": "^1.0.1",
|
||||
"mocha": "^6.1.4",
|
||||
"ua-parser-js": "^0.7.21"
|
||||
}
|
||||
}
|
||||
|
105  run.js
@@ -1,35 +1,22 @@
|
||||
const se_scraper = require('./index.js');
|
||||
const resolve = require('path').resolve;
|
||||
|
||||
let config = {
|
||||
// those options need to be provided on startup
|
||||
// and cannot give to se-scraper on scrape() calls
|
||||
let browser_config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
// if random_user_agent is set to True, a random user agent is chosen
|
||||
random_user_agent: true,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,2]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
// whether debug information should be printed
|
||||
// debug info is useful for developers when debugging
|
||||
debug: false,
|
||||
// whether verbose program output should be printed
|
||||
// this output is informational
|
||||
verbose: true,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['news'],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 1,
|
||||
random_user_agent: false,
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'data.json',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: true,
|
||||
headless: false,
|
||||
// whether debug information should be printed
|
||||
// level 0: print nothing
|
||||
// level 1: print most important info
|
||||
// ...
|
||||
// level 4: print all shit nobody wants to know
|
||||
debug_level: 1,
|
||||
// specify flags passed to chrome here
|
||||
chrome_flags: [],
|
||||
// path to js module that extends functionality
|
||||
// this module should export the functions:
|
||||
// get_browser, handle_metadata, close_browser
|
||||
@ -40,26 +27,56 @@ let config = {
|
||||
// example: 'socks5://78.94.172.42:1080'
|
||||
// example: 'http://118.174.233.10:48400'
|
||||
proxy: '',
|
||||
// check if headless chrome escapes common detection techniques
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
// log ip address data
|
||||
log_ip_address: true,
|
||||
// log http headers
|
||||
log_http_headers: true,
|
||||
// a file with one proxy per line. Example:
|
||||
// socks5://78.94.172.42:1080
|
||||
// http://118.174.233.10:48400
|
||||
proxy_file: '',
|
||||
puppeteer_cluster_config: {
|
||||
timeout: 10 * 60 * 1000, // max timeout set to 10 minutes
|
||||
monitor: false,
|
||||
concurrency: 1, // one scraper per tab
|
||||
maxConcurrency: 1, // scrape with 1 tab
|
||||
}
|
||||
};
|
||||
|
||||
function callback(err, response) {
|
||||
if (err) { console.error(err) }
|
||||
(async () => {
|
||||
// scrape config can change on each scrape() call
|
||||
let scrape_config = {
|
||||
// which search engine to scrape
|
||||
search_engine: 'duckduckgo',
|
||||
// an array of keywords to scrape
|
||||
keywords: ['cloud service'],
|
||||
// the number of pages to scrape for each keyword
|
||||
num_pages: 1,
|
||||
|
||||
/* response object has the following properties:
|
||||
// OPTIONAL PARAMS BELOW:
|
||||
// google_settings: {
|
||||
// gl: 'us', // The gl parameter determines the Google country to use for the query.
|
||||
// hl: 'fr', // The hl parameter determines the Google UI language to return results.
|
||||
// start: 0, // Determines the results offset to use, defaults to 0.
|
||||
// num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
|
||||
// },
|
||||
// instead of keywords you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '',
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
// whether to prevent images, css, fonts from being loaded
|
||||
// will speed up scraping a great deal
|
||||
block_assets: false,
|
||||
// check if headless chrome escapes common detection techniques
|
||||
// this is a quick test and should be used for debugging
|
||||
test_evasion: false,
|
||||
apply_evasion_techniques: true,
|
||||
// log ip address data
|
||||
log_ip_address: false,
|
||||
// log http headers
|
||||
log_http_headers: false,
|
||||
};
|
||||
|
||||
response.results - json object with the scraping results
|
||||
response.metadata - json object with metadata information
|
||||
response.statusCode - status code of the scraping process
|
||||
*/
|
||||
let results = await se_scraper.scrape(browser_config, scrape_config);
|
||||
console.dir(results, {depth: null, colors: true});
|
||||
})();
|
||||
|
||||
// console.dir(response.results, {depth: null, colors: true});
|
||||
}
|
||||
|
||||
se_scraper.scrape(config, callback);
|
||||
|
@ -2,7 +2,9 @@
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/test/static_tests/html" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
|
55  src/concurrency-implementation.js (new file)
@@ -0,0 +1,55 @@
|
||||
const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency');
|
||||
const debug = require('debug')('se-scraper:CustomConcurrency');
|
||||
const { timeoutExecute } = require('puppeteer-cluster/dist/util');
|
||||
|
||||
const BROWSER_TIMEOUT = 5000;
|
||||
|
||||
class CustomConcurrency extends Browser {
|
||||
|
||||
async init() {}
|
||||
async close() {}
|
||||
|
||||
async workerInstance() {
|
||||
const options = this.options.perBrowserOptions.shift();
|
||||
debug('Launch puppeteer instance with options=%o', options);
|
||||
let chrome = await this.puppeteer.launch(options);
|
||||
let page;
|
||||
let context;
|
||||
|
||||
return {
|
||||
jobInstance: async () => {
|
||||
await timeoutExecute(BROWSER_TIMEOUT, (async () => {
|
||||
context = await chrome.createIncognitoBrowserContext();
|
||||
page = await context.newPage();
|
||||
})());
|
||||
|
||||
return {
|
||||
resources: {
|
||||
page,
|
||||
},
|
||||
|
||||
close: async () => {
|
||||
await timeoutExecute(BROWSER_TIMEOUT, context.close());
|
||||
},
|
||||
};
|
||||
},
|
||||
|
||||
close: async () => {
|
||||
await chrome.close();
|
||||
},
|
||||
|
||||
repair: async () => {
|
||||
debug('Starting repair');
|
||||
try {
|
||||
// will probably fail, but just in case the repair was not necessary
|
||||
await chrome.close();
|
||||
} catch (e) {}
|
||||
|
||||
// just relaunch as there is only one page per browser
|
||||
chrome = await this.puppeteer.launch(options);
|
||||
},
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
module.exports = CustomConcurrency;
|
@ -1,78 +0,0 @@
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class BaiduScraper extends Scraper {
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#content_left .result').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('h3 a').attr('href'),
|
||||
title: $(link).find('h3').text(),
|
||||
snippet: $(link).find('.c-abstract').text(),
|
||||
visible_link: $(link).find('.f13').text(),
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: false,
|
||||
num_results: $('.nums_text').text(),
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://www.baidu.com/');
|
||||
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="wd"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
// TODO: very very bad, but nobody uses baidu, or does someone?
|
||||
await this.sleep(2000);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
BaiduScraper: BaiduScraper,
|
||||
};
|
@ -3,163 +3,238 @@ const Scraper = require('./se_scraper');
|
||||
|
||||
class BingScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
async parse_async(html) {
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#b_content #b_results .b_algo').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('h2 a').attr('href'),
|
||||
title: $(link).find('h2').text(),
|
||||
snippet: $(link).find('.b_caption p').text(),
|
||||
visible_link: $(link).find('cite').text(),
|
||||
})
|
||||
});
|
||||
let results = await this.page.evaluate(() => {
|
||||
|
||||
// 'Including results for', 'Einschließlich Ergebnisse'
|
||||
let no_results = this.no_results(
|
||||
['There are no results', 'Es gibt keine Ergebnisse'],
|
||||
$('#b_results').text()
|
||||
);
|
||||
let _text = (el, s) => {
|
||||
let n = el.querySelector(s);
|
||||
|
||||
let effective_query = $('#sp_requery a').first().text() || '';
|
||||
if (n) {
|
||||
return n.innerText;
|
||||
} else {
|
||||
return '';
|
||||
}
|
||||
};
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
let _attr = (el, s, attr) => {
|
||||
let n = el.querySelector(s);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
num_results: $('#b_content .sb_count').text(),
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
if (n) {
|
||||
return n.getAttribute(attr);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://www.bing.com/');
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
let results = {
|
||||
num_results: '',
|
||||
no_results: false,
|
||||
effective_query: '',
|
||||
results: [],
|
||||
ads: [],
|
||||
right_side_ads: [],
|
||||
};
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
let num_results_el = document.querySelector('#b_content .sb_count');
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
if (num_results_el) {
|
||||
results.num_results = num_results_el.innerText;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
let organic_results = document.querySelectorAll('#b_content #b_results .b_algo');
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#b_content', { timeout: 5000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
organic_results.forEach((el) => {
|
||||
|
||||
async detected() {
|
||||
// TODO: I was actually never detected by bing. those are good guys.
|
||||
}
|
||||
let serp_obj = {
|
||||
link: _attr(el, 'h2 a', 'href'),
|
||||
title: _text(el, 'h2'),
|
||||
snippet: _text(el, '.b_caption p'),
|
||||
visible_link: _text(el, 'cite'),
|
||||
};
|
||||
|
||||
results.results.push(serp_obj);
|
||||
});
|
||||
|
||||
// check if no results
|
||||
results.no_results = (results.results.length === 0);
|
||||
|
||||
// parse bing ads
|
||||
let ads = document.querySelectorAll('#b_results .b_ad .sb_add');
|
||||
|
||||
ads.forEach((el) => {
|
||||
|
||||
let ad_obj = {
|
||||
title: _text(el, 'h2 a'),
|
||||
snippet: _text(el, '.b_caption p'),
|
||||
visible_link: _text(el, '.b_adurl cite'),
|
||||
tracking_link: _attr(el, 'h2 a', 'href'),
|
||||
};
|
||||
|
||||
results.ads.push(ad_obj);
|
||||
});
|
||||
|
||||
// right side ads
|
||||
let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add');
|
||||
|
||||
right_side_ads.forEach((el) => {
|
||||
|
||||
let ad_obj = {
|
||||
title: _text(el, 'h2 a'),
|
||||
snippet: _text(el, '.b_caption p'),
|
||||
visible_link: _text(el, '.b_adurl cite'),
|
||||
tracking_link: _attr(el, 'h2 a', 'href'),
|
||||
};
|
||||
|
||||
results.right_side_ads.push(ad_obj);
|
||||
});
|
||||
|
||||
|
||||
let effective_query_el = document.querySelector('#sp_requery a');
|
||||
|
||||
if (effective_query_el) {
|
||||
results.effective_query = effective_query_el.innerText;
|
||||
}
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
results.results = this.clean_results(results.results, ['title', 'link']);
|
||||
results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']);
|
||||
results.time = (new Date()).toUTCString();
|
||||
return results;
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';
|
||||
|
||||
if (this.config.bing_settings) {
|
||||
startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`;
|
||||
if (this.config.bing_settings.bing_domain) {
|
||||
startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`;
|
||||
} else {
|
||||
startUrl = `https://www.bing.com/search?`;
|
||||
}
|
||||
|
||||
for (var key in this.config.bing_settings) {
|
||||
if (key !== 'bing_domain') {
|
||||
startUrl += `${key}=${this.config.bing_settings[key]}&`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
|
||||
this.last_response = await Promise.all([
|
||||
next_page_link.click(), // The promise resolves after navigation has finished
|
||||
this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
|
||||
]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
// TODO: I was actually never detected by bing. those are good boys.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class BingNewsScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#algocore .newsitem').each((i, link) => {
results.push({
link: $(link).attr('url'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.snippet').text(),
date: $(link).find('.source span').last().text(),
})
});

// perform queries
const results = [];
$('#algocore .newsitem').each((i, link) => {
results.push({
link: $(link).attr('url'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.snippet').text(),
date: $(link).find('.source span').last().text(),
})
});

const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);

return {
time: (new Date()).toUTCString(),
results: cleaned,
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned,
}
}

async load_start_page() {
try {
await this.page.goto('https://www.bing.com/news/search?');
if (this.config.set_manual_settings === true) {
console.log('Sleeping 30 seconds. Set your settings now.');
await this.sleep(30000);
}
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async load_start_page() {
let startUrl = 'https://www.bing.com/news/search?';

async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
try {
await this.page.goto(startUrl);
if (this.config.set_manual_settings === true) {
console.log('Sleeping 30 seconds. Set your settings now.');
await this.sleep(30000);
}
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
} catch (e) {
return false;
}

async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}

return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}

async wait_for_results() {
await this.page.waitForSelector('#news', { timeout: 5000 });
await this.sleep(2000);
}
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}

async detected() {
// TODO: I was actually never detected by bing news.
}
this.last_response = await Promise.all([
next_page_link.click(), // The promise resolves after navigation has finished
this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation
]);

return true;
}

async wait_for_results() {
await this.page.waitForSelector('#news', { timeout: this.STANDARD_TIMEOUT });
}

async detected() {
// TODO: I was actually never detected by bing news.
}
}

module.exports = {
BingNewsScraper: BingNewsScraper,
BingScraper: BingScraper,
};
BingNewsScraper: BingNewsScraper,
BingScraper: BingScraper,
};
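// A minimal standalone sketch of the click + waitForNavigation pattern that the new
// BingNewsScraper.next_page() uses, shown in isolation so the intent of the
// Promise.all() change is easier to see. Assumes puppeteer is installed; the
// '.sb_pagN' selector comes from the diff, the URL and keyword are illustrative.
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto('https://www.bing.com/news/search?q=test');

    const next_page_link = await page.$('.sb_pagN');
    if (next_page_link) {
        // Start waiting for the navigation before clicking, then resolve both
        // together; the navigation's response object can then be kept around
        // (this is what last_response stores in the scraper).
        const [response] = await Promise.all([
            page.waitForNavigation(), // resolves with the response of the next SERP page
            next_page_link.click(),   // the click indirectly triggers that navigation
        ]);
        console.log('next page status:', response && response.status());
    }
    await browser.close();
})();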
@ -1,15 +1,18 @@
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
const debug = require('debug')('se-scraper:DuckduckgoScraper');
|
||||
|
||||
class DuckduckgoScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
debug('parse');
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result__body').each((i, link) => {
|
||||
const organicSelector = ($('#links .result--sep').length > 0) ? `#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body';
|
||||
$(organicSelector).each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
@ -19,35 +22,40 @@ class DuckduckgoScraper extends Scraper {
|
||||
});
|
||||
});
|
||||
|
||||
const ads = [];
|
||||
$('.results--ads .result').each((i, element) => {
|
||||
ads.push({
|
||||
visible_link: $(element).find('.result__url').text(),
|
||||
tracking_link: $(element).find('.result__title .result__a').attr('href'),
|
||||
title: $(element).find('.result__title .result__a').text(),
|
||||
snippet: $(element).find('.result__snippet').text(),
|
||||
})
|
||||
});
|
||||
|
||||
let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
const cleaned = this.clean_results(results, ['title', 'link']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
effective_query: effective_query,
|
||||
results: cleaned
|
||||
results: cleaned,
|
||||
ads: ads,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://duckduckgo.com/');
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
debug('load_start_page');
|
||||
let startUrl = 'https://duckduckgo.com/';
|
||||
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
debug('search_keyword');
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
@ -56,90 +64,20 @@ class DuckduckgoScraper extends Scraper {
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
|
||||
debug('next_page');
|
||||
let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
//await this.page.waitForNavigation();
|
||||
await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
await this.sleep(250);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class DuckduckgoNewsScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result--news').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
date: $(link).find('.result__timestamp').text(),
|
||||
snippet: $(link).find('.result__snippet').text(),
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
await this.sleep(1500);
|
||||
debug('wait_for_results');
|
||||
await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
@ -147,6 +85,5 @@ class DuckduckgoNewsScraper extends Scraper {
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
DuckduckgoNewsScraper: DuckduckgoNewsScraper,
|
||||
DuckduckgoScraper: DuckduckgoScraper,
|
||||
};
|
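// A small cheerio sketch of the new DuckDuckGo ads extraction from the diff above,
// runnable against a saved SERP file. The selectors are taken from the diff; the
// file path is an assumption for illustration only.
const cheerio = require('cheerio');
const fs = require('fs');

const html = fs.readFileSync('test/static_tests/html/duckduckgo1.html', 'utf8');
const $ = cheerio.load(html);

const ads = [];
$('.results--ads .result').each((i, element) => {
    ads.push({
        visible_link: $(element).find('.result__url').text(),
        tracking_link: $(element).find('.result__title .result__a').attr('href'),
        title: $(element).find('.result__title .result__a').text(),
        snippet: $(element).find('.result__snippet').text(),
    });
});

console.log(JSON.stringify(ads, null, 2));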
File diff suppressed because it is too large
@ -41,8 +41,11 @@ class InfospaceScraper extends Scraper {
}

async load_start_page() {

let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';

try {
await this.page.goto('http://infospace.com/index.html');
this.last_response = await this.page.goto(startUrl);
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
@ -64,14 +67,13 @@ class InfospaceScraper extends Scraper {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
this.last_response = await this.page.waitForNavigation();

return true;
}

async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
await this.sleep(250);
}

async detected() {
@ -98,14 +100,7 @@ class WebcrawlerNewsScraper extends Scraper {
});
});

const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
const cleaned = this.clean_results(results, ['title', 'link']);

return {
time: (new Date()).toUTCString(),
@ -115,7 +110,7 @@ class WebcrawlerNewsScraper extends Scraper {

async load_start_page() {
try {
await this.page.goto('https://www.webcrawler.com/?qc=news');
this.last_response = await this.page.goto('https://www.webcrawler.com/?qc=news');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
@ -144,7 +139,6 @@ class WebcrawlerNewsScraper extends Scraper {

async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 });
await this.sleep(150);
}

async detected() {
@ -1,33 +1,31 @@
const cheerio = require('cheerio');

module.exports = {
get_ip_data: get_ip_data,
get_http_headers: get_http_headers,
get_ip_data: get_ip_data,
get_http_headers: get_http_headers,
};

async function get_ip_data(browser) {
const page = await browser.newPage();
await page.goto('https://ipinfo.io/json', {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
let ipinfo_text = $('pre').text();
return JSON.parse(ipinfo_text);
async function get_ip_data(page) {
await page.goto('https://ipinfo.io/json', {
waitLoad: true,
waitNetworkIdle: true
});
let json = await page.content({
timeout: 20000
});
const $ = cheerio.load(json);
let ipinfo_text = $('pre').text();
return JSON.parse(ipinfo_text);
}

async function get_http_headers(browser) {
const page = await browser.newPage();
await page.goto('https://httpbin.org/get', {
waitLoad: true,
waitNetworkIdle: true // defaults to false
});
let headers = await page.content();
async function get_http_headers(page) {
await page.goto('https://httpbin.org/get', {
waitLoad: true,
waitNetworkIdle: true
});
let headers = await page.content();

const $ = cheerio.load(headers);
let headers_text = $('pre').text();
return JSON.parse(headers_text);
const $ = cheerio.load(headers);
let headers_text = $('pre').text();
return JSON.parse(headers_text);
}
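// Usage sketch for the refactored helpers above: both now receive an existing
// puppeteer page instead of opening their own tab from a browser handle. The
// require path matches the one used in node_scraper.js; everything else here
// is illustrative.
const puppeteer = require('puppeteer');
const meta = require('./modules/metadata.js');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Because the same page/tab is reused, any proxy or user agent configured
    // on it is exactly what ipinfo.io and httpbin.org report back.
    const ipinfo = await meta.get_ip_data(page);
    const headers = await meta.get_http_headers(page);

    console.log('external ip:', ipinfo.ip);
    console.log('user agent seen by httpbin:', headers.headers['User-Agent']);

    await browser.close();
})();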
@ -1,7 +1,6 @@
|
||||
const start_url = {
|
||||
'google': ''
|
||||
};
|
||||
|
||||
'use strict';
|
||||
const meta = require('./metadata.js');
|
||||
const debug = require('debug')('se-scraper:Scraper');
|
||||
/*
|
||||
Get useful JS knowledge and get awesome...
|
||||
|
||||
@ -11,21 +10,28 @@ const start_url = {
|
||||
|
||||
module.exports = class Scraper {
|
||||
constructor(options = {}) {
|
||||
debug('constructor');
|
||||
const {
|
||||
browser = null,
|
||||
config = {},
|
||||
context = {},
|
||||
pluggable = null,
|
||||
page = null,
|
||||
} = options;
|
||||
|
||||
this.page = page;
|
||||
this.last_response = null; // the last response object
|
||||
this.metadata = {
|
||||
scraping_detected: false,
|
||||
};
|
||||
this.pluggable = pluggable;
|
||||
this.browser = browser;
|
||||
this.config = config;
|
||||
this.logger = this.config.logger;
|
||||
this.context = context;
|
||||
|
||||
this.STANDARD_TIMEOUT = 8000;
|
||||
// longer timeout when using proxies
|
||||
this.PROXY_TIMEOUT = 15000;
|
||||
this.proxy = config.proxy;
|
||||
this.keywords = config.keywords;
|
||||
|
||||
this.STANDARD_TIMEOUT = 10000;
|
||||
this.SOLVE_CAPTCHA_TIME = 45000;
|
||||
|
||||
this.results = {};
|
||||
@ -34,20 +40,42 @@ module.exports = class Scraper {
|
||||
this.num_requests = 0;
|
||||
// keep track of the keywords searched
|
||||
this.num_keywords = 0;
|
||||
|
||||
let settings = this.config[`${this.config.search_engine}_settings`];
|
||||
if (settings) {
|
||||
if (typeof settings === 'string') {
|
||||
settings = JSON.parse(settings);
|
||||
this.config[`${this.config.search_engine}_settings`] = settings;
|
||||
}
|
||||
}
|
||||
}
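// Sketch of what the settings-parsing block in the constructor above accepts:
// the per-engine settings may arrive either as an object or as a JSON string
// and are normalized to an object. The keys shown (num, hl, gl) are typical
// query parameters and are assumptions for illustration, not library defaults.
const config_with_object = {
    search_engine: 'google',
    google_settings: { num: 10, hl: 'en', gl: 'us' },
};

const config_with_string = {
    search_engine: 'google',
    // equivalent form, e.g. when the config comes from an environment variable
    google_settings: '{"num": 10, "hl": "en", "gl": "us"}',
};

// after the constructor runs, both configs expose a plain object:
// this.config.google_settings.num === 10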
|
||||
|
||||
async run() {
|
||||
async run({page, data, worker}) {
|
||||
|
||||
let do_continue = await this.load_search_engine();
|
||||
debug('worker=%o', worker, this.config.keywords);
|
||||
|
||||
if (page) {
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
await this.page.setViewport({ width: 1920, height: 1040 });
|
||||
let do_continue = true;
|
||||
|
||||
if (this.config.scrape_from_file.length <= 0) {
|
||||
do_continue = await this.load_search_engine();
|
||||
}
|
||||
|
||||
if (!do_continue) {
|
||||
console.error('Failed to load the search engine: load_search_engine()');
|
||||
return this.results;
|
||||
} else {
|
||||
await this.scraping_loop();
|
||||
}
|
||||
|
||||
await this.scraping_loop();
|
||||
|
||||
return this.results;
|
||||
return {
|
||||
results: this.results,
|
||||
metadata: this.metadata,
|
||||
num_requests: this.num_requests,
|
||||
}
|
||||
}
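// The new run({page, data, worker}) signature above matches the task callback
// of puppeteer-cluster, which node_scraper.js uses to drive the scrapers. A
// hedged sketch of that wiring in isolation (URL and concurrency values are
// placeholders):
const { Cluster } = require('puppeteer-cluster');

(async () => {
    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: 2,
    });

    await cluster.task(async ({ page, data, worker }) => {
        // `data` is whatever was queued below; in se-scraper the cluster hands
        // `page` straight on to Scraper.run({page, data, worker}) as defined above.
        await page.goto(data);
        return await page.title();
    });

    cluster.queue('https://example.com');
    await cluster.idle();
    await cluster.close();
})();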
|
||||
|
||||
/**
|
||||
@ -58,10 +86,10 @@ module.exports = class Scraper {
|
||||
*/
|
||||
async load_search_engine() {
|
||||
|
||||
this.page = await this.browser.newPage();
|
||||
|
||||
// prevent detection by evading common detection techniques
|
||||
await evadeChromeHeadlessDetection(this.page);
|
||||
if (this.config.apply_evasion_techniques === true) {
|
||||
// prevent detection by evading common detection techniques
|
||||
await evadeChromeHeadlessDetection(this.page);
|
||||
}
|
||||
|
||||
// block some assets to speed up scraping
|
||||
if (this.config.block_assets === true) {
|
||||
@ -79,12 +107,35 @@ module.exports = class Scraper {
|
||||
|
||||
if (this.config.test_evasion === true) {
|
||||
// Navigate to the page that will perform the tests.
|
||||
const testUrl = 'https://intoli.com/blog/' +
|
||||
'not-possible-to-block-chrome-headless/chrome-headless-test.html';
|
||||
const testUrl = 'https://bot.sannysoft.com';
|
||||
await this.page.goto(testUrl);
|
||||
|
||||
// Save a screenshot of the results.
|
||||
await this.page.screenshot({path: 'headless-test-result.png'});
|
||||
await this.page.screenshot({path: 'headless-evasion-result.png'});
|
||||
}
|
||||
|
||||
if (this.config.log_http_headers === true) {
|
||||
this.metadata.http_headers = await meta.get_http_headers(this.page);
|
||||
debug('this.metadata.http_headers=%O', this.metadata.http_headers);
|
||||
}
|
||||
|
||||
if (this.config.log_ip_address === true) {
|
||||
let ipinfo = await meta.get_ip_data(this.page);
|
||||
this.metadata.ipinfo = ipinfo;
|
||||
debug('this.metadata.ipinfo', this.metadata.ipinfo);
|
||||
}
|
||||
|
||||
// check that our proxy is working by confirming
|
||||
// that ipinfo.io sees the proxy IP address
|
||||
if (this.proxy && this.config.log_ip_address === true) {
|
||||
debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`);
|
||||
|
||||
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
|
||||
if (!this.proxy.includes(this.metadata.ipinfo.ip)) {
|
||||
throw new Error(`Proxy output ip ${this.proxy} does not match the provided one`);
|
||||
} else {
|
||||
this.logger.info(`Using valid Proxy: ${this.proxy}`);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return await this.load_start_page();
|
||||
@ -98,37 +149,39 @@ module.exports = class Scraper {
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async scraping_loop() {
|
||||
for (let keyword of this.config.keywords) {
|
||||
for (var keyword of this.keywords) {
|
||||
this.num_keywords++;
|
||||
this.keyword = keyword;
|
||||
this.results[keyword] = {};
|
||||
this.result_rank = 1;
|
||||
|
||||
if (this.pluggable.before_keyword_scraped) {
|
||||
await this.pluggable.before_keyword_scraped({
|
||||
num_keywords: this.num_keywords,
|
||||
num_requests: this.num_requests,
|
||||
keyword: keyword,
|
||||
page: this.page,
|
||||
config: this.config,
|
||||
context: this.context,
|
||||
});
|
||||
}
|
||||
|
||||
let page_num = 1;
|
||||
|
||||
try {
|
||||
|
||||
await this.search_keyword(keyword);
|
||||
if (this.pluggable && this.pluggable.before_keyword_scraped) {
|
||||
await this.pluggable.before_keyword_scraped({
|
||||
results: this.results,
|
||||
num_keywords: this.num_keywords,
|
||||
num_requests: this.num_requests,
|
||||
keyword: keyword,
|
||||
});
|
||||
}
|
||||
|
||||
this.page_num = 1;
|
||||
|
||||
// load scraped page from file if `scrape_from_file` is given
|
||||
if (this.config.scrape_from_file.length <= 0) {
|
||||
await this.search_keyword(keyword);
|
||||
} else {
|
||||
this.last_response = await this.page.goto(this.config.scrape_from_file);
|
||||
}
|
||||
|
||||
// when searching the keyword fails, num_requests will not
|
||||
// be incremented.
|
||||
this.num_requests++;
|
||||
|
||||
do {
|
||||
|
||||
if (this.config.verbose === true) {
|
||||
console.log(`${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
|
||||
}
|
||||
this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`);
|
||||
|
||||
await this.wait_for_results();
|
||||
|
||||
@ -138,13 +191,66 @@ module.exports = class Scraper {
|
||||
|
||||
let html = await this.page.content();
|
||||
let parsed = this.parse(html);
|
||||
this.results[keyword][page_num] = parsed ? parsed : await this.parse_async(html);
|
||||
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
|
||||
|
||||
page_num += 1;
|
||||
if (this.config.screen_output) {
|
||||
this.results[keyword][this.page_num].screenshot = await this.page.screenshot({
|
||||
encoding: 'base64',
|
||||
fullPage: false,
|
||||
});
|
||||
}
|
||||
|
||||
if (this.config.html_output) {
|
||||
|
||||
if (this.config.clean_html_output) {
|
||||
await this.page.evaluate(() => {
|
||||
// remove script and style tags
|
||||
Array.prototype.slice.call(document.getElementsByTagName('script')).forEach(
|
||||
function(item) {
|
||||
item.remove();
|
||||
});
|
||||
Array.prototype.slice.call(document.getElementsByTagName('style')).forEach(
|
||||
function(item) {
|
||||
item.remove();
|
||||
});
|
||||
|
||||
// remove all comment nodes
|
||||
var nodeIterator = document.createNodeIterator(
|
||||
document.body,
|
||||
NodeFilter.SHOW_COMMENT,
|
||||
{ acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } }
|
||||
);
|
||||
while(nodeIterator.nextNode()){
|
||||
var commentNode = nodeIterator.referenceNode;
|
||||
commentNode.remove();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (this.config.clean_data_images) {
|
||||
await this.page.evaluate(() => {
|
||||
Array.prototype.slice.call(document.getElementsByTagName('img')).forEach(
|
||||
function(item) {
|
||||
let src = item.getAttribute('src');
|
||||
if (src && src.startsWith('data:')) {
|
||||
item.setAttribute('src', '');
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
let html_contents = await this.page.content();
|
||||
// https://stackoverflow.com/questions/27841112/how-to-remove-white-space-between-html-tags-using-javascript
|
||||
// TODO: not sure if this is safe!
|
||||
html_contents = html_contents.replace(/>\s+</g,'><');
|
||||
this.results[keyword][this.page_num].html = html_contents;
|
||||
}
|
||||
|
||||
this.page_num += 1;
|
||||
|
||||
// only load the next page when we will pass the next iteration
|
||||
// step from the while loop
|
||||
if (page_num <= this.config.num_pages) {
|
||||
if (this.page_num <= this.config.num_pages) {
|
||||
|
||||
let next_page_loaded = await this.next_page();
|
||||
|
||||
@ -155,36 +261,66 @@ module.exports = class Scraper {
|
||||
}
|
||||
}
|
||||
|
||||
} while (page_num <= this.config.num_pages);
|
||||
} while (this.page_num <= this.config.num_pages);
|
||||
|
||||
} catch (e) {
|
||||
|
||||
console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
|
||||
this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`);
|
||||
debug('this.last_response=%O', this.last_response);
|
||||
|
||||
if (await this.detected() === true) {
|
||||
console.error(`${this.config.search_engine} DETECTED the scraping!`);
|
||||
if (this.config.take_screenshot_on_error) {
|
||||
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
|
||||
}
|
||||
|
||||
this.metadata.scraping_detected = await this.detected();
|
||||
|
||||
if (this.metadata.scraping_detected === true) {
|
||||
this.logger.warn(`${this.config.search_engine_name} detected the scraping!`);
|
||||
|
||||
if (this.config.is_local === true) {
|
||||
await this.sleep(this.SOLVE_CAPTCHA_TIME);
|
||||
console.error(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
|
||||
this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`);
|
||||
// expect that user filled out necessary captcha
|
||||
} else {
|
||||
break;
|
||||
if (this.config.throw_on_detection === true) {
|
||||
throw( e );
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// some other error, quit scraping process if stuff is broken
|
||||
if (this.config.is_local === true) {
|
||||
console.error('You have 30 seconds to fix this.');
|
||||
await this.sleep(30000);
|
||||
if (this.config.throw_on_detection === true) {
|
||||
throw( e );
|
||||
} else {
|
||||
break;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic function to append queryArgs to a search engine url.
|
||||
*
|
||||
* @param: The baseUrl to use for the build process.
|
||||
*/
|
||||
build_start_url(baseUrl) {
|
||||
let settings = this.config[`${this.config.search_engine}_settings`];
|
||||
|
||||
if (settings) {
|
||||
for (var key in settings) {
|
||||
baseUrl += `${key}=${settings[key]}&`
|
||||
}
|
||||
|
||||
this.logger.info('Using startUrl: ' + baseUrl);
|
||||
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
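// Hedged standalone mirror of the build_start_url() loop above, to show the
// start URL that a per-engine settings object produces. The engine parameters
// (count, mkt) are illustrative values, not defaults of the library.
function build_start_url_example(baseUrl, settings) {
    for (var key in settings) {
        baseUrl += `${key}=${settings[key]}&`;
    }
    return baseUrl;
}

console.log(build_start_url_example('https://www.bing.com/search?', { count: 20, mkt: 'en-US' }));
// -> https://www.bing.com/search?count=20&mkt=en-US&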
|
||||
|
||||
sleep(ms) {
|
||||
return new Promise(resolve => {
|
||||
setTimeout(resolve, ms)
|
||||
@ -194,9 +330,7 @@ module.exports = class Scraper {
|
||||
async random_sleep() {
|
||||
const [min, max] = this.config.sleep_range;
|
||||
let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
|
||||
if (this.config.debug === true) {
|
||||
console.log(`Sleeping for ${rand}s`);
|
||||
}
|
||||
this.logger.info(`Sleeping for ${rand}s`);
|
||||
await this.sleep(rand * 1000);
|
||||
}
|
||||
|
||||
@ -210,15 +344,35 @@ module.exports = class Scraper {
|
||||
no_results(needles, html) {
|
||||
for (let needle of needles) {
|
||||
if (html.includes(needle)) {
|
||||
if (this.config.debug) {
|
||||
console.log(`HTML contains needle ${needle}. no_results=true`);
|
||||
}
|
||||
this.logger.warn(`HTML contains needle ${needle}. no_results=true`);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
Throw away all elements that do not have data in the
|
||||
specified attributes. Attribute values must be strings.
|
||||
*/
|
||||
clean_results(results, attributes) {
|
||||
const cleaned = [];
|
||||
for (var res of results) {
|
||||
let goodboy = true;
|
||||
for (var attr of attributes) {
|
||||
if (!res[attr] || !res[attr].trim()) {
|
||||
goodboy = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (goodboy) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
return cleaned;
|
||||
}
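// Hedged usage sketch of clean_results() above: entries missing (or blank in)
// any of the required attributes are dropped, the rest get an incrementing
// rank. The sample data is invented for illustration.
const raw = [
    { title: 'Node.js', link: 'https://nodejs.org', snippet: '...' },
    { title: '   ', link: 'https://example.com' },  // dropped: blank title
    { title: 'No link here' },                      // dropped: missing link
];

// inside a Scraper subclass (this.result_rank starts at 1 per keyword):
// const cleaned = this.clean_results(raw, ['title', 'link']);
// -> [ { title: 'Node.js', link: 'https://nodejs.org', snippet: '...', rank: 1 } ]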
|
||||
|
||||
parse(html) {
|
||||
|
||||
}
|
||||
@ -265,127 +419,131 @@ module.exports = class Scraper {
|
||||
|
||||
// This is where we'll put the code to get around the tests.
|
||||
async function evadeChromeHeadlessDetection(page) {
|
||||
// Pass the Webdriver Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const newProto = navigator.__proto__;
|
||||
delete newProto.webdriver;
|
||||
navigator.__proto__ = newProto;
|
||||
});
|
||||
|
||||
// Pass the Chrome Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// We can mock this in as much depth as we need for the test.
|
||||
const mockObj = {
|
||||
app: {
|
||||
isInstalled: false,
|
||||
},
|
||||
webstore: {
|
||||
onInstallStageChanged: {},
|
||||
onDownloadProgress: {},
|
||||
},
|
||||
runtime: {
|
||||
PlatformOs: {
|
||||
MAC: 'mac',
|
||||
WIN: 'win',
|
||||
ANDROID: 'android',
|
||||
CROS: 'cros',
|
||||
LINUX: 'linux',
|
||||
OPENBSD: 'openbsd',
|
||||
},
|
||||
PlatformArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
PlatformNaclArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
RequestUpdateCheckStatus: {
|
||||
THROTTLED: 'throttled',
|
||||
NO_UPDATE: 'no_update',
|
||||
UPDATE_AVAILABLE: 'update_available',
|
||||
},
|
||||
OnInstalledReason: {
|
||||
INSTALL: 'install',
|
||||
UPDATE: 'update',
|
||||
CHROME_UPDATE: 'chrome_update',
|
||||
SHARED_MODULE_UPDATE: 'shared_module_update',
|
||||
},
|
||||
OnRestartRequiredReason: {
|
||||
APP_UPDATE: 'app_update',
|
||||
OS_UPDATE: 'os_update',
|
||||
PERIODIC: 'periodic',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
window.navigator.chrome = mockObj;
|
||||
window.chrome = mockObj;
|
||||
});
|
||||
|
||||
// Pass the Permissions Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.__proto__.query = parameters =>
|
||||
parameters.name === 'notifications'
|
||||
? Promise.resolve({state: Notification.permission})
|
||||
: originalQuery(parameters);
|
||||
|
||||
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
||||
const oldCall = Function.prototype.call;
|
||||
function call() {
|
||||
return oldCall.apply(this, arguments);
|
||||
}
|
||||
Function.prototype.call = call;
|
||||
|
||||
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
||||
const oldToString = Function.prototype.toString;
|
||||
|
||||
function functionToString() {
|
||||
if (this === window.navigator.permissions.query) {
|
||||
return "function query() { [native code] }";
|
||||
}
|
||||
if (this === functionToString) {
|
||||
return nativeToStringFunctionString;
|
||||
}
|
||||
return oldCall.call(oldToString, this);
|
||||
}
|
||||
Function.prototype.toString = functionToString;
|
||||
});
|
||||
|
||||
// Pass the Plugins Length Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
// This just needs to have `length > 0` for the current test,
|
||||
// but we could mock the plugins too if necessary.
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
// Pass the Webdriver Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const newProto = navigator.__proto__;
|
||||
delete newProto.webdriver;
|
||||
navigator.__proto__ = newProto;
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the Languages Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
// Pass the Chrome Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// We can mock this in as much depth as we need for the test.
|
||||
const mockObj = {
|
||||
app: {
|
||||
isInstalled: false,
|
||||
},
|
||||
webstore: {
|
||||
onInstallStageChanged: {},
|
||||
onDownloadProgress: {},
|
||||
},
|
||||
runtime: {
|
||||
PlatformOs: {
|
||||
MAC: 'mac',
|
||||
WIN: 'win',
|
||||
ANDROID: 'android',
|
||||
CROS: 'cros',
|
||||
LINUX: 'linux',
|
||||
OPENBSD: 'openbsd',
|
||||
},
|
||||
PlatformArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
PlatformNaclArch: {
|
||||
ARM: 'arm',
|
||||
X86_32: 'x86-32',
|
||||
X86_64: 'x86-64',
|
||||
},
|
||||
RequestUpdateCheckStatus: {
|
||||
THROTTLED: 'throttled',
|
||||
NO_UPDATE: 'no_update',
|
||||
UPDATE_AVAILABLE: 'update_available',
|
||||
},
|
||||
OnInstalledReason: {
|
||||
INSTALL: 'install',
|
||||
UPDATE: 'update',
|
||||
CHROME_UPDATE: 'chrome_update',
|
||||
SHARED_MODULE_UPDATE: 'shared_module_update',
|
||||
},
|
||||
OnRestartRequiredReason: {
|
||||
APP_UPDATE: 'app_update',
|
||||
OS_UPDATE: 'os_update',
|
||||
PERIODIC: 'periodic',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
window.navigator.chrome = mockObj;
|
||||
window.chrome = mockObj;
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the iframe Test
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
||||
get: function() {
|
||||
return window;
|
||||
// Pass the Permissions Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.__proto__.query = parameters =>
|
||||
parameters.name === 'notifications'
|
||||
? Promise.resolve({state: Notification.permission})
|
||||
: originalQuery(parameters);
|
||||
|
||||
// Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js
|
||||
const oldCall = Function.prototype.call;
|
||||
|
||||
function call() {
|
||||
return oldCall.apply(this, arguments);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Pass toString test, though it breaks console.debug() from working
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
window.console.debug = () => {
|
||||
return null;
|
||||
};
|
||||
});
|
||||
}
|
||||
Function.prototype.call = call;
|
||||
|
||||
const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString");
|
||||
const oldToString = Function.prototype.toString;
|
||||
|
||||
function functionToString() {
|
||||
if (this === window.navigator.permissions.query) {
|
||||
return "function query() { [native code] }";
|
||||
}
|
||||
if (this === functionToString) {
|
||||
return nativeToStringFunctionString;
|
||||
}
|
||||
return oldCall.call(oldToString, this);
|
||||
}
|
||||
|
||||
Function.prototype.toString = functionToString;
|
||||
});
|
||||
|
||||
// Pass the Plugins Length Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
// This just needs to have `length > 0` for the current test,
|
||||
// but we could mock the plugins too if necessary.
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the Languages Test.
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
// Overwrite the `plugins` property to use a custom getter.
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
});
|
||||
});
|
||||
|
||||
// Pass the iframe Test
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
|
||||
get: function () {
|
||||
return window;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Pass toString test, though it breaks console.debug() from working
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
window.console.debug = () => {
|
||||
return null;
|
||||
};
|
||||
});
|
||||
}
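// Hedged sketch of wiring the evasion helper above into a standalone puppeteer
// script. evadeChromeHeadlessDetection() is not exported from this module, so
// this sketch assumes it lives in the same file; the test URL mirrors the one
// used when config.test_evasion is true.
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // must run before the first navigation, so the evaluateOnNewDocument()
    // overrides are installed on every document the page loads
    await evadeChromeHeadlessDetection(page);

    await page.goto('https://bot.sannysoft.com');
    await page.screenshot({ path: 'headless-evasion-result.png' });
    await browser.close();
})();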
|
||||
|
@ -1,215 +0,0 @@
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class YahooFinanceScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const results = [];
|
||||
$('.js-stream-content .Cf').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('h3 a').attr('href'),
|
||||
title: $(link).find('h3').text(),
|
||||
snippet: $(link).find('p').text(),
|
||||
})
|
||||
});
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: results,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://finance.yahoo.com/');
|
||||
for (var i = 0; i < 3; i++) {
|
||||
let consent = await this.page.waitForSelector('[type="submit"]');
|
||||
await consent.click();
|
||||
}
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
await this.page.goto(`https://finance.yahoo.com/quote/${keyword}/news?p=${keyword}`);
|
||||
await this.page.waitForSelector('#quote-header-info', { timeout: 8000 });
|
||||
await this.sleep(1000);
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#b_content', { timeout: 5000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
class MarketwatchFinanceScraper extends Scraper {
|
||||
|
||||
async parse_async(html) {
|
||||
let res = await this.page.evaluate(() => {
|
||||
let results = [];
|
||||
// get the hotel elements
|
||||
let items = document.querySelectorAll('.article__content');
|
||||
// get the hotel data
|
||||
items.forEach((newsitem) => {
|
||||
let data = {};
|
||||
try {
|
||||
data.link = newsitem.querySelector('.article__headline a').getAttribute('href');
|
||||
data.title = newsitem.querySelector('.article__headline a').innerText;
|
||||
data.date = newsitem.querySelector('.article__timestamp').innerText;
|
||||
data.author = newsitem.querySelector('.article__author').innerText;
|
||||
}
|
||||
catch (exception) {
|
||||
console.error('Error parsing marketwatch data: ', exception);
|
||||
}
|
||||
results.push(data);
|
||||
});
|
||||
return results;
|
||||
});
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: res,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
await this.page.goto(`https://www.marketwatch.com/investing/stock/${keyword}`);
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.intraday__data', { timeout: 8000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
class ReutersFinanceScraper extends Scraper {
|
||||
|
||||
async parse_async(html) {
|
||||
let newsData = await this.page.evaluate(() => {
|
||||
let results = [];
|
||||
// get the hotel elements
|
||||
let items = document.querySelectorAll('div.feature');
|
||||
// get the hotel data
|
||||
items.forEach((newsitem) => {
|
||||
let data = {};
|
||||
try {
|
||||
data.link = newsitem.querySelector('h2 a').getAttribute('href');
|
||||
data.link = 'https://www.reuters.com' + data.link;
|
||||
data.title = newsitem.querySelector('h2 a').innerText;
|
||||
data.snippet = newsitem.querySelector('p').innerText;
|
||||
data.date = newsitem.querySelector('.timestamp').innerText;
|
||||
}
|
||||
catch (exception) {
|
||||
console.error('Error parsing reuters data: ', exception);
|
||||
}
|
||||
results.push(data);
|
||||
});
|
||||
return results;
|
||||
});
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: newsData,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
await this.page.goto(`https://www.reuters.com/finance/stocks/overview/${keyword}`);
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#sectionHeader', { timeout: 8000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
class CnbcFinanceScraper extends Scraper {
|
||||
|
||||
async parse_async(html) {
|
||||
let newsData = await this.page.evaluate(() => {
|
||||
let results = [];
|
||||
// get the hotel elements
|
||||
let items = document.querySelectorAll('div.headline');
|
||||
// get the hotel data
|
||||
items.forEach((newsitem) => {
|
||||
let data = {};
|
||||
try {
|
||||
data.link = newsitem.querySelector('a').getAttribute('href');
|
||||
data.title = newsitem.querySelector('[ng-bind="asset.headline"]').innerText;
|
||||
data.date = newsitem.querySelector('span.note').innerText;
|
||||
}
|
||||
catch (exception) {
|
||||
console.error('Error parsing cnbc data: ', exception);
|
||||
}
|
||||
results.push(data);
|
||||
});
|
||||
return results;
|
||||
});
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: newsData,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
await this.page.goto(`https://www.cnbc.com/quotes/?symbol=${keyword}&tab=news`);
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('#quote_title_and_chart', { timeout: 8000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
YahooFinanceScraper: YahooFinanceScraper,
|
||||
ReutersFinanceScraper: ReutersFinanceScraper,
|
||||
CnbcFinanceScraper: CnbcFinanceScraper,
|
||||
MarketwatchFinanceScraper: MarketwatchFinanceScraper,
|
||||
};
|
@ -1,81 +0,0 @@
|
||||
module.exports = {
|
||||
random_user_agent: random_user_agent,
|
||||
};
|
||||
|
||||
function random_user_agent() {
|
||||
let rand = user_agents[Math.floor(Math.random()*user_agents.length)];
|
||||
}
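// Note: as written above, the removed helper picks a random entry but never
// returns it. A corrected sketch of the same idea (user_agents is the array
// defined right below in this file):
function random_user_agent_fixed() {
    return user_agents[Math.floor(Math.random() * user_agents.length)];
}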
|
||||
|
||||
// updated: 29 Jan 2019
|
||||
const user_agents = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
|
||||
'Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/18.11.1.805 Yowser/2.5 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.106',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
|
||||
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
|
||||
'Mozilla/5.0 (Windows NT 10.0; rv:64.0) Gecko/20100101 Firefox/64.0',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
|
||||
'Mozilla/5.0 (X11; CrOS x86_64 11151.59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.94 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
];
|
src/modules/yandex.js (new file, 114 lines)
@ -0,0 +1,114 @@
|
||||
'use strict';
|
||||
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class YandexScraper extends Scraper {
|
||||
|
||||
constructor(...args) {
|
||||
super(...args);
|
||||
}
|
||||
|
||||
async parse_async(html) {
|
||||
|
||||
let results = await this.page.evaluate(() => {
|
||||
let serp_items = document.querySelectorAll('.serp-item');
|
||||
const data = [];
|
||||
serp_items.forEach((item) => {
|
||||
let obj = {
|
||||
is_ad: false,
|
||||
};
|
||||
try {
|
||||
if (item) {
|
||||
|
||||
let linkElement = item.querySelector('h2 a.link');
|
||||
|
||||
if (linkElement) {
|
||||
obj.link = linkElement.getAttribute('href');
|
||||
obj.title = linkElement.innerText;
|
||||
}
|
||||
|
||||
|
||||
let label = item.querySelector('.organic__subtitle .label');
|
||||
|
||||
if (label) {
|
||||
let labelText = label.innerText;
|
||||
|
||||
if (labelText) {
|
||||
labelText = labelText.trim().toLowerCase();
|
||||
console.log(labelText);
|
||||
let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio'];
|
||||
obj.is_ad = ad_labels.includes(labelText);
|
||||
}
|
||||
}
|
||||
|
||||
obj.snippet = item.querySelector('.text-container.typo').innerText;
|
||||
obj.visible_link = item.querySelector('.typo_type_greenurl').innerText;
|
||||
|
||||
if (obj.title) {
|
||||
data.push(obj);
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
}
|
||||
});
|
||||
return data;
|
||||
});
|
||||
|
||||
let num_results = await this.page.evaluate(() => {
|
||||
let num_results = document.querySelector('.serp-adv__found');
|
||||
if (num_results) {
|
||||
return num_results.innerText;
|
||||
}
|
||||
});
|
||||
|
||||
const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
num_results: num_results,
|
||||
results: cleaned,
|
||||
};
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
let startUrl = 'https://yandex.com';
|
||||
|
||||
this.logger.info('Using startUrl: ' + startUrl);
|
||||
|
||||
this.last_response = await this.page.goto(startUrl);
|
||||
|
||||
await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="text"]');
|
||||
await this.set_input_value(`input[name="text"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.pager .pager__item_kind_next', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.main__content', { timeout: this.STANDARD_TIMEOUT });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
YandexScraper: YandexScraper,
|
||||
};
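// Hedged sketch of the ad-label check that YandexScraper.parse_async() above
// runs inside page.evaluate(): a result is flagged as an ad when the label text
// under .organic__subtitle matches one of the known "ad" words. The sample
// input is invented for illustration.
function is_ad_label(labelText) {
    if (!labelText) {
        return false;
    }
    const ad_labels = ['ad', 'werbung', 'реклама', 'anuncio'];
    return ad_labels.includes(labelText.trim().toLowerCase());
}

console.log(is_ad_label(' Реклама ')); // true: Russian "ad" label, case and whitespace insensitive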
|
@ -1,105 +0,0 @@
|
||||
const cheerio = require('cheerio');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class YoutubeScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('#video-title').attr('href'),
|
||||
title: $(link).find('#video-title').text(),
|
||||
snippet: $(link).find('#description-text').text(),
|
||||
channel: $(link).find('#byline a').text(),
|
||||
channel_link: $(link).find('#byline a').attr('href'),
|
||||
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
|
||||
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
|
||||
})
|
||||
});
|
||||
|
||||
let no_results = this.no_results(
|
||||
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
|
||||
$('yt-showing-results-for-renderer').text()
|
||||
);
|
||||
|
||||
let effective_query = $('#corrected-link').text() || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.title = res.title.trim();
|
||||
res.snippet = res.snippet.trim();
|
||||
res.rank = this.result_rank++;
|
||||
|
||||
// check if this result has been used before
|
||||
if (this.all_videos.has(res.title) === false) {
|
||||
cleaned.push(res);
|
||||
}
|
||||
this.all_videos.add(res.title);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
num_results: '',
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
this.all_videos = new Set();
|
||||
await this.page.goto('https://www.youtube.com', {
|
||||
referer: 'https://google.com'
|
||||
});
|
||||
await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
|
||||
// before we do anything, parse the results of the front page of youtube
|
||||
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
|
||||
await this.sleep(500);
|
||||
let html = await this.page.content();
|
||||
this.results['frontpage'] = this.parse(html);
|
||||
this.result_rank = 1;
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[id="search"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
// youtube needs scrolling
|
||||
// TODO: implement scrolling, no priority right now
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForFunction(`document.title.indexOf('${this.keyword}') !== -1`, { timeout: 5000 });
|
||||
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
const title = await this.page.title();
|
||||
let html = await this.page.content();
|
||||
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
YoutubeScraper: YoutubeScraper,
|
||||
};
|
@ -1,291 +1,411 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
const zlib = require('zlib');
|
||||
var fs = require('fs');
|
||||
'use strict';
|
||||
|
||||
// local module imports
|
||||
const fs = require('fs');
|
||||
const os = require('os');
|
||||
const _ = require('lodash');
|
||||
const { createLogger, format, transports } = require('winston');
|
||||
const { combine, timestamp, printf } = format;
|
||||
const debug = require('debug')('se-scraper:ScrapeManager');
|
||||
const { Cluster } = require('puppeteer-cluster');
|
||||
|
||||
const UserAgent = require('user-agents');
|
||||
const google = require('./modules/google.js');
|
||||
const bing = require('./modules/bing.js');
|
||||
const baidu = require('./modules/baidu.js');
|
||||
const yandex = require('./modules/yandex.js');
|
||||
const infospace = require('./modules/infospace.js');
|
||||
const youtube = require('./modules/youtube.js');
|
||||
const ua = require('./modules/user_agents.js');
|
||||
const meta = require('./modules/metadata.js');
|
||||
const duckduckgo = require('./modules/duckduckgo.js');
|
||||
const tickersearch = require('./modules/ticker_search.js');
|
||||
const CustomConcurrencyImpl = require('./concurrency-implementation');
|
||||
|
||||
const MAX_ALLOWED_BROWSERS = 6;
|
||||
|
||||
function write_results(fname, data) {
|
||||
fs.writeFileSync(fname, data, (err) => {
|
||||
if (err) throw err;
|
||||
console.log(`Results written to file ${fname}`);
|
||||
});
|
||||
fs.writeFileSync(fname, data, (err) => {
|
||||
if (err) throw err;
|
||||
console.log(`Results written to file ${fname}`);
|
||||
});
|
||||
}
|
||||
|
||||
module.exports.handler = async function handler (event, context, callback) {
|
||||
config = event;
|
||||
pluggable = {};
|
||||
if (config.custom_func) {
|
||||
if (fs.existsSync(config.custom_func)) {
|
||||
try {
|
||||
Pluggable = require(config.custom_func);
|
||||
pluggable = new Pluggable({config: config});
|
||||
} catch (exception) {
|
||||
console.error(exception);
|
||||
}
|
||||
} else {
|
||||
console.error(`File "${config.custom_func}" does not exist...`);
|
||||
}
|
||||
}
|
||||
function read_keywords_from_file(fname) {
|
||||
let kws = fs.readFileSync(fname).toString().split(os.EOL);
|
||||
// clean keywords
|
||||
kws = kws.filter((kw) => {
|
||||
return kw.trim().length > 0;
|
||||
});
|
||||
return kws;
|
||||
}
|
||||
|
||||
try {
|
||||
const startTime = Date.now();
|
||||
config = parseEventData(config);
|
||||
if (config.debug === true) {
|
||||
console.log(config);
|
||||
}
|
||||
|
||||
var ADDITIONAL_CHROME_FLAGS = [
|
||||
'--disable-infobars',
|
||||
'--window-position=0,0',
|
||||
'--ignore-certifcate-errors',
|
||||
'--ignore-certifcate-errors-spki-list',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-accelerated-2d-canvas',
|
||||
'--disable-gpu',
|
||||
'--window-size=1920x1080',
|
||||
'--hide-scrollbars',
|
||||
];
|
||||
function getScraper(search_engine, args) {
|
||||
if (typeof search_engine === 'string') {
|
||||
return new {
|
||||
google: google.GoogleScraper,
|
||||
google_news_old: google.GoogleNewsOldScraper,
|
||||
google_news: google.GoogleNewsScraper,
|
||||
google_image: google.GoogleImageScraper,
|
||||
bing: bing.BingScraper,
|
||||
yandex: yandex.YandexScraper,
|
||||
bing_news: bing.BingNewsScraper,
|
||||
duckduckgo: duckduckgo.DuckduckgoScraper,
|
||||
infospace: infospace.InfospaceScraper,
|
||||
webcrawler: infospace.WebcrawlerNewsScraper,
|
||||
}[search_engine](args);
|
||||
} else if (typeof search_engine === 'function') {
|
||||
return new search_engine(args);
|
||||
} else {
|
||||
throw new Error(`search_engine must either be a string or a class (function)`);
|
||||
}
|
||||
}
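// Hedged usage sketch of getScraper() above: search_engine is either one of the
// string keys in the lookup table or a Scraper subclass, which is how callers
// can plug in their own engines. MyScraper and the commented calls below are
// placeholders, not part of the library.
const Scraper = require('./modules/se_scraper');

class MyScraper extends Scraper {
    async load_start_page() { return true; }
    async search_keyword(keyword) { /* navigate to the keyword's URL */ }
    async next_page() { return false; }
    async wait_for_results() {}
    async detected() {}
    parse(html) { return { results: [] }; }
}

// built-in engine selected by name:
// const scraper = getScraper('bing', { config, context, pluggable });
// custom engine passed as a class:
// const custom = getScraper(MyScraper, { config, context, pluggable });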
|
||||
|
||||
let USER_AGENT = '';
|
||||
|
||||
if (config.user_agent) {
|
||||
USER_AGENT = config.user_agent;
|
||||
}
|
||||
class ScrapeManager {
|
||||
|
||||
if (config.random_user_agent === true) {
|
||||
USER_AGENT = ua.random_user_agent();
|
||||
}
|
||||
constructor(config, context={}) {
|
||||
|
||||
if (USER_AGENT) {
|
||||
ADDITIONAL_CHROME_FLAGS.push(
|
||||
`--user-agent="${USER_AGENT}"`
|
||||
)
|
||||
}
|
||||
this.cluster = null;
|
||||
this.pluggable = null;
|
||||
this.scraper = null;
|
||||
this.context = context;
|
||||
|
||||
if (config.proxy) {
|
||||
// check this out bubbles
|
||||
// https://www.systutorials.com/241062/how-to-set-google-chromes-proxy-settings-in-command-line-on-linux/
|
||||
// [<proxy-scheme>://]<proxy-host>[:<proxy-port>]
|
||||
// "http", "socks", "socks4", "socks5".
|
||||
ADDITIONAL_CHROME_FLAGS.push(
|
||||
'--proxy-server=' + config.proxy,
|
||||
)
|
||||
}
|
||||
        this.config = _.defaults(config, {
            // the user agent to scrape with
            user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
            // if random_user_agent is set to True, a random user agent is chosen
            random_user_agent: false,
            // whether to select manual settings in visible mode
            set_manual_settings: false,
            // log ip address data
            log_ip_address: false,
            // log http headers
            log_http_headers: false,
            // how long to sleep between requests. a random sleep interval within the range [a,b]
            // is drawn before every request. empty string for no sleeping.
            sleep_range: null,
            // which search engine to scrape
            search_engine: 'google',
            search_engine_name: 'google',
            logger: createLogger({
                level: 'info',
                format: combine(
                    timestamp(),
                    printf(({ level, message, timestamp }) => {
                        return `${timestamp} [${level}] ${message}`;
                    })
                ),
                transports: [
                    new transports.Console()
                ]
            }),
            keywords: ['nodejs rocks',],
            // whether to start the browser in headless mode
            headless: true,
            // specify flags passed to chrome here
            // About our default values https://peter.sh/experiments/chromium-command-line-switches/
            chrome_flags: [
                '--disable-infobars',
                '--window-position=0,0',
                '--ignore-certificate-errors',
                '--ignore-certificate-errors-spki-list',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu',
                '--window-size=1920,1040',
                '--start-fullscreen',
                '--hide-scrollbars',
                '--disable-notifications',
            ],
            // the number of pages to scrape for each keyword
            num_pages: 1,
            // path to output file, data will be stored in JSON
            output_file: '',
            // whether to also passthru all the html output of the serp pages
            html_output: false,
            // whether to strip JS and CSS from the html_output
            // has only an effect if `html_output` is true
            clean_html_output: true,
            // remove all data images from the html
            clean_data_images: true,
            // whether to return a screenshot of serp pages as b64 data
            screen_output: false,
            // Scrape url from local file. Mainly used for testing.
            scrape_from_file: '',
            // whether to prevent images, css, fonts and media from being loaded
            // will speed up scraping a great deal
            block_assets: true,
            // path to js module that extends functionality
            // this module should export the functions:
            // get_browser, handle_metadata, close_browser
            //custom_func: resolve('examples/pluggable.js'),
            custom_func: null,
            throw_on_detection: false,
            // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
            proxies: null,
            // a file with one proxy per line. Example:
            // socks5://78.94.172.42:1080
            // http://118.174.233.10:48400
            proxy_file: '',
            // whether to use proxies only
            // when this is set to true, se-scraper will not use
            // your default IP address
            use_proxies_only: false,
            // check if headless chrome escapes common detection techniques
            // this is a quick test and should be used for debugging
            test_evasion: false,
            apply_evasion_techniques: true,
            // settings for puppeteer-cluster
            puppeteer_cluster_config: {
                timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
                monitor: false,
                concurrency: Cluster.CONCURRENCY_BROWSER,
                maxConcurrency: 1,
            }
        });
|
||||
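        // Example (illustrative values): anything passed to the constructor overrides the
        // corresponding default above via _.defaults(); all other keys keep their defaults.
        // const manager = new ScrapeManager({
        //     search_engine: 'bing',
        //     keywords: ['example query'],
        //     num_pages: 2,
        //     random_user_agent: true,
        // });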
|
||||
let launch_args = {
|
||||
args: ADDITIONAL_CHROME_FLAGS,
|
||||
headless: config.headless,
|
||||
ignoreHTTPSErrors: true,
|
||||
};
|
||||
this.logger = this.config.logger;
|
||||
|
||||
if (config.debug === true) {
|
||||
console.log("Chrome Args: ", launch_args);
|
||||
}
|
||||
if (config.sleep_range) {
|
||||
// parse an array
|
||||
config.sleep_range = eval(config.sleep_range);
|
||||
|
||||
if (pluggable.start_browser) {
|
||||
launch_args.config = config;
|
||||
browser = await pluggable.start_browser(launch_args);
|
||||
} else {
|
||||
browser = await puppeteer.launch(launch_args);
|
||||
}
|
||||
            if (config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
                throw new Error("sleep_range is not a valid array of two integers.");
            }
|
||||
}
|
||||
|
||||
let metadata = {};
|
||||
if (fs.existsSync(this.config.keyword_file)) {
|
||||
this.config.keywords = read_keywords_from_file(this.config.keyword_file);
|
||||
}
|
||||
|
||||
if (config.log_http_headers === true) {
|
||||
metadata.http_headers = await meta.get_http_headers(browser);
|
||||
}
|
||||
if (this.config.proxies && this.config.proxy_file) {
|
||||
throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
|
||||
}
|
||||
|
||||
if (config.log_ip_address === true) {
|
||||
metadata.ipinfo = await meta.get_ip_data(browser);
|
||||
}
|
||||
if (this.config.proxy_file) {
|
||||
this.config.proxies = read_keywords_from_file(this.config.proxy_file);
|
||||
this.logger.info(`${this.config.proxies.length} proxies read from file.`);
|
||||
}
|
||||
|
||||
// check that our proxy is working by confirming
|
||||
// that ipinfo.io sees the proxy IP address
|
||||
if (config.proxy && config.log_ip_address === true) {
|
||||
console.log(`${metadata.ipinfo} vs ${config.proxy}`);
|
||||
if (!this.config.proxies && this.config.use_proxies_only) {
|
||||
throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
|
||||
}
|
||||
|
||||
try {
|
||||
// if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here
|
||||
if (!config.proxy.includes(metadata.ipinfo.ip)) {
|
||||
console.error('Proxy not working properly.');
|
||||
await browser.close();
|
||||
return;
|
||||
}
|
||||
} catch (exception) {
|
||||
debug('this.config=%O', this.config);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Launches the puppeteer cluster or browser.
|
||||
*
|
||||
* Returns true if the browser was successfully launched. Otherwise will return false.
|
||||
*/
|
||||
async start() {
|
||||
|
||||
var results = {};
|
||||
if (this.config.custom_func) {
|
||||
if (fs.existsSync(this.config.custom_func)) {
|
||||
try {
|
||||
const PluggableClass = require(this.config.custom_func);
|
||||
this.pluggable = new PluggableClass({
|
||||
config: this.config,
|
||||
context: this.context
|
||||
});
|
||||
} catch (exception) {
|
||||
console.error(exception);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
console.error(`File "${this.config.custom_func}" does not exist!`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Scraper = {
|
||||
google: google.GoogleScraper,
|
||||
google_news_old: google.GoogleNewsOldScraper,
|
||||
google_news: google.GoogleNewsScraper,
|
||||
google_image: google.GoogleImageScraper,
|
||||
bing: bing.BingScraper,
|
||||
bing_news: bing.BingNewsScraper,
|
||||
duckduckgo: duckduckgo.DuckduckgoScraper,
|
||||
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
|
||||
infospace: infospace.InfospaceScraper,
|
||||
webcrawler: infospace.WebcrawlerNewsScraper,
|
||||
baidu: baidu.BaiduScraper,
|
||||
youtube: youtube.YoutubeScraper,
|
||||
yahoo_news: tickersearch.YahooFinanceScraper,
|
||||
reuters: tickersearch.ReutersFinanceScraper,
|
||||
cnbc: tickersearch.CnbcFinanceScraper,
|
||||
marketwatch: tickersearch.MarketwatchFinanceScraper,
|
||||
}[config.search_engine];
|
||||
const chrome_flags = _.clone(this.config.chrome_flags);
|
||||
|
||||
if (Scraper === undefined) {
|
||||
console.info('This search_engine is not implemented:', config.search_engine);
|
||||
} else {
|
||||
scraperObj = new Scraper({
|
||||
browser: browser,
|
||||
config: config,
|
||||
context: context,
|
||||
pluggable: pluggable,
|
||||
});
|
||||
results = await scraperObj.run();
|
||||
}
|
||||
if (this.pluggable && this.pluggable.start_browser) {
|
||||
launch_args.config = this.config;
|
||||
this.browser = await this.pluggable.start_browser({
|
||||
config: this.config,
|
||||
});
|
||||
this.page = await this.browser.newPage();
|
||||
} else {
|
||||
// if no custom start_browser functionality was given
|
||||
// use puppeteer-cluster for scraping
|
||||
|
||||
if (pluggable.close_browser) {
|
||||
await pluggable.close_browser();
|
||||
} else {
|
||||
await browser.close();
|
||||
}
|
||||
let proxies;
|
||||
// if we have at least one proxy, always use CONCURRENCY_BROWSER
|
||||
// and set maxConcurrency to this.config.proxies.length + 1
|
||||
// otherwise use whatever configuration was passed
|
||||
if (this.config.proxies && this.config.proxies.length > 0) {
|
||||
|
||||
let num_requests = scraperObj.num_requests;
|
||||
let timeDelta = Date.now() - startTime;
|
||||
let ms_per_request = timeDelta/num_requests;
|
||||
// because we use real browsers, normal laptops run out of memory
// when using more than about 5 or 6 of them,
// therefore hardcode a limit here
// TODO not sure this is what we want
|
||||
this.numClusters = Math.min(
|
||||
this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1),
|
||||
MAX_ALLOWED_BROWSERS
|
||||
);
|
||||
proxies = _.clone(this.config.proxies);
|
||||
|
||||
if (config.verbose === true) {
|
||||
console.log(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||
console.log(`On average ms/request: ${ms_per_request}ms/request`);
|
||||
console.dir(results, {depth: null, colors: true});
|
||||
}
|
||||
// Insert a first config without proxy if use_proxy_only is false
|
||||
if (this.config.use_proxies_only === false) {
|
||||
proxies.unshift(null);
|
||||
}
|
||||
|
||||
if (config.compress === true) {
|
||||
results = JSON.stringify(results);
|
||||
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
|
||||
results = zlib.deflateSync(results).toString('base64');
|
||||
}
|
||||
} else {
|
||||
this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency;
|
||||
proxies = _.times(this.numClusters, null);
|
||||
}
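// Worked example (illustrative): with 3 proxies and use_proxies_only = false the
// branch above gives numClusters = min(3 + 1, MAX_ALLOWED_BROWSERS) = 4, i.e. three
// proxied browsers plus one on the local IP; with use_proxies_only = true it is 3.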
|
||||
|
||||
if (pluggable.handle_results) {
|
||||
await pluggable.handle_results({
|
||||
config: config,
|
||||
results: results,
|
||||
});
|
||||
}
|
||||
this.logger.info(`Using ${this.numClusters} clusters.`);
|
||||
|
||||
metadata.id = `${config.job_name} ${config.chunk_lines}`;
|
||||
metadata.chunk_lines = config.chunk_lines;
|
||||
metadata.elapsed_time = timeDelta.toString();
|
||||
metadata.ms_per_keyword = ms_per_request.toString();
|
||||
metadata.num_requests = num_requests;
|
||||
// Give the per browser options
|
||||
const perBrowserOptions = _.map(proxies, (proxy) => {
|
||||
const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent;
|
||||
let args = chrome_flags.concat([`--user-agent=${userAgent}`]);
|
||||
|
||||
if (config.verbose === true) {
|
||||
console.log(metadata);
|
||||
}
|
||||
if (proxy) {
|
||||
args = args.concat([`--proxy-server=${proxy}`]);
|
||||
}
|
||||
|
||||
if (pluggable.handle_metadata) {
|
||||
await pluggable.handle_metadata({metadata: metadata, config: config});
|
||||
}
|
||||
return {
|
||||
headless: this.config.headless,
|
||||
ignoreHTTPSErrors: true,
|
||||
args
|
||||
};
|
||||
});
|
||||
|
||||
if (config.output_file) {
|
||||
write_results(config.output_file, JSON.stringify(results));
|
||||
}
|
||||
debug('perBrowserOptions=%O', perBrowserOptions)
|
||||
|
||||
let response = {
|
||||
headers: {
|
||||
'Content-Type': 'text/json',
|
||||
},
|
||||
results: results,
|
||||
metadata: metadata || {},
|
||||
statusCode: 200
|
||||
};
|
||||
this.cluster = await Cluster.launch({
|
||||
monitor: this.config.puppeteer_cluster_config.monitor,
|
||||
timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
|
||||
concurrency: CustomConcurrencyImpl,
|
||||
maxConcurrency: this.numClusters,
|
||||
puppeteerOptions: {
|
||||
perBrowserOptions: perBrowserOptions
|
||||
}
|
||||
});
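// Each entry of perBrowserOptions corresponds to one browser worker, which is why the
// custom concurrency implementation (CustomConcurrencyImpl) is used: stock
// puppeteer-cluster applies the same puppeteerOptions to every browser, while here
// every worker can get its own proxy and user agent.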
|
||||
}
|
||||
}
|
||||
|
||||
callback(null, response);
|
||||
/*
|
||||
* Scrapes the keywords specified by the config.
|
||||
*/
|
||||
async scrape(scrape_config = {}) {
|
||||
|
||||
} catch (e) {
|
||||
callback(e, null);
|
||||
}
|
||||
if (!scrape_config.keywords && !scrape_config.keyword_file) {
|
||||
throw new Error('Either keywords or keyword_file must be supplied to scrape()');
|
||||
}
|
||||
|
||||
Object.assign(this.config, scrape_config);
|
||||
|
||||
var results = {};
|
||||
var num_requests = 0;
|
||||
var metadata = {};
|
||||
var startTime = Date.now();
|
||||
|
||||
this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
|
||||
|
||||
this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);
|
||||
|
||||
if (this.pluggable && this.pluggable.start_browser) {
|
||||
|
||||
this.scraper = getScraper(this.config.search_engine, {
|
||||
config: this.config,
|
||||
context: this.context,
|
||||
pluggable: this.pluggable,
|
||||
page: this.page,
|
||||
});
|
||||
|
||||
var {results, metadata, num_requests} = await this.scraper.run(this.page);
|
||||
|
||||
} else {
|
||||
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
||||
// https://github.com/GoogleChrome/puppeteer/issues/678
|
||||
// The question is: Is it possible to set proxies per Page? Per Browser?
|
||||
// as far as I can see, puppeteer cluster uses the same puppeteerOptions
|
||||
// for every browser instance. We will use our custom puppeteer-cluster version.
|
||||
// https://www.npmjs.com/package/proxy-chain
|
||||
// this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077
|
||||
let chunks = [];
|
||||
for (var n = 0; n < this.numClusters; n++) {
|
||||
chunks.push([]);
|
||||
}
|
||||
for (var k = 0; k < this.config.keywords.length; k++) {
|
||||
chunks[k % this.numClusters].push(this.config.keywords[k]);
|
||||
}
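// Worked example (illustrative): with numClusters = 2 and
// keywords = ['a', 'b', 'c', 'd', 'e'], the round-robin above yields
// chunks = [['a', 'c', 'e'], ['b', 'd']], so each browser scrapes its own share.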
|
||||
|
||||
debug('chunks=%o', chunks);
|
||||
|
||||
let execPromises = [];
|
||||
for (var c = 0; c < chunks.length; c++) {
|
||||
const config = _.clone(this.config);
|
||||
config.keywords = chunks[c];
|
||||
|
||||
var obj = getScraper(this.config.search_engine, {
|
||||
config: config,
|
||||
context: {},
|
||||
pluggable: this.pluggable,
|
||||
});
|
||||
|
||||
var boundMethod = obj.run.bind(obj);
|
||||
execPromises.push(this.cluster.execute({}, boundMethod));
|
||||
}
|
||||
|
||||
let promiseReturns = await Promise.all(execPromises);
|
||||
|
||||
// Merge results and metadata per keyword
|
||||
for (let promiseReturn of promiseReturns) {
|
||||
Object.assign(results, promiseReturn.results);
|
||||
Object.assign(metadata, promiseReturn.metadata);
|
||||
num_requests += promiseReturn.num_requests;
|
||||
}
|
||||
}
|
||||
|
||||
let timeDelta = Date.now() - startTime;
|
||||
let ms_per_request = timeDelta/num_requests;
|
||||
|
||||
this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
|
||||
this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);
|
||||
|
||||
if (this.pluggable && this.pluggable.handle_results) {
|
||||
await this.pluggable.handle_results(results);
|
||||
}
|
||||
|
||||
metadata.elapsed_time = timeDelta.toString();
|
||||
metadata.ms_per_keyword = ms_per_request.toString();
|
||||
metadata.num_requests = num_requests;
|
||||
|
||||
debug('metadata=%O', metadata);
|
||||
|
||||
if (this.pluggable && this.pluggable.handle_metadata) {
|
||||
await this.pluggable.handle_metadata(metadata);
|
||||
}
|
||||
|
||||
if (this.config.output_file) {
|
||||
this.logger.info(`Writing results to ${this.config.output_file}`);
|
||||
write_results(this.config.output_file, JSON.stringify(results, null, 4));
|
||||
}
|
||||
|
||||
return {
|
||||
results: results,
|
||||
metadata: metadata || {},
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
* Quit the puppeteer cluster/browser.
|
||||
*/
|
||||
async quit() {
|
||||
if (this.pluggable && this.pluggable.close_browser) {
|
||||
await this.pluggable.close_browser();
|
||||
} else {
|
||||
await this.cluster.idle();
|
||||
await this.cluster.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
ScrapeManager: ScrapeManager,
|
||||
};
|
||||
|
||||
function parseEventData(config) {
|
||||
|
||||
function _bool(e) {
|
||||
e = String(e);
|
||||
if (typeof e.trim === "function") {
|
||||
return e.trim().toLowerCase() == 'true';
|
||||
} else {
|
||||
return e.toLowerCase() == 'true';
|
||||
}
|
||||
}
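// Examples: _bool('true') === true, _bool(' True ') === true,
// _bool('false') === false, _bool(undefined) === false
// (String(undefined) is 'undefined', which is not 'true').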
|
||||
|
||||
if (config.debug) {
|
||||
config.debug = _bool(config.debug);
|
||||
}
|
||||
|
||||
if (config.verbose) {
|
||||
config.verbose = _bool(config.verbose);
|
||||
}
|
||||
|
||||
if (config.upload_to_s3) {
|
||||
config.upload_to_s3 = _bool(config.upload_to_s3);
|
||||
}
|
||||
|
||||
if (config.log_ip_address) {
|
||||
config.log_ip_address = _bool(config.log_ip_address);
|
||||
}
|
||||
|
||||
if (config.log_http_headers) {
|
||||
config.log_http_headers = _bool(config.log_http_headers);
|
||||
}
|
||||
|
||||
if (config.random_user_agent) {
|
||||
config.random_user_agent = _bool(config.random_user_agent);
|
||||
}
|
||||
|
||||
if (config.compress) {
|
||||
config.compress = _bool(config.compress);
|
||||
}
|
||||
|
||||
if (config.is_local) {
|
||||
config.is_local = _bool(config.is_local);
|
||||
}
|
||||
|
||||
if (config.max_results) {
|
||||
config.max_results = parseInt(config.max_results);
|
||||
}
|
||||
|
||||
if (config.set_manual_settings) {
|
||||
config.set_manual_settings = _bool(config.set_manual_settings);
|
||||
}
|
||||
|
||||
if (config.block_assets) {
|
||||
config.block_assets = _bool(config.block_assets);
|
||||
}
|
||||
|
||||
if (config.sleep_range) {
|
||||
// parse an array
|
||||
config.sleep_range = eval(config.sleep_range);
|
||||
|
||||
        if (config.sleep_range.length !== 2 || typeof config.sleep_range[0] !== 'number' || typeof config.sleep_range[1] !== 'number') {
            throw new Error("sleep_range is not a valid array of two integers.");
        }
|
||||
}
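    // Hedged alternative (not in the original): sleep_range arrives as a string such as
    // "[2, 5]", so JSON.parse can validate it without eval's code-execution risk.
    function parse_sleep_range(raw) {
        const range = JSON.parse(raw);
        if (!Array.isArray(range) || range.length !== 2 ||
            typeof range[0] !== 'number' || typeof range[1] !== 'number') {
            throw new Error('sleep_range is not a valid array of two integers.');
        }
        return range;
    }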
|
||||
|
||||
return config;
|
||||
}
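// End-to-end usage as exercised by the tests that follow: start() boots the
// cluster/browser, scrape() runs the keywords, quit() tears everything down.
// This is a standalone sketch with illustrative values, not part of this file.
const se_scraper = require('se-scraper');
(async () => {
    const scraper = new se_scraper.ScrapeManager({
        search_engine: 'google',
        keywords: ['test keyword'],
    });
    await scraper.start();
    const { results, metadata } = await scraper.scrape({ keywords: ['test keyword'] });
    console.log(results, metadata);
    await scraper.quit();
})();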
|
101
test/html_output.js
Normal file
@ -0,0 +1,101 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const se_scraper = require('../');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/search', (req, res) => {
|
||||
debug('q=%s', req.query.q);
|
||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
||||
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
|
||||
|
||||
describe('Config', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
httpsServer.close();
|
||||
httpServer.close();
|
||||
proxy.close();
|
||||
});
|
||||
|
||||
describe('html_output', function(){
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
/**
|
||||
* Test html_output option
|
||||
*/
|
||||
it('html_output single page single keyword', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: 'google',
|
||||
/* TODO refactor start_url
|
||||
google_settings: {
|
||||
start_url: 'http://localhost:' + httpPort
|
||||
},
|
||||
*/
|
||||
keywords: ['test keyword'],
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
logger: testLogger,
|
||||
html_output: true,
|
||||
//clean_html_output: false,
|
||||
//clean_data_images: false,
|
||||
// TODO refactor start_url so we can use-it instead of depending of the proxy for this test
|
||||
proxies: ['http://localhost:' + proxyPort],
|
||||
use_proxies_only: true,
|
||||
});
|
||||
await scraper.start();
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
await scraper.quit();
|
||||
|
||||
assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided');
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
23
test/mocks/bing/index.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
42
test/mocks/bing/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
40
test/mocks/bing/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
148
test/mocks/duckduckgo/index.html
Normal file
@ -0,0 +1,148 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if IEMobile 7 ]> <html lang="en_US" class="no-js iem7"> <![endif]-->
|
||||
<!--[if lt IE 7]> <html class="ie6 lt-ie10 lt-ie9 lt-ie8 lt-ie7 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 7]> <html class="ie7 lt-ie10 lt-ie9 lt-ie8 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 8]> <html class="ie8 lt-ie10 lt-ie9 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if IE 9]> <html class="ie9 lt-ie10 no-js" lang="en_US"> <![endif]-->
|
||||
<!--[if (gte IE 9)|(gt IEMobile 7)|!(IEMobile)|!(IE)]><!--><html class="no-js" lang="en_US"><!--<![endif]-->
|
||||
|
||||
<head>
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
||||
<meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1" />
|
||||
<meta name="HandheldFriendly" content="true"/>
|
||||
|
||||
<link rel="canonical" href="https://duckduckgo.com/">
|
||||
|
||||
<link rel="stylesheet" href="/s1847.css" type="text/css">
|
||||
|
||||
<link rel="stylesheet" href="/o1847.css" type="text/css">
|
||||
|
||||
|
||||
|
||||
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon"/>
|
||||
<link rel="apple-touch-icon" href="/assets/icons/meta/DDG-iOS-icon_60x60.png"/>
|
||||
<link rel="apple-touch-icon" sizes="76x76" href="/assets/icons/meta/DDG-iOS-icon_76x76.png"/>
|
||||
<link rel="apple-touch-icon" sizes="120x120" href="/assets/icons/meta/DDG-iOS-icon_120x120.png"/>
|
||||
<link rel="apple-touch-icon" sizes="152x152" href="/assets/icons/meta/DDG-iOS-icon_152x152.png"/>
|
||||
<link rel="image_src" href="/assets/icons/meta/DDG-icon_256x256.png"/>
|
||||
<link rel="manifest" href="/manifest.json"/>
|
||||
|
||||
<meta name="twitter:card" content="summary">
|
||||
<meta name="twitter:site" value="@duckduckgo">
|
||||
|
||||
<meta property="og:url" content="https://duckduckgo.com/" />
|
||||
<meta property="og:site_name" content="DuckDuckGo" />
|
||||
<meta property="og:image" content="https://duckduckgo.com/assets/logo_social-media.png">
|
||||
|
||||
|
||||
<title>DuckDuckGo — Privacy, simplified.</title>
|
||||
<meta property="og:title" content="DuckDuckGo — Privacy, simplified." />
|
||||
|
||||
|
||||
<meta property="og:description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
|
||||
<meta name="description" content="The Internet privacy company that empowers you to seamlessly take control of your personal information online, without any tradeoffs.">
|
||||
|
||||
|
||||
</head>
|
||||
<body id="pg-index" class="page-index body--home">
|
||||
<script type="text/javascript">
|
||||
var settings_js_version = "/s2475.js",
|
||||
locale = "en_US";
|
||||
</script>
|
||||
<script type="text/javascript" src="/lib/l113.js"></script>
|
||||
<script type="text/javascript" src="/locale/en_US/duckduckgo14.js"></script>
|
||||
<script type="text/javascript" src="/util/u418.js"></script>
|
||||
<script type="text/javascript" src="/d2727.js"></script>
|
||||
|
||||
|
||||
|
||||
<script type="text/javascript">
|
||||
DDG.page = new DDG.Pages.Home();
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
<div class="site-wrapper site-wrapper--home js-site-wrapper">
|
||||
|
||||
|
||||
<div class="header-wrap--home js-header-wrap">
|
||||
<div class="header--aside js-header-aside"></div>
|
||||
<div class="js-header-home-search header-wrap--home__search">
|
||||
<div class="logo-wrap--home">
|
||||
<a id="logo_homepage_link" class="logo_homepage" href="/about">
|
||||
About DuckDuckGo
|
||||
<span class="logo_homepage__tt">Duck it!</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<form id="search_form_homepage_top" class="search search--home js-search-form-top" name="x" method="POST" action="/html">
|
||||
<input class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
|
||||
<input class="search__button js-search-button" type="submit" tabindex="2" value="S" />
|
||||
<input class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
|
||||
<div class="search__hidden js-search-hidden"></div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
<div id="" class="content-wrap--home">
|
||||
<div id="content_homepage" class="content--home">
|
||||
<div class="cw--c">
|
||||
<div class="logo-wrap--home">
|
||||
<a id="logo_homepage_link" class="logo_homepage" href="/about">
|
||||
About DuckDuckGo
|
||||
<span class="logo_homepage__tt">Duck it!</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div class="search-wrap--home">
|
||||
<form id="search_form_homepage" class="search search--home js-search-form" name="x" method="POST" action="/html">
|
||||
<input id="search_form_input_homepage" class="search__input js-search-input" type="text" autocomplete="off" name="q" tabindex="1" value="">
|
||||
<input id="search_button_homepage" class="search__button js-search-button" type="submit" tabindex="2" value="S" />
|
||||
<input id="search_form_input_clear" class="search__clear empty js-search-clear" type="button" tabindex="3" value="X" />
|
||||
<div id="search_elements_hidden" class="search__hidden js-search-hidden"></div>
|
||||
</form>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<!-- en_US All Settings -->
|
||||
<noscript>
|
||||
<div class="tag-home">
|
||||
<div class="tag-home__wrapper">
|
||||
<div class="tag-home__item">
|
||||
The search engine that doesn't track you.
|
||||
<span class="hide--screen-xs"><a href="/about" class="tag-home__link">Learn More</a>.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</noscript>
|
||||
<div class="tag-home tag-home--slide no-js__hide js-tag-home"></div>
|
||||
<div id="error_homepage"></div>
|
||||
|
||||
|
||||
|
||||
|
||||
</div> <!-- cw -->
|
||||
</div> <!-- content_homepage //-->
|
||||
</div> <!-- content_wrapper_homepage //-->
|
||||
<div id="footer_homepage" class="foot-home js-foot-home"></div>
|
||||
|
||||
<script type="text/javascript">
|
||||
{function seterr(str) {
|
||||
var error=document.getElementById('error_homepage');
|
||||
error.innerHTML=str;
|
||||
$(error).css('display','block');
|
||||
}
|
||||
var err=new RegExp('[\?\&]e=([^\&]+)');var errm=new Array();errm['2']='no search';errm['3']='search too long';errm['4']='not UTF\u002d8 encoding';errm['6']='too many search terms';if (err.test(window.location.href)) seterr('Oops, '+(errm[RegExp.$1]?errm[RegExp.$1]:'there was an error.')+' Please try again');};
|
||||
|
||||
if (kurl) {
|
||||
document.getElementById("logo_homepage_link").href += (document.getElementById("logo_homepage_link").href.indexOf('?')==-1 ? '?t=i' : '') + kurl;
|
||||
}
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
</div> <!-- site-wrapper -->
|
||||
</body>
|
||||
</html>
|
3
test/mocks/duckduckgo/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
3
test/mocks/duckduckgo/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
358
test/mocks/google/index.html
Normal file
File diff suppressed because one or more lines are too long
209
test/mocks/google/test keyword_page1.html
Normal file
File diff suppressed because one or more lines are too long
206
test/mocks/google/test keyword_page2.html
Normal file
File diff suppressed because one or more lines are too long
191
test/mocks/google/test keyword_page3.html
Normal file
File diff suppressed because one or more lines are too long
123
test/modules/bing.js
Normal file
@ -0,0 +1,123 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { BingScraper } = require('../../src/modules/bing');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/search', (req, res, next) => {
|
||||
debug('q=%s', req.query.q);
|
||||
const pageNumber = Math.round((req.query.first || 0) /10) + 1;
|
||||
res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']}));
|
||||
|
||||
describe('Module Bing', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
proxy.close();
|
||||
httpsServer.close();
|
||||
httpServer.close();
|
||||
});
|
||||
|
||||
let browser;
|
||||
let page;
|
||||
beforeEach(async function(){
|
||||
debug('Start a new browser');
|
||||
browser = await puppeteer.launch({
|
||||
//dumpio: true,
|
||||
//headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||
});
|
||||
debug('Open a fresh page');
|
||||
page = await browser.newPage();
|
||||
});
|
||||
|
||||
afterEach(async function(){
|
||||
await browser.close();
|
||||
});
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
it('one keyword one page', function(){
|
||||
const bingScraper = new BingScraper({
|
||||
config: {
|
||||
search_engine_name: 'bing',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
}
|
||||
});
|
||||
bingScraper.STANDARD_TIMEOUT = 500;
|
||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'Must do one request');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed');
|
||||
});
|
||||
});
|
||||
|
||||
it('one keyword 3 pages', function () {
|
||||
const bingScraper = new BingScraper({
|
||||
config: {
|
||||
search_engine_name: 'bing',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 3,
|
||||
}
|
||||
});
|
||||
bingScraper.STANDARD_TIMEOUT = 500;
|
||||
return bingScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 3, 'Must do three requests');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
|
||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2');
|
||||
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
|
||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3');
|
||||
});
|
||||
});
|
||||
|
||||
});
|
140
test/modules/duckduckgo.js
Normal file
@ -0,0 +1,140 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { DuckduckgoScraper } = require('../../src/modules/duckduckgo');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.use(express.urlencoded({ extended: true }))
|
||||
fakeSearchEngine.get('/', (req, res, next) => {
|
||||
if(!req.query.q){
|
||||
return next();
|
||||
}
|
||||
debug('q=%s page=%d', req.query.q, req.query.page);
|
||||
const pageNumber = req.query.page;
|
||||
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.post('/html', (req, res) => {
|
||||
debug('body=%o', req.body);
|
||||
const pageNumber = 1;
|
||||
res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']}));
|
||||
|
||||
describe('Module DuckDuckGo', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('proxy askedHost=%s method=%s url=%s toPort=%s',
|
||||
ctx.clientToProxyRequest.headers.host,
|
||||
ctx.clientToProxyRequest.method,
|
||||
ctx.clientToProxyRequest.url,
|
||||
ctx.proxyToServerRequestOptions.port
|
||||
);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
proxy.close();
|
||||
httpsServer.close();
|
||||
httpServer.close();
|
||||
});
|
||||
|
||||
let browser;
|
||||
let page;
|
||||
beforeEach(async function(){
|
||||
debug('Start a new browser');
|
||||
browser = await puppeteer.launch({
|
||||
//dumpio: true,
|
||||
//headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||
});
|
||||
debug('Open a fresh page');
|
||||
page = await browser.newPage();
|
||||
});
|
||||
|
||||
afterEach(async function(){
|
||||
await browser.close();
|
||||
});
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
it('one keyword one page', function(){
|
||||
const duckduckgoScraper = new DuckduckgoScraper({
|
||||
config: {
|
||||
search_engine_name: 'duckduckgo',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
}
|
||||
});
|
||||
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
|
||||
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'Must do one request');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
|
||||
});
|
||||
});
|
||||
|
||||
it('one keyword 3 pages', function () {
|
||||
this.timeout(4000);
|
||||
const duckduckgoScraper = new DuckduckgoScraper({
|
||||
config: {
|
||||
search_engine_name: 'duckduckgo',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 3,
|
||||
}
|
||||
});
|
||||
duckduckgoScraper.STANDARD_TIMEOUT = 1000;
|
||||
return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 3, 'Must do three requests');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1');
|
||||
debug('results page 1 %O',results['test keyword']['1'].results);
|
||||
debug('results page 2 %O', results['test keyword']['2'].results);
|
||||
assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2');
|
||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 2');
|
||||
assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3');
|
||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on first organic result page 3');
|
||||
});
|
||||
});
|
||||
|
||||
});
|
123
test/modules/google.js
Normal file
@ -0,0 +1,123 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const puppeteer = require('puppeteer');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const path = require('path');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const { GoogleScraper } = require('../../src/modules/google');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.get('/search', (req, res) => {
|
||||
debug('q=%s', req.query.q);
|
||||
const pageNumber = ((req.query.start/10) || 0) + 1;
|
||||
res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
|
||||
});
|
||||
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
|
||||
|
||||
describe('Module Google', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort });
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
proxy.close();
|
||||
httpsServer.close();
|
||||
httpServer.close();
|
||||
});
|
||||
|
||||
let browser;
|
||||
let page;
|
||||
beforeEach(async function(){
|
||||
debug('Start a new browser');
|
||||
browser = await puppeteer.launch({
|
||||
//dumpio: true,
|
||||
//headless: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [ '--proxy-server=http://localhost:' + proxyPort ]
|
||||
});
|
||||
debug('Open a fresh page');
|
||||
page = await browser.newPage();
|
||||
});
|
||||
|
||||
afterEach(async function(){
|
||||
await browser.close();
|
||||
});
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
it('one keyword one page', function(){
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
}
|
||||
});
|
||||
googleScraper.STANDARD_TIMEOUT = 500;
|
||||
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 1, 'Must do one request');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed');
|
||||
});
|
||||
});
|
||||
|
||||
it('one keyword 3 pages', function () {
|
||||
const googleScraper = new GoogleScraper({
|
||||
config: {
|
||||
search_engine_name: 'google',
|
||||
throw_on_detection: true,
|
||||
keywords: ['test keyword'],
|
||||
logger: testLogger,
|
||||
scrape_from_file: '',
|
||||
num_pages: 3,
|
||||
}
|
||||
});
|
||||
googleScraper.STANDARD_TIMEOUT = 500;
|
||||
return googleScraper.run({page}).then(({results, metadata, num_requests}) => {
|
||||
assert.strictEqual(num_requests, 3, 'Must do three requests');
|
||||
assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1');
|
||||
assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1');
|
||||
assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2');
|
||||
assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 2');
|
||||
assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3');
|
||||
assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 3');
|
||||
});
|
||||
});
|
||||
|
||||
});
|
161
test/proxy.js
Normal file
@ -0,0 +1,161 @@
|
||||
'use strict';
|
||||
const express = require('express');
|
||||
const { createLogger, transports } = require('winston');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const assert = require('assert');
|
||||
const keyCert = require('key-cert');
|
||||
const Promise = require('bluebird');
|
||||
const Proxy = require('http-mitm-proxy');
|
||||
|
||||
const debug = require('debug')('se-scraper:test');
|
||||
const se_scraper = require('../');
|
||||
const Scraper = require('../src/modules/se_scraper');
|
||||
|
||||
const httpPort = 3012;
|
||||
const httpsPort = httpPort + 1;
|
||||
const proxyPort = httpPort + 2;
|
||||
|
||||
const fakeSearchEngine = express();
|
||||
fakeSearchEngine.set('trust proxy', 'loopback');
|
||||
fakeSearchEngine.get('/test-proxy', (req, res) => {
|
||||
debug('fake-search-engine req.hostname=%s', req.hostname);
|
||||
//debug('req to', req.socket.localAddress, req.socket.localPort);
|
||||
res.send(req.hostname);
|
||||
});
|
||||
|
||||
describe('Config', function(){
|
||||
|
||||
let httpServer, httpsServer, proxy;
|
||||
before(async function(){
|
||||
// Here mount our fake engine in both http and https listen server
|
||||
httpServer = http.createServer(fakeSearchEngine);
|
||||
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
|
||||
|
||||
proxy = Proxy();
|
||||
proxy.onRequest((ctx, callback) => {
|
||||
ctx.proxyToServerRequestOptions.host = 'localhost';
|
||||
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
|
||||
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
|
||||
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
|
||||
return callback();
|
||||
});
|
||||
|
||||
await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
|
||||
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
|
||||
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
|
||||
debug('Fake http search engine servers started');
|
||||
});
|
||||
|
||||
after(function(){
|
||||
httpsServer.close();
|
||||
httpServer.close();
|
||||
proxy.close();
|
||||
});
|
||||
|
||||
describe('proxies', function(){
|
||||
|
||||
class MockScraperTestProxy extends Scraper {
|
||||
|
||||
async load_start_page(){
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(){
|
||||
await this.page.goto('http://test.local:' + httpPort + '/test-proxy');
|
||||
}
|
||||
|
||||
async parse_async(){
|
||||
const bodyHandle = await this.page.$('body');
|
||||
return await this.page.evaluate(body => body.innerHTML, bodyHandle);
|
||||
}
|
||||
}
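// The three overrides above are all a Scraper subclass needs; a hedged sketch of a
// real-world variant (the URL and the keyword parameter are assumptions, mirroring
// how the shipped engine modules implement search_keyword):
class ExampleTitleScraper extends Scraper {
    async load_start_page() {
        return true;
    }
    async search_keyword(keyword) {
        await this.page.goto('https://example.com/search?q=' + encodeURIComponent(keyword));
    }
    async parse_async() {
        return await this.page.evaluate(() => document.title);
    }
}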
|
||||
|
||||
const testLogger = createLogger({
|
||||
transports: [
|
||||
new transports.Console({
|
||||
level: 'error'
|
||||
})
|
||||
]
|
||||
});
|
||||
|
||||
/**
|
||||
* Jobs will be executed 2 by 2 through the proxy and direct connection
|
||||
* THIS TEST NEED TO HAVE test.local 127.0.0.1 in /etc/hosts because chrome bypass localhost even with proxy set
|
||||
*/
|
||||
it('one proxy given, use_proxies_only=false', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: MockScraperTestProxy,
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
proxies: ['http://localhost:' + proxyPort],
|
||||
// default is use_proxies_only: false,
|
||||
logger: testLogger,
|
||||
});
|
||||
await scraper.start();
|
||||
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
assert.strictEqual(results['news']['1'], 'test.local');
|
||||
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['i work too much']['1'], 'test.local');
|
||||
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['javascript is hard']['1'], 'test.local');
|
||||
|
||||
await scraper.quit();
|
||||
});
|
||||
|
||||
/**
|
||||
* Jobs will be executed 1 by 1 through the proxy
|
||||
*/
|
||||
it('one proxy given, use_proxies_only=true', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: MockScraperTestProxy,
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
};
|
||||
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
proxies: ['http://localhost:' + proxyPort],
|
||||
use_proxies_only: true,
|
||||
logger: testLogger,
|
||||
});
|
||||
await scraper.start();
|
||||
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine');
|
||||
assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine');
|
||||
|
||||
await scraper.quit();
|
||||
});
|
||||
|
||||
it('zero proxy given, use_proxies_only=true', async function () {
|
||||
|
||||
const scrape_job = {
|
||||
search_engine: MockScraperTestProxy,
|
||||
keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
|
||||
};
|
||||
|
||||
await assert.rejects(async () => {
|
||||
var scraper = new se_scraper.ScrapeManager({
|
||||
throw_on_detection: true,
|
||||
use_proxies_only: true,
|
||||
logger: testLogger,
|
||||
});
|
||||
await scraper.start();
|
||||
const { results } = await scraper.scrape(scrape_job);
|
||||
await scraper.quit();
|
||||
}, /Must provide at least one proxy in proxies if you enable use_proxies_only/);
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
@ -1,203 +0,0 @@
|
||||
const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('normal_search_test()');
    await se_scraper.scrape(config, normal_search_test_case);
}

// we test with a callback function passed to our handler
function normal_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 6);

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.equal(obj.no_results, false, 'no results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.visible_link, 'visible_link must be ok');
                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

const keywords_no_results = ['2342kljp;fj9834u40abJAkasdlfkjsladfkjasfdas;lk3453-934023safkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug: false,
        verbose: false,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('no_results_test()');
    await se_scraper.scrape(config, test_case_no_results);
}

// we test with a callback function passed to our handler
function test_case_no_results(err, response) {
    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 1);

        let results = response.results;
        for (let query in response.results) {

            assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert(obj.results.length === 0, 'results must have 0 SERP objects');
                assert.equal(obj.no_results, true, 'no results should be true');
                assert.isEmpty(obj.num_results, 'num_results should be an empty string');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        search_engine: 'bing',
        compress: false,
        debug: false,
        verbose: false,
        keywords: effective_query_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('effective_query_test()');
    await se_scraper.scrape(config, test_case_effective_query);
}

// we test with a callback function passed to our handler
function test_case_effective_query(err, response) {

    if (err) {
        console.error(err);
    } else {

        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 1);

        let results = response.results;
        for (let query in response.results) {

            assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                // the effective query must be different from the original keyword
                assert.isOk(obj.effective_query, 'effective query must be ok');
                assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
                assert(obj.effective_query !== query, 'effective query must be different from keyword');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.equal(obj.no_results, false, 'no results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

(async () => {
    await normal_search_test();
    await no_results_test();
    await effective_query_test();
})();

@@ -1,145 +0,0 @@

const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'duckduckgo',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: false,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('normal_search_test()');
    await se_scraper.scrape(config, normal_search_test_case);
}

// we test with a callback function passed to our handler
function normal_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 4);

        for (let query in response.results) {
            let total_rank = 1;

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link', 'snippet'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.visible_link, 'visible_link must be ok');
                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

const effective_query_keywords = ['mount everrest'];

async function effective_query_test() {
    let config = {
        search_engine: 'duckduckgo',
        compress: false,
        debug: false,
        verbose: false,
        keywords: effective_query_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('effective_query_test()');
    await se_scraper.scrape(config, test_case_effective_query);
}

// we test with a callback function passed to our handler
function test_case_effective_query(err, response) {

    if (err) {
        console.error(err);
    } else {

        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 1);

        let results = response.results;
        for (let query in response.results) {

            assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'effective_query'], 'not all keys are in the object');

                // the effective query must be different from the original keyword
                assert.isOk(obj.effective_query, 'effective query must be ok');
                assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
                assert(obj.effective_query !== query, 'effective query must be different from keyword');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

(async () => {
    await normal_search_test();
    await effective_query_test();
})();

@@ -1,204 +0,0 @@

const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple tree', 'weather tomorrow'];

async function normal_search_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 3,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('normal_search_test()');
    await se_scraper.scrape(config, normal_search_test_case);
}

// we test with a callback function passed to our handler
function normal_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 6);

        for (let query in response.results) {
            let total_rank = 1;

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
                assert.equal(obj.no_results, false, 'no results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'rank', 'visible_link'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.visible_link, 'visible_link must be ok');
                    assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
                    assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

const keywords_no_results = ['fgskl34440abJAksafkl34a44dsflkjaQQuBBdfk',];

async function no_results_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: keywords_no_results,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('no_results_test()');
    await se_scraper.scrape(config, test_case_no_results);
}

// we test with a callback function passed to our handler
function test_case_no_results(err, response) {
    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 1);

        let results = response.results;
        for (let query in response.results) {

            assert.containsAllKeys(response.results, keywords_no_results, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                assert(obj.results.length === 0, 'results must have 0 SERP objects');
                assert.equal(obj.no_results, true, 'no results should be true');
                assert.isEmpty(obj.num_results, 'num_results should be an empty string');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

const effective_query_keywords = ['mount evverrest'];

async function effective_query_test() {
    let config = {
        search_engine: 'google',
        compress: false,
        debug: false,
        verbose: false,
        keywords: effective_query_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };
    console.log('effective_query_test()');
    await se_scraper.scrape(config, test_case_effective_query);
}

// we test with a callback function passed to our handler
function test_case_effective_query(err, response) {

    if (err) {
        console.error(err);
    } else {

        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 1);

        let results = response.results;
        for (let query in response.results) {

            assert.containsAllKeys(response.results, effective_query_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query'], 'not all keys are in the object');

                // the effective query must be different from the original keyword
                assert.isOk(obj.effective_query, 'effective query must be ok');
                assert.isNotEmpty(obj.effective_query, 'effective query must be valid');
                assert(obj.effective_query !== query, 'effective query must be different from keyword');

                assert.isAtLeast(obj.results.length, 8, 'results must have at least 8 SERP objects');
                assert.equal(obj.no_results, false, 'no results should be false');
                assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
                assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
            }
        }
    }
}

(async () => {
    await normal_search_test();
    await no_results_test();
    await effective_query_test();
})();

@@ -1,85 +0,0 @@

const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const normal_search_keywords = ['apple', 'rain'];

async function normal_image_search_test() {
    let config = {
        search_engine: 'google_image',
        compress: false,
        debug: false,
        verbose: false,
        keywords: normal_search_keywords,
        keyword_file: '',
        num_pages: 2,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('normal_image_search_test()');
    await se_scraper.scrape(config, normal_image_search_test_case);
}

// we test with a callback function passed to our handler
function normal_image_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');
        assert.equal(response.metadata.num_requests, 2);

        for (let query in response.results) {

            let total_rank = 1;

            assert.containsAllKeys(response.results, normal_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'effective_query'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 15, 'results must have at least 15 SERP objects');
                assert.equal(obj.no_results, false, 'no results should be false');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'snippet', 'rank', 'clean_link'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.clean_link, 'clean_link must be ok');
                    assert.typeOf(res.clean_link, 'string', 'clean_link must be string');
                    assert.isAtLeast(res.clean_link.length, 5, 'clean_link must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isNumber(res.rank, 'rank must be integer');
                    assert.equal(res.rank, total_rank++, 'rank is wrong');
                }
            }
        }
    }
}

(async () => {
    await normal_image_search_test();
})();

@@ -1,221 +0,0 @@

const se_scraper = require('./../index.js');
var assert = require('chai').assert;

/*
 * Use chai and mocha for tests.
 * https://mochajs.org/#installation
 */

const quote_search_keywords = ['MSFT', 'AAPL'];

async function reuters_search_test() {
    let config = {
        search_engine: 'reuters',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('reuters_search_test()');
    await se_scraper.scrape(config, reuters_search_test_case);
}

// we test with a callback function passed to our handler
function reuters_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'snippet'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.snippet, 'snippet must be ok');
                    assert.typeOf(res.snippet, 'string', 'snippet must be string');
                    assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

async function cnbc_search_test() {
    let config = {
        search_engine: 'cnbc',
        compress: false,
        debug: false,
        verbose: false,
        keywords: quote_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('cnbc_search_test()');
    await se_scraper.scrape(config, cnbc_search_test_case);
}

// we test with a callback function passed to our handler
function cnbc_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, quote_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

const marketwatch_search_keywords = ['MSFT'];

async function marketwatch_search_test() {
    let config = {
        search_engine: 'marketwatch',
        compress: false,
        debug: false,
        verbose: false,
        keywords: marketwatch_search_keywords,
        keyword_file: '',
        num_pages: 1,
        headless: true,
        output_file: '',
        block_assets: true,
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        random_user_agent: false,
    };

    console.log('marketwatch_search_test()');
    await se_scraper.scrape(config, marketwatch_search_test_case);
}

// we test with a callback function passed to our handler
function marketwatch_search_test_case(err, response) {

    if (err) {
        console.error(err);
    } else {
        assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
        assert.equal(response.statusCode, 200, 'status code must be 200');

        for (let query in response.results) {
            let total_rank = 1;
            assert.containsAllKeys(response.results, marketwatch_search_keywords, 'not all keywords were scraped.');

            for (let page_number in response.results[query]) {

                assert.isNumber(parseInt(page_number), 'page_number must be numeric');

                let obj = response.results[query][page_number];

                assert.containsAllKeys(obj, ['results', 'time'], 'not all keys are in the object');

                assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
                assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');

                for (let res of obj.results) {

                    assert.containsAllKeys(res, ['link', 'title', 'date', 'author'], 'not all keys are in the SERP object');

                    assert.isOk(res.link, 'link must be ok');
                    assert.typeOf(res.link, 'string', 'link must be string');
                    assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');

                    assert.isOk(res.title, 'title must be ok');
                    assert.typeOf(res.title, 'string', 'title must be string');
                    assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

                    assert.isOk(res.author, 'author must be ok');
                    assert.typeOf(res.author, 'string', 'author must be string');
                    assert.isAtLeast(res.author.length, 5, 'author must have at least 5 chars');

                    assert.isOk(res.date, 'date must be ok');
                    assert.typeOf(res.date, 'string', 'date must be string');
                    assert.isAtLeast(res.date.length, 5, 'date must have at least 5 chars');
                }
            }
        }
    }
}

(async () => {
    await reuters_search_test();
    await cnbc_search_test();
    await marketwatch_search_test();
})();

144 test/user_agent.js Normal file
@@ -0,0 +1,144 @@

'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');
const UAParser = require('ua-parser-js');
const _ = require('lodash');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('../');
const Scraper = require('../src/modules/se_scraper');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.set('trust proxy', 'loopback');
fakeSearchEngine.get('/test-user_agent', (req, res) => {
    debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']);
    res.send(req.headers['user-agent']);
});

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount our fake search engine on both an HTTP and an HTTPS listen server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('user_agent', function(){

        class MockScraperTestUserAgent extends Scraper {

            async load_start_page(){
                return true;
            }

            async search_keyword(){
                await this.page.goto('http://localhost:' + httpPort + '/test-user_agent');
            }

            async parse_async(){
                const bodyHandle = await this.page.$('body');
                return await this.page.evaluate(body => body.innerHTML, bodyHandle);
            }
        }

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the user_agent option
         */
        it('fixed user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['javascript is hard'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                logger: testLogger,
                user_agent: 'THIS IS A USERAGENT 42.0'
            });
            await scraper.start();

            const { results } = await scraper.scrape(scrape_job);
            assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0');

            await scraper.quit();
        });

        /**
         * Test the random_user_agent option
         * TODO: the generated user_agent should be different for each keyword
         * TODO: this test will sometimes fail because the user_agent is not very random :-(
         */
        it('random_user_agent', async function () {

            const scrape_job = {
                search_engine: MockScraperTestUserAgent,
                keywords: ['news'],
            };

            const NUMBER_OF_EXEC = 10;

            const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => {
                const scraper = new se_scraper.ScrapeManager({
                    throw_on_detection: true,
                    logger: testLogger,
                    random_user_agent: true,
                });
                await scraper.start();
                const { results: { news } } = await scraper.scrape(scrape_job);
                await scraper.quit();
                return news['1'];
            });

            uaList.forEach((userAgent) => {
                const uaParsed = UAParser(userAgent);
                assert(uaParsed.browser.name, 'UserAgent should have a browser name detected');
                assert(uaParsed.os.name, 'UserAgent should have an os name detected');
            });

            assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' );

        });

    });

});
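
The final assertion in `random_user_agent` is dense: the lodash chain computes the highest occurrence count among the collected user agents and requires that no single user agent accounts for 40% or more of the runs. A standalone sketch of the same chain on made-up sample values (the `uaList` entries below are illustrative only, not real scraper output):

    const _ = require('lodash');

    // Hypothetical sample: five runs producing three distinct user agents.
    const uaList = ['UA-A', 'UA-B', 'UA-A', 'UA-C', 'UA-B'];

    // countBy()         -> { 'UA-A': 2, 'UA-B': 2, 'UA-C': 1 }
    // toPairs()         -> [['UA-A', 2], ['UA-B', 2], ['UA-C', 1]]
    // sortBy(e => e[1]) sorts ascending by count; last() takes the most frequent pair.
    const mostFrequentCount = _.chain(uaList)
        .countBy()
        .toPairs()
        .sortBy(e => e[1])
        .last()
        .value()[1];

    console.log(mostFrequentCount); // 2, i.e. 40% of these 5 sample runs
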