commit 9af1630e19
Author: Nikolai Tschacher
Date:   2018-12-24 14:25:02 +01:00

25 changed files with 3230 additions and 0 deletions

61
.gitignore vendored Normal file

@@ -0,0 +1,61 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
# next.js build output
.next

4
.idea/encodings.xml Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

9
.idea/misc.xml Normal file

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
.idea/modules.xml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/GoogleScraperPup.iml" filepath="$PROJECT_DIR$/GoogleScraperPup.iml" />
</modules>
</component>
</project>

424
.idea/workspace.xml Normal file

@@ -0,0 +1,424 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="918cf824-5c77-4055-9379-a8d228c9db9d" name="Default Changelist" comment="" />
<ignored path="$PROJECT_DIR$/out/" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/node_scraper.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="76">
<caret line="4" column="46" lean-forward="true" selection-start-line="4" selection-start-column="46" selection-end-line="4" selection-end-column="46" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/test/tests.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="76">
<caret line="82" column="1" lean-forward="true" selection-start-line="82" selection-start-column="1" selection-end-line="82" selection-end-column="1" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/modules/google.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="284">
<caret line="388" column="33" selection-start-line="388" selection-start-column="23" selection-end-line="388" selection-end-column="33" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/index.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="323">
<caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/README.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="149">
<caret line="41" column="41" lean-forward="true" selection-start-line="41" selection-start-column="41" selection-end-line="41" selection-end-column="41" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/modules/infospace.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="227">
<caret line="119" column="53" lean-forward="true" selection-start-line="119" selection-start-column="53" selection-end-line="119" selection-end-column="53" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/modules/baidu.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="265">
<caret line="49" column="19" selection-start-line="49" selection-start-column="19" selection-end-line="49" selection-end-column="19" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/package.json">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="38">
<caret line="2" column="21" lean-forward="true" selection-start-line="2" selection-start-column="21" selection-end-line="2" selection-end-column="21" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/run.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="13" column="28" lean-forward="true" selection-end-line="34" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/TODO.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="38">
<caret line="2" column="19" selection-start-line="2" selection-start-column="19" selection-end-line="2" selection-end-column="19" />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="JavaScript File" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>aws</find>
<find>should_turn_down</find>
<find>tools</find>
<find>.body</find>
<find>random_sleep</find>
<find>waitFor</find>
<find>waitForNav</find>
</findStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/keywords.txt" />
<option value="$PROJECT_DIR$/src/node_scraper.js" />
<option value="$PROJECT_DIR$/index.js" />
<option value="$PROJECT_DIR$/src/modules/functions.js" />
<option value="$PROJECT_DIR$/src/modules/infospace.js" />
<option value="$PROJECT_DIR$/src/modules/youtube.js" />
<option value="$PROJECT_DIR$/src/modules/google.js" />
<option value="$PROJECT_DIR$/src/modules/duckduckgo.js" />
<option value="$PROJECT_DIR$/src/modules/bing.js" />
<option value="$PROJECT_DIR$/src/modules/baidu.js" />
<option value="$PROJECT_DIR$/test/tests.js" />
<option value="$PROJECT_DIR$/run.js" />
<option value="$PROJECT_DIR$/TODO.txt" />
<option value="$PROJECT_DIR$/README.md" />
<option value="$PROJECT_DIR$/package.json" />
</list>
</option>
</component>
<component name="JsFlowSettings">
<service-enabled>true</service-enabled>
<exe-path />
<other-services-enabled>true</other-services-enabled>
<auto-save>true</auto-save>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="-3" />
<option name="y" value="25" />
<option name="width" value="1926" />
<option name="height" value="1058" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="PackagesPane" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="GoogleScraperPup" type="b2602c69:ProjectViewProjectNode" />
<item name="GoogleScraperPup" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="GoogleScraperPup" type="b2602c69:ProjectViewProjectNode" />
<item name="GoogleScraperPup" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="GoogleScraperPup" type="b2602c69:ProjectViewProjectNode" />
<item name="GoogleScraperPup" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="modules" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="GoogleScraperPup" type="b2602c69:ProjectViewProjectNode" />
<item name="GoogleScraperPup" type="462c0819:PsiDirectoryNode" />
<item name="test" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="JavaScriptWeakerCompletionTypeGuess" value="true" />
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="aspect.path.notification.shown" value="true" />
<property name="com.android.tools.idea.instantapp.provision.ProvisionBeforeRunTaskProvider.myTimeStamp" value="1545657510456" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="nodejs_package_manager_path" value="npm" />
<property name="settings.editor.selected.configurable" value="Settings.JavaScript" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="918cf824-5c77-4055-9379-a8d228c9db9d" name="Default Changelist" comment="" />
<created>1545584819565</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1545584819565</updated>
<workItem from="1545584824223" duration="2443000" />
<workItem from="1545587608625" duration="4725000" />
<workItem from="1545652501900" duration="3774000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="10942000" />
</component>
<component name="ToolWindowManager">
<frame x="-3" y="25" width="1926" height="1058" extended-state="6" />
<editor active="true" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.16506922" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Designer" order="2" />
<window_info id="Image Layers" order="3" />
<window_info id="Capture Tool" order="4" />
<window_info id="Favorites" order="5" side_tool="true" />
<window_info id="UI Designer" order="6" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Terminal" order="7" />
<window_info anchor="bottom" id="Docker" order="8" show_stripe_button="false" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Database Changes" order="10" />
<window_info anchor="bottom" id="Version Control" order="11" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.24973376" />
<window_info anchor="right" id="Maven" order="3" />
<window_info anchor="right" id="Palette" order="4" />
<window_info anchor="right" id="Cargo" order="5" />
<window_info anchor="right" id="SciView" order="6" />
<window_info anchor="right" id="Database" order="7" />
<window_info anchor="right" id="Palette&#9;" order="8" />
<window_info anchor="right" id="Theme Preview" order="9" />
<window_info anchor="right" id="Capture Analysis" order="10" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/src/modules/google_manual.js" />
<entry file="file://$PROJECT_DIR$/src/modules/tools.js" />
<entry file="file://$PROJECT_DIR$/src/tests/always_work.js" />
<entry file="file://$PROJECT_DIR$/src/modules/metadata.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="171">
<caret line="9" column="19" selection-start-line="9" selection-start-column="19" selection-end-line="9" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/user_agents.js">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/results.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/keywords.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="38">
<caret line="2" column="15" selection-start-line="2" selection-start-column="15" selection-end-line="2" selection-end-column="15" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/package-lock.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/GoogleScraperPup.iml">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/functions.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="494">
<caret line="26" column="20" selection-start-line="26" selection-start-column="20" selection-end-line="26" selection-end-column="20" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/youtube.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="265">
<caret line="45" selection-start-line="45" selection-end-line="45" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/duckduckgo.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="246">
<caret line="36" column="13" lean-forward="true" selection-start-line="36" selection-start-column="13" selection-end-line="36" selection-end-column="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/bing.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="13" column="26" selection-start-line="13" selection-start-column="26" selection-end-line="13" selection-end-column="26" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/baidu.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="265">
<caret line="49" column="19" selection-start-line="49" selection-start-column="19" selection-end-line="49" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/google.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="284">
<caret line="388" column="33" selection-start-line="388" selection-start-column="23" selection-end-line="388" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/modules/infospace.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="227">
<caret line="119" column="53" lean-forward="true" selection-start-line="119" selection-start-column="53" selection-end-line="119" selection-end-column="53" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/index.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="323">
<caret line="17" column="29" lean-forward="true" selection-start-line="17" selection-start-column="29" selection-end-line="17" selection-end-column="29" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test/tests.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="76">
<caret line="82" column="1" lean-forward="true" selection-start-line="82" selection-start-column="1" selection-end-line="82" selection-end-column="1" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/node_scraper.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="76">
<caret line="4" column="46" lean-forward="true" selection-start-line="4" selection-start-column="46" selection-end-line="4" selection-end-column="46" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/TODO.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="38">
<caret line="2" column="19" selection-start-line="2" selection-start-column="19" selection-end-line="2" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/run.js">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="13" column="28" lean-forward="true" selection-end-line="34" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/README.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="149">
<caret line="41" column="41" lean-forward="true" selection-start-line="41" selection-start-column="41" selection-end-line="41" selection-end-column="41" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/package.json">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="38">
<caret line="2" column="21" lean-forward="true" selection-start-line="2" selection-start-column="21" selection-end-line="2" selection-end-column="21" />
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
<state key="ProjectJDKs.UI">
<settings>
<last-edited>Python 3.7</last-edited>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>

9
GoogleScraperPup.iml Normal file

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

290
README.md Normal file

@@ -0,0 +1,290 @@
# Search Engine Scraper
This Node.js module supports scraping several search engines.

Scraping is currently supported for:

* Google
* Google News
* Google News New (https://news.google.com)
* Google Image
* Bing
* Baidu
* Youtube
* Infospace
* Duckduckgo
* Webcrawler

This module uses puppeteer. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 stars on GitHub.
### Technical Notes

Scraping is done with a headless Chromium browser using the automation library puppeteer. Puppeteer is a Node library that provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.

No multithreading is supported for now: there is only one scraping worker per `scrape()` call.

If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at hire@incolumitas.com.
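For illustration, this is roughly what driving a headless Chromium instance with puppeteer looks like (a minimal standalone sketch, not part of this module's API):

```javascript
const puppeteer = require('puppeteer');

(async () => {
    // launch a headless Chromium instance
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // navigate and read the fully rendered HTML, as the scraper modules do
    await page.goto('https://www.google.com/');
    const html = await page.content();
    console.log(`Got ${html.length} bytes of HTML`);

    await browser.close();
})();
```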
### Installation and Usage

Install with

```bash
npm install se-scraper
```

Use se-scraper by calling it with a script such as the one below.
```javascript
const se_scraper = require('se-scraper');

let config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to true, a random user agent is chosen
    random_user_agent: false,
    // whether to include meta data about the scraping run in the return object
    write_meta_data: 'true',
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,1]',
    // which search engine to scrape
    search_engine: 'google',
    // whether debug information should be printed
    debug: 'true',
    // whether verbose program output should be printed
    verbose: 'false',
    // an array of keywords to scrape
    keywords: ['incolumitas.com scraping', 'best scraping framework'],
};

se_scraper.scrape(config, (err, response) => {
    if (err) { console.error(err) }

    /* response object has the following properties:

        response.results - json object with the scraping results
        response.metadata - json object with metadata information
        response.statusCode - status code of the scraping process
    */

    console.dir(response.results, {depth: null, colors: true});
});
```
Supported options for the `search_engine` config key:

```javascript
'google'
'google_news_old'
'google_news'
'google_image'
'bing'
'bing_news'
'infospace'
'webcrawler'
'baidu'
'youtube'
'duckduckgo_news'
'google_dr'
```
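For example, to scrape Bing with keywords read from a file, only a few keys need to change (a minimal sketch; the `keyword_file` option is handled in `index.js` and, if the file exists, its lines replace the `keywords` array):

```javascript
const se_scraper = require('se-scraper');

se_scraper.scrape({
    search_engine: 'bing',
    // one keyword per line; used instead of `keywords` if the file exists
    keyword_file: 'keywords.txt',
    keywords: ['fallback keyword'],
}, (err, response) => {
    if (err) { console.error(err) }
    console.dir(response.results, {depth: null, colors: true});
});
```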
Output for the above script on my laptop:
```text
Scraper took 4295ms to scrape 2 keywords.
On average ms/keyword: 2147.5ms/keyword
{ 'incolumitas.com scraping':
{ time: 'Mon, 24 Dec 2018 13:07:43 GMT',
num_results: 'Ungefähr 2020 Ergebnisse (0.18 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link:
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
title:
'Coding, Learning and Business Ideas Tutorial: Youtube scraping ...',
snippet:
'29.10.2018 - In this blog post I am going to show you how to scrape YouTube video data using the handy puppeteer library. Puppeteer is a Node library ...',
visible_link:
'https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/',
date: '29.10.2018 - ',
rank: 1 },
{ link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
title:
'GoogleScraper Tutorial - How to scrape 1000 keywords with Google',
snippet:
'05.09.2018 - Tutorial that teaches how to use GoogleScraper to scrape 1000 keywords with 10 selenium browsers.',
visible_link: 'https://incolumitas.com/2018/09/05/googlescraper-tutorial/',
date: '05.09.2018 - ',
rank: 2 },
{ link: 'https://incolumitas.com/tag/scraping.html',
title: 'Coding, Learning and Business Ideas Tag Scraping',
snippet:
'Scraping Amazon Reviews using Headless Chrome Browser and Python3. Posted on Mi ... GoogleScraper Tutorial - How to scrape 1000 keywords with Google.',
visible_link: 'https://incolumitas.com/tag/scraping.html',
date: '',
rank: 3 },
{ link: 'https://incolumitas.com/category/scraping.html',
title: 'Coding, Learning and Business Ideas Category Scraping',
snippet:
'Nikolai Tschacher\'s ideas and projects around IT security and computer science.',
visible_link: 'https://incolumitas.com/category/scraping.html',
date: '',
rank: 4 },
{ link:
'https://github.com/NikolaiT/incolumitas/blob/master/content/Meta/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo.md',
title:
'incolumitas/scraping-and-extracting-links-from-any-major-search ...',
snippet:
'Title: Scraping and Extracting Links from any major Search Engine like Google, Yandex, Baidu, Bing and Duckduckgo Date: 2014-11-12 00:47 Author: Nikolai ...',
visible_link:
'https://github.com/.../incolumitas/.../scraping-and-extracting-links...',
date: '',
rank: 5 },
{ link:
'https://stackoverflow.com/questions/16955325/scraping-google-results-with-python',
title: 'Scraping Google Results with Python - Stack Overflow',
snippet:
'I found this. incolumitas.com/2013/01/06/… But the author claims it is not ported to 2.7 yet. user2351394 Jun 6 \'13 at 6:59 ...',
visible_link:
'https://stackoverflow.com/.../scraping-google-results-with-python',
date: '',
rank: 6 },
{ link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
title: 'GoogleScraper · PyPI',
snippet:
'[5]: http://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/ ...',
visible_link: 'https://pypi.org/project/GoogleScraper/0.1.18/',
date: '',
rank: 7 },
{ link:
'https://www.reddit.com/r/Python/comments/2m0vyu/scraping_links_on_google_yandex_bing_duckduckgo/',
title:
'Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and ...',
snippet:
'12.11.2014 - Scraping links on Google, Yandex, Bing, Duckduckgo, Baidu and other search engines with Python ... submitted 4 years ago by incolumitas.',
visible_link:
'https://www.reddit.com/.../scraping_links_on_google_yandex_bi...',
date: '12.11.2014 - ',
rank: 9 },
{ link: 'https://twitter.com/incolumitas_?lang=de',
title: 'Nikolai Tschacher (@incolumitas_) | Twitter',
snippet:
'Embed Tweet. How to use GoogleScraper to scrape images and download them ... Learn how to scrape millions of url from yandex and google or bing with: ...',
visible_link: 'https://twitter.com/incolumitas_?lang=de',
date: '',
rank: 10 } ] },
'best scraping framework':
{ time: 'Mon, 24 Dec 2018 13:07:44 GMT',
num_results: 'Ungefähr 2820000 Ergebnisse (0.36 Sekunden) ',
no_results: false,
effective_query: '',
results:
[ { link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet: '',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '',
rank: 1 },
{ link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet: '',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '',
rank: 2 },
{ link:
'https://www.scrapehero.com/open-source-web-scraping-frameworks-and-tools/',
title:
'Best Open Source Web Scraping Frameworks and Tools - ScrapeHero',
snippet:
'05.06.2018 - List of Open Source Web Scraping Frameworks. Scrapy. MechanicalSoup. PySpider. Portia. Apify SDK. Nodecrawler. Selenium WebDriver. Puppeteer.',
visible_link:
'https://www.scrapehero.com/open-source-web-scraping-framewo...',
date: '05.06.2018 - ',
rank: 3 },
{ link:
'https://medium.com/datadriveninvestor/best-data-scraping-tools-for-2018-top-10-reviews-558cc5a4992f',
title:
'Best Data Scraping Tools for 2018 (Top 10 Reviews) Data Driven ...',
snippet:
'05.03.2018 - Pros: Octoparse is the best free data scraping tool I\'ve met. ... your Scrapy (a open-source data extraction framework) web spider\'s activities.',
visible_link:
'https://medium.com/.../best-data-scraping-tools-for-2018-top-10-...',
date: '05.03.2018 - ',
rank: 4 },
{ link:
'https://www.quora.com/What-is-the-best-web-scraping-open-source-tool',
title: 'What is the best web scraping open source tool? - Quora',
snippet:
'15.06.2015 - My personal favourite is Python Scrapy and it is an excellent framework for building a web data scraper. Why Scrapy? 1) It is an open source framework and cost ...',
visible_link:
'https://www.quora.com/What-is-the-best-web-scraping-open-sour...',
date: '15.06.2015 - ',
rank: 5 },
{ link:
'http://www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
title: 'Top Web Scraping Frameworks and Libraries - AI Optify',
snippet:
'21.05.2018 - Top Web Scraping Frameworks and Libraries. Requests. Scrapy. Beautiful Soup. Selenium with Python. lxml. Webscraping with Selenium - part 1. Extracting data from websites with Scrapy. Scrapinghub.',
visible_link:
'www.aioptify.com/top-web-scraping-frameworks-and-librares.php',
date: '21.05.2018 - ',
rank: 6 },
{ link: 'https://scrapy.org/',
title:
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework',
snippet:
'An open source and collaborative framework for extracting the data you need from ... Spider): name = \'blogspider\' start_urls = [\'https://blog.scrapinghub.com\'] def ...',
visible_link: 'https://scrapy.org/',
date: '',
rank: 7 },
{ link:
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
title: 'The 10 Best Web Scraping Tools of 2018 - Scraper API',
snippet:
'19.07.2018 - The 10 Best Web Scraping Tools of 2018. ParseHub. Scrapy. Diffbot. Cheerio. Website: https://cheerio.js.org. Beautiful Soup. Website: https://www.crummy.com/software/BeautifulSoup/ Puppeteer. Website: https://github.com/GoogleChrome/puppeteer. Content Grabber. Website: http://www.contentgrabber.com/ Mozenda. Website: ...',
visible_link:
'https://www.scraperapi.com/blog/the-10-best-web-scraping-tools',
date: '19.07.2018 - ',
rank: 8 },
{ link: 'https://elitedatascience.com/python-web-scraping-libraries',
title: '5 Tasty Python Web Scraping Libraries - EliteDataScience',
snippet:
'03.02.2017 - We\'ve decided to feature the 5 Python libraries for web scraping that ... The good news is that you can swap out its parser with a faster one if ... Scrapy is technically not even a library… it\'s a complete web scraping framework.',
visible_link: 'https://elitedatascience.com/python-web-scraping-libraries',
date: '03.02.2017 - ',
rank: 9 },
{ link:
'https://blog.michaelyin.info/web-scraping-framework-review-scrapy-vs-selenium/',
title:
'Web Scraping Framework Review: Scrapy VS Selenium | MichaelYin ...',
snippet:
'01.10.2018 - In this Scrapy tutorial, I will cover the features of Scrapy and Selenium, and help you decide which one is better for your projects.',
visible_link:
'https://blog.michaelyin.info/web-scraping-framework-review-scr...',
date: '01.10.2018 - ',
rank: 10 },
{ link: 'https://github.com/lorien/awesome-web-scraping',
title:
'GitHub - lorien/awesome-web-scraping: List of libraries, tools and APIs ...',
snippet:
'List of libraries, tools and APIs for web scraping and data processing. ... golang.md · add dataflow kit framework, 2 months ago ... Make this list better!',
visible_link: 'https://github.com/lorien/awesome-web-scraping',
date: '',
rank: 11 },
{ link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
title: 'Best Web Scraping Software Tools 2018 | Import.io',
snippet:
'07.08.2018 - List of Best Web Scraping SoftwareThere are hundreds of Web ... it is a fast high-level screen scraping and web crawling framework, used to ...',
visible_link: 'https://www.import.io/post/best-web-scraping-tools-2018/',
date: '07.08.2018 - ',
rank: 12 } ] } }
```

8
TODO.txt Normal file

@@ -0,0 +1,8 @@
24.12.2018
- fix interface to scrape() [DONE]
- add to GitHub

TODO:
- add proxy support (one possible approach is sketched below)
- add captcha service solving support
- check if news instances run in the same browser and if we can have one proxy per tab/worker
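Rough sketch of the proxy idea with puppeteer launch arguments (hypothetical proxy address; a single proxy applies to the whole browser instance, so one proxy per tab would require one browser instance per tab or an external proxy layer):

    const browser = await puppeteer.launch({
        args: ['--proxy-server=http://myproxy.example.com:3128'],
    });
    const page = await browser.newPage();
    // only needed for proxies that require credentials
    await page.authenticate({ username: 'user', password: 'pass' });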

63
index.js Normal file

@@ -0,0 +1,63 @@
const handler = require('./src/node_scraper.js');
var fs = require('fs');

exports.scrape = function(config, callback) {
    // default options for scraping; overridden by the user supplied config below
    let event = {
        // the user agent to scrape with
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        // if random_user_agent is set to true, a random user agent is chosen
        random_user_agent: false,
        // whether to select manual settings in visible mode
        set_manual_settings: 'false',
        // whether to include meta data about the scraping run in the return object
        write_meta_data: 'true',
        log_http_headers: 'false',
        // how long to sleep between requests. a random sleep interval within the range [a,b]
        // is drawn before every request. empty string for no sleeping.
        sleep_range: '[1,1]',
        // which search engine to scrape
        search_engine: 'google',
        compress: 'false', // compress
        debug: 'false',
        verbose: 'false',
        keywords: [],
    };

    // merge the user config into the defaults
    for (var key in config) {
        event[key] = config[key];
    }

    // if a keyword file is given and exists, it takes precedence over the keywords array
    if (fs.existsSync(event.keyword_file)) {
        event.keywords = read_keywords_from_file(event.keyword_file);
    }

    if (!callback) {
        // default callback, called when the results are ready
        callback = function (err, response) {
            if (err) {
                console.error(err);
            }
            console.dir(response.results, {depth: null, colors: true});
        };
    }

    handler.handler(event, undefined, callback);
};

function read_keywords_from_file(fname) {
    let kws = fs.readFileSync(fname).toString().split("\n");
    // drop empty lines
    kws = kws.filter((kw) => {
        return kw.trim().length > 0;
    });
    return kws;
}

function write_results(fname, data) {
    fs.writeFile(fname || 'results.json', data, (err) => {
        if (err) throw err;
        console.log('Results written to file');
    });
}

3
keywords.txt Normal file

@@ -0,0 +1,3 @@
google scraper nikolait
mount everest
incolumitas.com

499
package-lock.json generated Normal file

@@ -0,0 +1,499 @@
{
"name": "se-scraper",
"version": "1.0.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"@types/node": {
"version": "10.12.18",
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz",
"integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ=="
},
"agent-base": {
"version": "4.2.1",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.2.1.tgz",
"integrity": "sha512-JVwXMr9nHYTUXsBFKUqhJwvlcYU/blreOEUkhNR2eXZIvwd+c+o5V4MgDPKWnMS/56awN3TRzIP+KoPn+roQtg==",
"requires": {
"es6-promisify": "^5.0.0"
}
},
"assertion-error": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz",
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw=="
},
"async-limiter": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz",
"integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg=="
},
"balanced-match": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
"integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c="
},
"boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
},
"brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
"requires": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
}
},
"buffer-from": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
},
"chai": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/chai/-/chai-4.2.0.tgz",
"integrity": "sha512-XQU3bhBukrOsQCuwZndwGcCVQHyZi53fQ6Ys1Fym7E4olpIqqZZhhoFJoaKVvV17lWQoXYwgWN2nF5crA8J2jw==",
"requires": {
"assertion-error": "^1.1.0",
"check-error": "^1.0.2",
"deep-eql": "^3.0.1",
"get-func-name": "^2.0.0",
"pathval": "^1.1.0",
"type-detect": "^4.0.5"
}
},
"check-error": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz",
"integrity": "sha1-V00xLt2Iu13YkS6Sht1sCu1KrII="
},
"cheerio": {
"version": "1.0.0-rc.2",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.2.tgz",
"integrity": "sha1-S59TqBsn5NXawxwP/Qz6A8xoMNs=",
"requires": {
"css-select": "~1.2.0",
"dom-serializer": "~0.1.0",
"entities": "~1.1.1",
"htmlparser2": "^3.9.1",
"lodash": "^4.15.0",
"parse5": "^3.0.1"
}
},
"concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
"integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s="
},
"concat-stream": {
"version": "1.6.2",
"resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "^1.0.0",
"inherits": "^2.0.3",
"readable-stream": "^2.2.2",
"typedarray": "^0.0.6"
},
"dependencies": {
"readable-stream": {
"version": "2.3.6",
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "~1.0.0",
"inherits": "~2.0.3",
"isarray": "~1.0.0",
"process-nextick-args": "~2.0.0",
"safe-buffer": "~5.1.1",
"string_decoder": "~1.1.1",
"util-deprecate": "~1.0.1"
}
},
"string_decoder": {
"version": "1.1.1",
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "~5.1.0"
}
}
}
},
"core-util-is": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
"integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
},
"css-select": {
"version": "1.2.0",
"resolved": "http://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz",
"integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=",
"requires": {
"boolbase": "~1.0.0",
"css-what": "2.1",
"domutils": "1.5.1",
"nth-check": "~1.0.1"
}
},
"css-what": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.2.tgz",
"integrity": "sha512-wan8dMWQ0GUeF7DGEPVjhHemVW/vy6xUYmFzRY8RYqgA0JtXC9rJmbScBjqSu6dg9q0lwPQy6ZAmJVr3PPTvqQ=="
},
"debug": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz",
"integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==",
"requires": {
"ms": "^2.1.1"
}
},
"deep-eql": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz",
"integrity": "sha512-+QeIQyN5ZuO+3Uk5DYh6/1eKO0m0YmJFGNmFHGACpf1ClL1nmlV/p4gNgbl2pJGxgXb4faqo6UE+M5ACEMyVcw==",
"requires": {
"type-detect": "^4.0.0"
}
},
"dom-serializer": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
"integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=",
"requires": {
"domelementtype": "~1.1.1",
"entities": "~1.1.1"
},
"dependencies": {
"domelementtype": {
"version": "1.1.3",
"resolved": "http://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz",
"integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs="
}
}
},
"domelementtype": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz",
"integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w=="
},
"domhandler": {
"version": "2.4.2",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz",
"integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==",
"requires": {
"domelementtype": "1"
}
},
"domutils": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz",
"integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=",
"requires": {
"dom-serializer": "0",
"domelementtype": "1"
}
},
"entities": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
},
"es6-promise": {
"version": "4.2.5",
"resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.5.tgz",
"integrity": "sha512-n6wvpdE43VFtJq+lUDYDBFUwV8TZbuGXLV4D6wKafg13ldznKsyEvatubnmUe31zcvelSzOHF+XbaT+Bl9ObDg=="
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "^4.0.3"
}
},
"extract-zip": {
"version": "1.6.7",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz",
"integrity": "sha1-qEC0uK9kAyZMjbV/Txp0Mz74H+k=",
"requires": {
"concat-stream": "1.6.2",
"debug": "2.6.9",
"mkdirp": "0.5.1",
"yauzl": "2.4.1"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"fd-slicer": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
"integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=",
"requires": {
"pend": "~1.2.0"
}
},
"fs.realpath": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
"integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8="
},
"get-func-name": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
"integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE="
},
"glob": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
"integrity": "sha512-vcfuiIxogLV4DlGBHIUOwI0IbrJ8HWPc4MU7HzviGeNho/UJDfi6B5p3sHeWIQ0KGIU0Jpxi5ZHxemQfLkkAwQ==",
"requires": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
"inherits": "2",
"minimatch": "^3.0.4",
"once": "^1.3.0",
"path-is-absolute": "^1.0.0"
}
},
"htmlparser2": {
"version": "3.10.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.0.tgz",
"integrity": "sha512-J1nEUGv+MkXS0weHNWVKJJ+UrLfePxRWpN3C9bEi9fLxL2+ggW94DQvgYVXsaT30PGwYRIZKNZXuyMhp3Di4bQ==",
"requires": {
"domelementtype": "^1.3.0",
"domhandler": "^2.3.0",
"domutils": "^1.5.1",
"entities": "^1.1.1",
"inherits": "^2.0.1",
"readable-stream": "^3.0.6"
}
},
"https-proxy-agent": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==",
"requires": {
"agent-base": "^4.1.0",
"debug": "^3.1.0"
},
"dependencies": {
"debug": {
"version": "3.2.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz",
"integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==",
"requires": {
"ms": "^2.1.1"
}
}
}
},
"inflight": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
"integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=",
"requires": {
"once": "^1.3.0",
"wrappy": "1"
}
},
"inherits": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
"integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
},
"isarray": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
},
"lodash": {
"version": "4.17.11",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
},
"mime": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.0.tgz",
"integrity": "sha512-ikBcWwyqXQSHKtciCcctu9YfPbFYZ4+gbHEmE0Q8jzcTYQg5dHCr3g2wwAZjPoJfQVXZq6KXAjpXOTf5/cjT7w=="
},
"minimatch": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
"integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
"requires": {
"brace-expansion": "^1.1.7"
}
},
"minimist": {
"version": "0.0.8",
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
},
"mkdirp": {
"version": "0.5.1",
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
"integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
"requires": {
"minimist": "0.0.8"
}
},
"ms": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
},
"nth-check": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
"integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==",
"requires": {
"boolbase": "~1.0.0"
}
},
"once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=",
"requires": {
"wrappy": "1"
}
},
"parse5": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz",
"integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==",
"requires": {
"@types/node": "*"
}
},
"path-is-absolute": {
"version": "1.0.1",
"resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18="
},
"pathval": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz",
"integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA="
},
"pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
},
"process-nextick-args": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw=="
},
"progress": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="
},
"proxy-from-env": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
},
"puppeteer": {
"version": "1.11.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.11.0.tgz",
"integrity": "sha512-iG4iMOHixc2EpzqRV+pv7o3GgmU2dNYEMkvKwSaQO/vMZURakwSOn/EYJ6OIRFYOque1qorzIBvrytPIQB3YzQ==",
"requires": {
"debug": "^4.1.0",
"extract-zip": "^1.6.6",
"https-proxy-agent": "^2.2.1",
"mime": "^2.0.3",
"progress": "^2.0.1",
"proxy-from-env": "^1.0.0",
"rimraf": "^2.6.1",
"ws": "^6.1.0"
}
},
"readable-stream": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.1.1.tgz",
"integrity": "sha512-DkN66hPyqDhnIQ6Jcsvx9bFjhw214O4poMBcIMgPVpQvNy9a0e0Uhg5SqySyDKAmUlwt8LonTBz1ezOnM8pUdA==",
"requires": {
"inherits": "^2.0.3",
"string_decoder": "^1.1.1",
"util-deprecate": "^1.0.1"
}
},
"rimraf": {
"version": "2.6.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
"integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==",
"requires": {
"glob": "^7.0.5"
}
},
"safe-buffer": {
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"string_decoder": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.2.0.tgz",
"integrity": "sha512-6YqyX6ZWEYguAxgZzHGL7SsCeGx3V2TtOTqZz1xSTSWnqsbWwbptafNyvf/ACquZUXV3DANr5BDIwNYe1mN42w==",
"requires": {
"safe-buffer": "~5.1.0"
}
},
"type-detect": {
"version": "4.0.8",
"resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz",
"integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g=="
},
"typedarray": {
"version": "0.0.6",
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
},
"wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
},
"ws": {
"version": "6.1.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-6.1.2.tgz",
"integrity": "sha512-rfUqzvz0WxmSXtJpPMX2EeASXabOrSMk1ruMOV3JBTBjo4ac2lDjGGsbQSyxj8Odhw5fBib8ZKEjDNvgouNKYw==",
"requires": {
"async-limiter": "~1.0.0"
}
},
"yauzl": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.4.1.tgz",
"integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=",
"requires": {
"fd-slicer": "~1.0.1"
}
}
}
}

26
package.json Normal file

@@ -0,0 +1,26 @@
{
    "name": "se-scraper",
    "version": "1.0.4",
    "description": "A simple module which uses puppeteer to scrape several search engines.",
    "main": "index.js",
    "scripts": {
        "test": "mocha"
    },
    "keywords": [
        "scraping",
        "search-engines",
        "google",
        "web-scraping"
    ],
    "author": "Nikolai Tschacher <hire@incolumitas.com> (https://incolumitas.com/)",
    "repository": {
        "type": "git",
        "url": "https://github.com/NikolaiT/se-scraper"
    },
    "license": "ISC",
    "dependencies": {
        "chai": "^4.2.0",
        "cheerio": "^1.0.0-rc.2",
        "puppeteer": "^1.9.0"
    }
}

1
results.json Normal file

File diff suppressed because one or more lines are too long

34
run.js Normal file

@@ -0,0 +1,34 @@
const se_scraper = require('./index.js');

let config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to true, a random user agent is chosen
    random_user_agent: false,
    // whether to include meta data about the scraping run in the return object
    write_meta_data: 'true',
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,1]',
    // which search engine to scrape
    search_engine: 'google',
    // whether debug information should be printed
    debug: 'true',
    // whether verbose program output should be printed
    verbose: 'false',
    // an array of keywords to scrape
    keywords: ['incolumitas.com scraping', 'best scraping framework'],
};

se_scraper.scrape(config, (err, response) => {
    if (err) { console.error(err) }

    /* response object has the following properties:

        response.results - json object with the scraping results
        response.metadata - json object with metadata information
        response.statusCode - status code of the scraping process
    */

    console.dir(response.results, {depth: null, colors: true});
});

101
src/modules/baidu.js Normal file

@@ -0,0 +1,101 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');

module.exports = {
    scrape_baidu_pup: scrape_baidu_pup,
};

async function scrape_baidu_pup(browser, event, context) {
    const page = await browser.newPage();
    await page.goto('https://www.baidu.com/');

    var results = {};

    try {
        await page.waitForSelector('input[name="wd"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    let keywords = event.keywords;

    for (var i = 0; i < keywords.length; i++) {
        let keyword = keywords[i];
        try {
            const input = await page.$('input[name="wd"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            // in baidu we have an issue with waiting for a selector
            // or waiting for navigation
            // therefore, we just manually sleep
            // issue in baidu: https://github.com/GoogleChrome/puppeteer/issues/609
            // https://github.com/GoogleChrome/puppeteer/issues/2671

            // await page.evaluate( () => {
            //     if ( ! window.Node ) {
            //         window.Node = {};
            //     }
            //     if ( ! Node.ELEMENT_NODE ) {
            //         Node.ELEMENT_NODE = 1;
            //     }
            // } );
            // await page.waitForSelector('.result', { timeout: 5000 });

            // this should be reasonable for normal internet connections
            await sfunctions.sleep(2000);

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            let html = await page.content();
            results[keyword] = parse(html);
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}

function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // perform queries
    const results = [];
    $('#content_left .result').each((i, link) => {
        results.push({
            link: $(link).find('h3 a').attr('href'),
            title: $(link).find('h3').text(),
            snippet: $(link).find('.c-abstract').text(),
            visible_link: $(link).find('.f13').text(),
        })
    });

    // drop results without a link and assign ranks
    const cleaned = [];
    for (var i = 0; i < results.length; i++) {
        let res = results[i];
        if (res.link && res.link.trim()) {
            res.rank = i + 1;
            cleaned.push(res);
        }
    }

    return {
        time: (new Date()).toUTCString(),
        no_results: false,
        num_results: $('.nums_text').text(),
        results: cleaned,
    }
}

178
src/modules/bing.js Normal file

@@ -0,0 +1,178 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');

module.exports = {
    scrape_bing_pup: scrape_bing_pup,
    scrape_bing_news_pup: scrape_bing_news_pup,
};

async function scrape_bing_pup(browser, event, context) {
    const page = await browser.newPage();
    await page.goto('https://www.bing.com/');

    var results = {};

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    let keywords = event.keywords;

    for (var i = 0; i < keywords.length; i++) {
        let keyword = keywords[i];
        try {
            const input = await page.$('input[name="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('#b_content', { timeout: 5000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            let html = await page.content();
            results[keyword] = parse(html);
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}

function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // perform queries
    const results = [];
    $('#b_content #b_results .b_algo').each((i, link) => {
        results.push({
            link: $(link).find('h2 a').attr('href'),
            title: $(link).find('h2').text(),
            snippet: $(link).find('.b_caption p').text(),
            visible_link: $(link).find('cite').text(),
        })
    });

    let no_results = sfunctions.no_results(
        ['There are no results', 'Es gibt keine Ergebnisse', 'Including results for', 'Einschließlich Ergebnisse'],
        $('#b_results').text()
    );

    let effective_query = $('#sp_requery a').first().text() || '';

    // drop results without a link or title and assign ranks
    const cleaned = [];
    for (var i = 0; i < results.length; i++) {
        let res = results[i];
        if (res.link && res.link.trim() && res.title && res.title.trim()) {
            res.rank = i + 1;
            cleaned.push(res);
        }
    }

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        effective_query: effective_query,
        num_results: $('#b_content .sb_count').text(),
        results: cleaned,
    }
}

async function scrape_bing_news_pup(browser, event, context) {
    const page = await browser.newPage();
    await page.goto('https://www.bing.com/news/search?');

    if (event.set_manual_settings === true) {
        console.log('Sleeping 30 seconds. Set your settings now.');
        await sfunctions.sleep(30000);
    }

    var results = {};

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    let keywords = event.keywords;

    for (var i = 0; i < keywords.length; i++) {
        if (sfunctions.should_turn_down(context)) {
            break;
        }

        let keyword = keywords[i];
        try {
            const input = await page.$('input[name="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('#news', { timeout: 5000 });
            await sfunctions.sleep(2000);

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            let html = await page.content();
            results[keyword] = parse_bing_news(html);
        } catch (e) {
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}

function parse_bing_news(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // perform queries
    const results = [];
    $('#algocore .newsitem').each((i, link) => {
        results.push({
            link: $(link).attr('url'),
            title: $(link).find('a.title').text(),
            snippet: $(link).find('.snippet').text(),
            date: $(link).find('.source span').last().text(),
        })
    });

    // drop results without a link or title and assign ranks
    const cleaned = [];
    for (var i = 0; i < results.length; i++) {
        let res = results[i];
        if (res.link && res.link.trim() && res.title && res.title.trim()) {
            res.rank = i + 1;
            cleaned.push(res);
        }
    }

    return {
        time: (new Date()).toUTCString(),
        results: cleaned,
    }
}

86
src/modules/duckduckgo.js Normal file
View File

@ -0,0 +1,86 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
};
async function scrape_duckduckgo_news_pup(browser, event, context) {
const page = await browser.newPage();
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
let keywords = event.keywords;
var results = {};
try {
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await sfunctions.sleep(150);
await input.type(keyword);
await sfunctions.sleep(150);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
// await page.waitForSelector('.result--news', { timeout: 5000 });
await page.waitForSelector('.serp__results', { timeout: 5000 });
await sfunctions.sleep(1500);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
let html = await page.content();
results[keyword] = parse_duckduckgo_news_results(html, event.max_results);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
return results;
}
}
return results;
}
function parse_duckduckgo_news_results(html, max_results) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result--news').each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
date: $(link).find('.result__timestamp').text(),
snippet: $(link).find('.result__snippet').text(),
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
// honor the optional max_results limit passed in by the caller
results: max_results ? cleaned.slice(0, max_results) : cleaned
}
}

31
src/modules/functions.js Normal file
View File

@ -0,0 +1,31 @@
module.exports = {
no_results: no_results,
effective_query: effective_query,
sleep: sleep,
random_sleep: random_sleep,
should_turn_down: should_turn_down,
};
function no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1});
}
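// e.g. no_results(['did not match any documents'], $('#main').text()) returns true
// as soon as at least one of the needles occurs in the given text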
function effective_query(needles, html) {
// not implemented yet; the per-engine parsers extract the effective query with their own selectors
return;
}
function sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
async function random_sleep(config) {
var min, max;
[min, max] = config.sleep_range;
var rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
if (config.debug === true) {
console.log(`Sleeping for ${rand}s`);
}
await sleep(rand * 1000);
}
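// Minimal sketch, added because scrape_bing_news_pup calls sfunctions.should_turn_down(context)
// but no such helper was defined here. It assumes an AWS-Lambda-style context object that
// exposes getRemainingTimeInMillis(); the threshold is an assumption as well.
function should_turn_down(context) {
if (context && typeof context.getRemainingTimeInMillis === 'function') {
// stop scraping when less than 30 seconds of execution time remain
return context.getRemainingTimeInMillis() < 30000;
}
// no context (e.g. local test runs): never turn down
return false;
}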

611
src/modules/google.js Normal file
View File

@ -0,0 +1,611 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
/*
Scrape for dateranges:
https://www.google.com/search?lr=&hl=en&tbs=cdr:1,cd_min:1/1/2007,cd_max:1/1/2009&q=%22video+game%22+%22Catan%22&oq=%22video+game%22+%22Catan%22
*/
module.exports = {
scrape_google_news_old_pup: scrape_google_news_old_pup,
scrape_google_pup: scrape_google_pup,
scrape_google_image_pup: scrape_google_image_pup,
scrape_google_news_pup: scrape_google_news_pup,
scrape_google_pup_dr: scrape_google_pup_dr,
};
const STANDARD_TIMEOUT = 8000;
const SOLVE_CAPTCHA_TIME = 45000;
const setTextInputValue = async (page, selector, value) => {
await page.waitFor(selector);
await page.evaluate((value, selector) => {
return document.querySelector(selector).value = value;
}, value, selector);
};
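// Setting the input value via page.evaluate() avoids slow per-character typing;
// the callers still focus() the input and press Enter to submit the query.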
async function scrape_google_pup(browser, event, context) {
const page = await browser.newPage();
await page.goto('https://www.google.com/');
let keywords = event.keywords;
var results = {};
try {
await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
const input = await page.$('input[name="q"]');
// await input.click({ clickCount: 3 });
// await sfunctions.sleep(50);
//await input.type(keyword);
await setTextInputValue(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(100);
} catch (e) {
console.error(`Problem with scraping ${keyword}.`);
console.error(e);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
console.error('You have 45 seconds to enter the captcha.');
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
let html = await page.content();
results[keyword] = parse_google_results(html);
}
return results;
}
async function scrape_google_pup_dr(browser, event, context) {
const page = await browser.newPage();
let keywords = event.keywords;
var results = {};
// keywords are expected in the format "<query> <year>"; the first keyword
// determines the daterange used for the search session
let first = keywords[0];
var year = first.slice(-5);
var remaining = first.slice(0, -5);
year = parseInt(year.trim());
let dr_from = `1/1/${year-1}`;
let dr_to = `1/1/${year+1}`;
var url = `https://www.google.com/search?lr=&hl=en&tbs=cdr:1,cd_min:${dr_from},cd_max:${dr_to}&q=${remaining}&oq=${remaining}`;
await page.goto(url);
try {
await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
for (var i = 1; i < keywords.length; i++) {
// strip the year at the end plus whitespace
let keyword = keywords[i].slice(0, -5);
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
// await input.click({ clickCount: 3 });
// await sfunctions.sleep(50);
// await input.type(keyword);
await setTextInputValue(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.debug === true && event.is_local === true) {
console.log(`[${i}] Scraping ${keyword}`);
}
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(100);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
console.error('You have 45 seconds to enter the captcha.');
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
let html = await page.content();
results[keyword] = parse_google_results(html);
results[keyword].daterange = dr_from + '-' + dr_to;
results[keyword].year = year;
}
return results;
}
function parse_google_results(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#center_col .g').each((i, link) => {
results.push({
link: $(link).find('.r a').attr('href'),
title: $(link).find('.r a h3').text(),
snippet: $(link).find('span.st').text(),
visible_link: $(link).find('.r cite').text(),
date: $(link).find('span.f').text() || '',
})
});
let no_results = sfunctions.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for'],
$('#main').text()
);
let effective_query = $('#fprsl').text() || '';
if (!effective_query) {
effective_query = $('#fprs a').text()
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
num_results: $('#resultStats').text(),
no_results: no_results,
effective_query: effective_query,
results: cleaned
}
}
async function scraping_detected(page) {
// google either shows an "unusual traffic" notice or redirects to a /sorry/ captcha page
let html = await page.content();
return html.indexOf('detected unusual traffic') !== -1 || page.url().indexOf('/sorry/') !== -1;
}
async function scrape_google_news_old_pup(browser, event, context) {
const page = await browser.newPage();
let keywords = event.keywords;
var results = {};
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
await page.goto(`https://www.google.com/search?q=${keyword}&hl=en&source=lnms&tbm=nws`, {
referer: 'https://www.google.com/'
});
await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
const input = await page.$('input[name="q"]');
// overwrites last text in input
// await input.click({ clickCount: 3 });
// await input.type(keyword);
await setTextInputValue(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForNavigation({ timeout: STANDARD_TIMEOUT });
await page.waitForSelector('#main', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(200);
} catch(e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
console.error('You have 45 seconds to enter the captcha.');
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
let html = await page.content();
results[keyword] = parse_google_news_results_se_format(html);
}
return results;
}
function parse_google_news_results_se_format(html) {
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.g').each((i, result) => {
results.push({
link: $(result).find('h3 a').attr('href'),
title: $(result).find('h3 a').text(),
snippet: $(result).find('.st').text(),
date: $(result).find('.nsa').text(),
})
});
let no_results = sfunctions.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('#main').text()
);
let effective_query = $('#fprsl').text() || '';
if (!effective_query) {
effective_query = $('#fprs a').text()
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned,
no_results: no_results,
effective_query: effective_query,
}
}
async function scrape_google_image_pup(browser, event, context) {
const page = await browser.newPage();
let keywords = event.keywords;
var results = {};
await page.goto(`https://www.google.com/imghp?tbm=isch`, {
referer: 'https://www.google.com/'
});
try {
await page.waitForSelector('input[name="q"]', { timeout: STANDARD_TIMEOUT });
} catch (e) {
return results;
}
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
// await input.click({ clickCount: 3 });
// await input.type(keyword);
await setTextInputValue(page, `input[name="q"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForNavigation({ timeout: STANDARD_TIMEOUT});
await page.waitForSelector('#main', { timeout: STANDARD_TIMEOUT });
let html = await page.content();
results[keyword] = parse_google_image_results(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
console.error('You have 45 seconds to enter the captcha.');
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
}
return results;
}
function parse_google_image_results(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.rg_bx').each((i, link) => {
let link_element = $(link).find('a.rg_l').attr('href');
let clean_link = clean_image_url(link_element);
results.push({
link: link_element,
clean_link: clean_link,
snippet: $(link).find('.a-no-hover-decoration').text(),
})
});
let no_results = sfunctions.no_results(
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
'Showing results for', 'Ergebnisse für'],
$('#main').text()
);
let effective_query = $('#fprsl').text() || '';
if (!effective_query) {
effective_query = $('#fprs a').text();
}
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.link.trim().length > 10) {
res.link = res.link.trim();
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
no_results: no_results,
results: cleaned,
effective_query: effective_query
}
}
function clean_image_url(url) {
// Example:
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
const regex = /imgurl=(.*?)&/gm;
let match = regex.exec(url);
if (match !== null) {
return decodeURIComponent(match[1]);
}
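// if the href has no imgurl= parameter, undefined is returned and only the raw link is kept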
}
const all_results = new Set();
async function scrape_google_news_pup(browser, event, context) {
const page = await browser.newPage();
let keywords = event.keywords;
var results = {};
await page.goto(`https://news.google.com/?hl=en-US&gl=US&ceid=US:en`, {
referer: 'https://news.google.com'
});
await page.waitForSelector('div input:nth-child(2)', { timeout: STANDARD_TIMEOUT });
await sfunctions.sleep(1000);
// parse here front page results
let html = await page.content();
results['frontpage'] = parse_google_news_results(html);
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
if (event.verbose === true) {
console.log(`${event.search_engine} is scraping keyword: ${keyword}`);
}
try {
await page.waitForSelector('div input:nth-child(2)', { timeout: STANDARD_TIMEOUT });
const input = await page.$('div input:nth-child(2)');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
// TODO: setting the input in https://news.google.com/
// TODO: doesn't work. Fall back to use clicking and typing
// await setTextInputValue(page, `input[aria-label="Search"]`, keyword);
await sfunctions.sleep(50);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
//await page.waitForSelector('#main', { timeout: 5000 });
await sfunctions.sleep(2500);
html = await page.content();
results[keyword] = parse_google_news_results(html);
} catch(e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
if (await scraping_detected(page) === true) {
console.error('Google detected the scraping. Aborting.');
if (event.is_local === true) {
console.error('You have 45 seconds to enter the captcha.');
await sfunctions.sleep(SOLVE_CAPTCHA_TIME);
// expect that user filled out necessary captcha
} else {
return results;
}
} else {
// some other error, quit scraping process if stuff is broken
if (event.is_local === true) {
console.error('You have 30 seconds to fix this.');
await sfunctions.sleep(30000);
} else {
return results;
}
}
}
}
return results;
}
function parse_google_news_results(html) {
const $ = cheerio.load(html);
// perform queries
const results = [];
$('article h3').each((i, headline) => {
let title = $(headline).find('a span').text();
let snippet = '';
let link = '';
let date = '';
let ts = '';
try {
snippet = $(headline).parent().find('p').text();
link = $(headline).find('a').attr('href');
date = $(headline).parent().parent().parent().find('time').text();
ts = $(headline).parent().parent().parent().find('time').attr('datetime');
} catch(e) {
// some headlines lack a snippet or time element; keep the partial result
}
if (!all_results.has(title)) {
results.push({
rank: i+1,
title: title,
snippet: snippet,
link: link,
date: date,
ts: ts,
})
}
all_results.add(title);
});
let no_results = sfunctions.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('body').text()
);
let effective_query = $('#fprsl').text() || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.title && res.title.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned,
no_results: no_results,
effective_query: effective_query,
}
}

170
src/modules/infospace.js Normal file
View File

@ -0,0 +1,170 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_infospace_pup: scrape_infospace_pup,
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
};
async function scrape_infospace_pup(browser, event, context) {
const page = await browser.newPage();
await page.goto('http://infospace.com/index.html');
let keywords = event.keywords;
var results = {};
try {
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
try {
const input = await page.$('input[id="q"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
await sfunctions.sleep(250);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
let html = await page.content();
results[keyword] = parse(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
}
}
return results;
}
function parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result').each((i, link) => {
results.push({
link: $(link).find('a.title').attr('href'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.description').text(),
visible_link: $(link).find('.url').text(),
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
let no_results = sfunctions.no_results(
['No search results were found for'],
$('.layout__mainline').text()
);
return {
time: (new Date()).toUTCString(),
no_results: no_results,
num_results: '',
results: cleaned,
}
}
async function scrape_webcrawler_news_pup(browser, event, context) {
const page = await browser.newPage();
await page.goto('https://www.webcrawler.com/?qc=news');
let keywords = event.keywords;
var results = {};
try {
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await sfunctions.sleep(150);
await input.type(keyword);
await sfunctions.sleep(150);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForSelector('.mainline-results', { timeout: 5000 });
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
let html = await page.content();
results[keyword] = parse_webcrawler_news_results(html, event.max_results);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
return results;
}
}
return results;
}
function parse_webcrawler_news_results(html, max_results) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.article').each((i, link) => {
let source = $(link).find('.source').text();
let date = source.split(',')[1] || '';
results.push({
link: $(link).find('a').attr('href'),
title: $(link).find('.title').text(),
publisher: $(link).find('.source').text(),
date: date,
snippet: $(link).find('.description').text(),
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
// honor the optional max_results limit passed in by the caller
results: max_results ? cleaned.slice(0, max_results) : cleaned
}
}

31
src/modules/metadata.js Normal file
View File

@ -0,0 +1,31 @@
const cheerio = require('cheerio');
module.exports = {
get_metadata: get_metadata,
get_http_headers: get_http_headers,
};
async function get_metadata(browser) {
let metadata = {};
const page = await browser.newPage();
await page.goto('https://ipinfo.io/json', {
waitUntil: 'networkidle2'
});
// ipinfo.io returns JSON, which the browser renders inside a <pre> element
let json = await page.content();
const $ = cheerio.load(json);
metadata.ipinfo = $('pre').text();
return metadata;
}
async function get_http_headers(browser) {
const page = await browser.newPage();
await page.goto('https://httpbin.org/get', {
waitUntil: 'networkidle2'
});
// httpbin echoes the request headers as JSON; return the rendered page content
let headers = await page.content();
return headers;
}

85
src/modules/user_agents.js Normal file
View File

@ -0,0 +1,85 @@
module.exports = {
random_user_agent: random_user_agent,
};
function random_user_agent() {
return user_agents[Math.floor(Math.random()*user_agents.length)];
}
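// Picks one entry uniformly at random from the pool below; node_scraper.js forwards
// the chosen string to chrome via the --user-agent launch flag.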
const user_agents = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (iPad; CPU OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0',
'Mozilla/5.0 (iPad; CPU OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 YaBrowser/18.9.0.3467 Yowser/2.5 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0',
'Mozilla/5.0 (X11; CrOS x86_64 10895.56.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.95 Safari/537.36'
];

113
src/modules/youtube.js Normal file
View File

@ -0,0 +1,113 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_youtube_pup: scrape_youtube_pup,
};
const all_videos = new Set();
async function scrape_youtube_pup(browser, event, context) {
const page = await browser.newPage();
await page.goto('https://www.youtube.com');
let keywords = event.keywords;
var results = {};
try {
await page.waitForSelector('input[id="search"]', { timeout: 5000 });
} catch (e) {
// the search box did not appear in time; return the empty result set
return results;
}
// before we do anything, parse the results of the front page of youtube
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
await sfunctions.sleep(500);
let html = await page.content();
results['__frontpage__'] = parse(html);
for (var i = 0; i < keywords.length; i++) {
let keyword = keywords[i];
try {
const input = await page.$('input[id="search"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForFunction(`document.title.indexOf('${keyword}') !== -1`, { timeout: 5000 });
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
await sfunctions.sleep(500);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
let html = await page.content();
results[keyword] = parse(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
}
}
return results;
}
function parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
results.push({
link: $(link).find('#video-title').attr('href'),
title: $(link).find('#video-title').text(),
snippet: $(link).find('#description-text').text(),
channel: $(link).find('#byline a').text(),
channel_link: $(link).find('#byline a').attr('href'),
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
})
});
let no_results = sfunctions.no_results(
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
$('yt-showing-results-for-renderer').text()
);
let effective_query = $('#corrected-link').text() || '';
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.title = res.title.trim();
res.snippet = res.snippet.trim();
res.rank = i+1;
// check if this result has been used before
if (all_videos.has(res.title) === false) {
cleaned.push(res);
}
all_videos.add(res.title);
}
}
return {
time: (new Date()).toUTCString(),
no_results: no_results,
effective_query: effective_query,
num_results: '',
results: cleaned,
}
}

200
src/node_scraper.js Normal file
View File

@ -0,0 +1,200 @@
const puppeteer = require('puppeteer');
const zlib = require('zlib');
// local module imports
const google = require('./modules/google.js');
const bing = require('./modules/bing.js');
const baidu = require('./modules/baidu.js');
const infospace = require('./modules/infospace.js');
const youtube = require('./modules/youtube.js');
const ua = require('./modules/user_agents.js');
const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js');
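// Entry point in the style of an AWS Lambda handler: `event` describes the scrape job,
// a headless chrome instance is launched, the requested search engine module is
// dispatched, and the parsed results are returned through `callback`.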
module.exports.handler = async function handler (event, context, callback) {
try {
const startTime = Date.now();
event = parseEventData(event);
if (event.debug === true) {
console.log(event);
}
const ADDITIONAL_CHROME_FLAGS = [
//'--proxy-server=' + proxy,
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
];
let USER_AGENT = '';
if (event.random_user_agent) {
USER_AGENT = ua.random_user_agent();
}
if (event.user_agent) {
USER_AGENT = event.user_agent;
}
if (USER_AGENT) {
// no extra quotes: puppeteer passes launch args directly to chrome, so quotes
// would become part of the user agent string
ADDITIONAL_CHROME_FLAGS.push(
`--user-agent=${USER_AGENT}`
)
}
if (event.debug === true) {
console.log("Chrome Flags: ", ADDITIONAL_CHROME_FLAGS);
}
const browser = await puppeteer.launch({
args: ADDITIONAL_CHROME_FLAGS,
headless: true,
});
if (event.log_http_headers === true) {
let headers = await meta.get_http_headers(browser);
console.dir(headers);
}
// TODO: this is ugly but I don't want to use too many objects and classes right now.
var results = {};
if (event.search_engine == 'google') {
results = await google.scrape_google_pup(browser, event, context);
} else if (event.search_engine == 'google_news_old') {
results = await google.scrape_google_news_old_pup(browser, event, context);
} else if (event.search_engine == 'google_news') {
results = await google.scrape_google_news_pup(browser, event, context);
} else if (event.search_engine == 'google_image') {
results = await google.scrape_google_image_pup(browser, event, context);
} else if (event.search_engine == 'bing') {
results = await bing.scrape_bing_pup(browser, event, context);
} else if (event.search_engine == 'bing_news') {
results = await bing.scrape_bing_news_pup(browser, event, context);
} else if (event.search_engine == 'infospace') {
results = await infospace.scrape_infospace_pup(browser, event, context);
} else if (event.search_engine == 'webcrawler') {
results = await infospace.scrape_webcrawler_news_pup(browser, event, context);
} else if (event.search_engine == 'baidu') {
results = await baidu.scrape_baidu_pup(browser, event, context);
} else if (event.search_engine == 'youtube') {
results = await youtube.scrape_youtube_pup(browser, event, context);
} else if (event.search_engine == 'duckduckgo_news') {
results = await duckduckgo.scrape_duckduckgo_news_pup(browser, event, context);
} else if (event.search_engine == 'google_dr') {
results = await google.scrape_google_pup_dr(browser, event, context);
}
let metadata = {};
if (event.write_meta_data === true) {
metadata = await meta.get_metadata(browser);
}
await browser.close();
let num_keywords = event.keywords.length || 0;
let timeDelta = Date.now() - startTime;
let ms_per_keyword = timeDelta/num_keywords;
console.log(`Scraper took ${timeDelta}ms to scrape ${num_keywords} keywords.`);
console.log(`On average ms/keyword: ${ms_per_keyword}ms/keyword`);
if (event.verbose === true) {
console.dir(results, {depth: null, colors: true});
}
if (event.compress === true) {
results = JSON.stringify(results);
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
results = zlib.deflateSync(results).toString('base64');
}
if (event.write_meta_data === true) {
metadata.id = `${event.job_name} ${event.chunk_lines}`;
metadata.chunk_lines = event.chunk_lines;
metadata.elapsed_time = timeDelta.toString();
metadata.ms_per_keyword = ms_per_keyword.toString();
if (event.verbose === true) {
console.log(metadata);
}
}
let response = {
headers: {
'Content-Type': 'text/json',
},
results: results,
metadata: metadata || {},
statusCode: 200
};
callback(null, response);
} catch (e) {
callback(e, null);
}
};
function parseEventData(event) {
function _bool(e) {
e = String(e);
if (typeof e.trim === "function") {
return e.trim().toLowerCase() == 'true';
} else {
return e.toLowerCase() == 'true';
}
}
if (event.debug) {
event.debug = _bool(event.debug);
}
if (event.verbose) {
event.verbose = _bool(event.verbose);
}
if (event.upload_to_s3) {
event.upload_to_s3 = _bool(event.upload_to_s3);
}
if (event.write_meta_data) {
event.write_meta_data = _bool(event.write_meta_data);
}
if (event.log_http_headers) {
event.log_http_headers = _bool(event.log_http_headers);
}
if (event.compress) {
event.compress = _bool(event.compress);
}
if (event.is_local) {
event.is_local = _bool(event.is_local);
}
if (event.max_results) {
event.max_results = parseInt(event.max_results);
}
if (event.set_manual_settings) {
event.set_manual_settings = _bool(event.set_manual_settings);
}
if (event.sleep_range) {
// parse an array like "[2, 5]" (leave it untouched if it is already an array)
if (typeof event.sleep_range === 'string') {
event.sleep_range = JSON.parse(event.sleep_range);
}
if (event.sleep_range.length !== 2 || typeof event.sleep_range[0] !== 'number' || typeof event.sleep_range[1] !== 'number') {
throw "sleep_range is not a valid array of two integers.";
}
}
return event;
}
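// Illustrative event payload (a sketch; the field names follow the checks in
// parseEventData above, the values are examples):
// {
//   "search_engine": "google",
//   "keywords": ["scraping with puppeteer"],
//   "sleep_range": "[1, 3]",
//   "random_user_agent": true,
//   "debug": "false",
//   "verbose": "true",
//   "compress": "false"
// }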

185
test/tests.js Normal file
View File

@ -0,0 +1,185 @@
const handler = require('./../src/node_scraper.js');
var assert = require('chai').assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
*/
function sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'baidu'];
async function tests() {
const keywords = ['Google scraper NikolaiT', 'the idiot'];
let event = {
search_engine: 'google',
compress: 'false',
debug: 'false',
verbose: 'false',
keywords: keywords,
};
for (var i = 0; i < search_engines.length; i++) {
let se = search_engines[i];
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case);
await sleep(3000);
}
}
async function no_results_test() {
const keywords = ['fgskl340abJAksk43a44dsflkjaQQuBBdfk'];
let event = {
write_meta_data: 'true',
compress: 'false',
debug: 'false',
verbose: 'false',
keywords: keywords,
};
for (var i = 0; i < search_engines.length; i++) {
let se = search_engines[i];
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case_no_results);
await sleep(3000);
}
}
async function effective_query_test() {
const keywords = ['mount evverrest'];
let event = {
write_meta_data: 'true',
job_name: 'test-job',
search_engine: '',
compress: 'false',
debug: 'false',
verbose: 'false',
keywords: keywords,
};
const effective_query_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing'];
for (var i = 0; i < effective_query_engines.length; i++) {
let se = effective_query_engines[i];
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case_effective_query);
await sleep(3000);
}
}
// we test with a callback function to our handler
function test_case(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
let results = response.results;
for (let kw in results) {
// at least 6 results
assert.isAtLeast(results[kw].results.length, 6, 'results must have at least 6 links');
assert.equal(results[kw].no_results, false, 'no results should be false');
assert.typeOf(results[kw].num_results, 'string', 'num_results must be a string');
assert.isAtLeast(results[kw].num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(results[kw].time), 'number', 'time should be a valid date');
for (let k = 0; k < results[kw].results.length; k++) {
let res = results[kw].results[k];
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
}
}
// we test with a callback function to our handler
function test_case_no_results(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
let results = response.results;
for (let kw in results) {
assert.isTrue(results[kw].no_results, 'no_result should be true');
assert.typeOf(results[kw].num_results, 'string', 'num_results must be a string');
assert.isEmpty(results[kw].num_results, 'no results should be a empty string');
assert.typeOf(Date.parse(results[kw].time), 'number', 'time should be a valid date');
}
}
}
// we test with a callback function to our handler
function test_case_effective_query(err, response) {
if (err) {
console.error(err);
} else {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
let results = response.results;
for (let kw in results) {
assert.isTrue(results[kw].no_results, 'no_result should be true');
// effective query must be different to the original keyword
assert.isOk(results[kw].effective_query, 'effective query must be ok');
assert.isNotEmpty(results[kw].effective_query, 'effective query must be valid');
assert(results[kw].effective_query !== kw, 'effective query must be different from keyword');
assert.typeOf(results[kw].num_results, 'string', 'num_results must be a string');
assert.isEmpty(results[kw].num_results, 'no results should be a empty string');
assert.typeOf(Date.parse(results[kw].time), 'number', 'time should be a valid date');
}
console.log('SUCCESS: all tests passed!');
}
}
//effective_query_test();
tests();
//no_results_test();