This commit is contained in:
Nikolai Tschacher 2019-01-26 20:15:19 +01:00
parent bab902e80a
commit b354e6918d
10 changed files with 83 additions and 28 deletions

View File

@ -47,22 +47,24 @@ let config = {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: true,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'yahoo_news', search_engine: 'yahoo_news',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: true,
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['GOOGL', ], keywords: ['GOOGL', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt', keyword_file: './keywords.txt',
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: false, headless: false,
// path to output file, data will be stored in JSON
output_file: 'results.json',
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

1
data.json Normal file
View File

@ -0,0 +1 @@
{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. 
Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}

View File

@ -0,0 +1,35 @@
const se_scraper = require('../index.js');
/**
 * Scrape the same keyword list with several search engines in sequence,
 * writing each engine's results to `<engine>.json`.
 */
async function multiple_search_engines() {
    const searchEnginesList = ['google', 'bing'];

    for (const searchEngine of searchEnginesList) {
        const config = {
            random_user_agent: true,
            write_meta_data: true,
            sleep_range: '[1,1]',
            search_engine: searchEngine,
            debug: false,
            verbose: false,
            // the list of keywords to scrape
            keywords: ['scrapeulous.com',],
            // whether to start the browser in headless mode
            headless: true,
            // each engine gets its own JSON output file
            output_file: `${searchEngine}.json`,
        };

        await se_scraper.scrape(config, (err, response) => {
            if (err) {
                // Bail out: `response` is not usable after an error —
                // the original fell through and crashed on response.results.
                console.error(err);
                return;
            }
            console.dir(response.results, {
                depth: null,
                colors: true,
            });
        });
    }
}

// Kick off the run; surface an unhandled rejection instead of swallowing it.
multiple_search_engines().catch(console.error);

View File

@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js');
var fs = require('fs'); var fs = require('fs');
var os = require("os"); var os = require("os");
exports.scrape = function(config, callback) { exports.scrape = async function(config, callback) {
// options for scraping // options for scraping
event = { event = {
// the user agent to scrape with // the user agent to scrape with
@ -10,19 +10,23 @@ exports.scrape = function(config, callback) {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// whether to select manual settings in visible mode // whether to select manual settings in visible mode
set_manual_settings: 'false', set_manual_settings: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: true,
log_http_headers: 'false', log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'google', search_engine: 'google',
compress: 'false', // compress compress: false, // compress
debug: 'false', debug: false,
verbose: 'false', verbose: false,
keywords: ['test'], keywords: ['test'],
// whether to start the browser in headless mode
headless: true,
// path to output file, data will be stored in JSON
output_file: '',
}; };
for (var key in config) { for (var key in config) {
@ -44,7 +48,7 @@ exports.scrape = function(config, callback) {
} }
} }
handler.handler(event, undefined, callback ); await handler.handler(event, undefined, callback );
}; };
function read_keywords_from_file(fname) { function read_keywords_from_file(fname) {
@ -55,10 +59,3 @@ function read_keywords_from_file(fname) {
}); });
return kws; return kws;
} }
/**
 * Asynchronously persist scrape results to disk.
 * Falls back to 'results.json' when no filename is supplied.
 *
 * @param {string} fname - destination path (optional)
 * @param {string} data - serialized results payload
 */
function write_results(fname, data) {
	const target = fname || 'results.json';
	fs.writeFile(target, data, (err) => {
		if (err) throw err;
		console.log('Results written to file');
	});
}

6
jformat.py Executable file
View File

@ -0,0 +1,6 @@
import pprint
import sys
import json
if len(sys.argv) == 2:
print(pprint.pformat(json.load(open(sys.argv[1]))))

View File

@ -1,2 +1,2 @@
GOOGL test
AAPL water is blue

View File

@ -1,6 +1,6 @@
{ {
"name": "se-scraper", "name": "se-scraper",
"version": "1.0.5", "version": "1.0.6",
"description": "A simple module which uses puppeteer to scrape several search engines.", "description": "A simple module which uses puppeteer to scrape several search engines.",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {

File diff suppressed because one or more lines are too long

14
run.js
View File

@ -6,22 +6,24 @@ let config = {
// if random_user_agent is set to True, a random user agent is chosen // if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false, random_user_agent: false,
// get meta data of scraping in return object // get meta data of scraping in return object
write_meta_data: 'true', write_meta_data: false,
// how long to sleep between requests. a random sleep interval within the range [a,b] // how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping. // is drawn before every request. empty string for no sleeping.
sleep_range: '[1,1]', sleep_range: '[1,1]',
// which search engine to scrape // which search engine to scrape
search_engine: 'yahoo_news', search_engine: 'google',
// whether debug information should be printed // whether debug information should be printed
debug: 'true', debug: true,
// whether verbose program output should be printed // whether verbose program output should be printed
verbose: 'false', verbose: false,
// an array of keywords to scrape // an array of keywords to scrape
keywords: ['GOOGL', ], keywords: ['scrapeulous.com', ],
// alternatively you can specify a keyword_file. this overwrites the keywords array // alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: './keywords.txt', keyword_file: '',
// whether to start the browser in headless mode // whether to start the browser in headless mode
headless: false, headless: false,
// path to output file, data will be stored in JSON
output_file: 'data.json',
}; };
se_scraper.scrape(config, (err, response) => { se_scraper.scrape(config, (err, response) => {

View File

@ -1,5 +1,6 @@
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
const zlib = require('zlib'); const zlib = require('zlib');
var fs = require('fs');
// local module imports // local module imports
const google = require('./modules/google.js'); const google = require('./modules/google.js');
@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js');
const duckduckgo = require('./modules/duckduckgo.js'); const duckduckgo = require('./modules/duckduckgo.js');
const tickersearch = require('./modules/ticker_search.js'); const tickersearch = require('./modules/ticker_search.js');
/**
 * Synchronously write scrape results to disk.
 *
 * Bug fix: fs.writeFileSync does not accept a callback — the original passed
 * one as the options argument, so the error check and success log were dead
 * code (and modern Node throws a TypeError on a function options value).
 * writeFileSync already throws on failure, so errors propagate to the caller.
 *
 * @param {string} fname - destination path
 * @param {string} data - serialized JSON payload
 */
function write_results(fname, data) {
    fs.writeFileSync(fname, data);
    console.log(`Results written to file ${fname}`);
}
module.exports.handler = async function handler (event, context, callback) { module.exports.handler = async function handler (event, context, callback) {
try { try {
@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) {
} }
} }
if (event.output_file) {
write_results(event.output_file, JSON.stringify(results));
}
let response = { let response = {
headers: { headers: {
'Content-Type': 'text/json', 'Content-Type': 'text/json',