mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2024-11-25 00:53:46 +01:00
.
This commit is contained in:
parent
bab902e80a
commit
b354e6918d
@ -47,22 +47,24 @@ let config = {
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
// get meta data of scraping in return object
|
||||
write_meta_data: 'true',
|
||||
write_meta_data: true,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'yahoo_news',
|
||||
// whether debug information should be printed
|
||||
debug: 'true',
|
||||
debug: true,
|
||||
// whether verbose program output should be printed
|
||||
verbose: 'false',
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['GOOGL', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: './keywords.txt',
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'results.json',
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
|
1
data.json
Normal file
1
data.json
Normal file
@ -0,0 +1 @@
|
||||
{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. 
Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}
|
35
examples/multiple_search_engines.js
Normal file
35
examples/multiple_search_engines.js
Normal file
@ -0,0 +1,35 @@
|
||||
const se_scraper = require('../index.js');
|
||||
|
||||
/**
 * Scrape the same keyword list on several search engines, one after another.
 *
 * For every engine a fresh config is built and handed to `se_scraper.scrape`;
 * the call is awaited so the engines are processed sequentially. Results are
 * printed with `console.dir` and written to `<engine>.json` via `output_file`.
 *
 * @returns {Promise<void>} resolves once every engine has been scraped.
 */
async function multiple_search_engines() {
    const searchEnginesList = ['google', 'bing'];

    for (const searchEngine of searchEnginesList) {
        const config = {
            random_user_agent: true,
            write_meta_data: true,
            sleep_range: '[1,1]',
            search_engine: searchEngine,
            debug: false,
            verbose: false,
            // the list of keywords to scrape
            keywords: ['scrapeulous.com',],
            // whether to start the browser in headless mode
            headless: true,
            output_file: `${searchEngine}.json`
        };

        await se_scraper.scrape(config, (err, response) => {
            if (err) {
                console.error(err);
                // Stop here: on failure there may be no response object, and
                // dereferencing `response.results` would throw a TypeError
                // that hides the original error.
                return;
            }
            console.dir(response.results, {
                depth: null,
                colors: true
            });
        });
    }
}
|
||||
|
||||
multiple_search_engines();
|
27
index.js
27
index.js
@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js');
|
||||
var fs = require('fs');
|
||||
var os = require("os");
|
||||
|
||||
exports.scrape = function(config, callback) {
|
||||
exports.scrape = async function(config, callback) {
|
||||
// options for scraping
|
||||
event = {
|
||||
// the user agent to scrape with
|
||||
@ -10,19 +10,23 @@ exports.scrape = function(config, callback) {
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
// whether to select manual settings in visible mode
|
||||
set_manual_settings: 'false',
|
||||
set_manual_settings: false,
|
||||
// get meta data of scraping in return object
|
||||
write_meta_data: 'true',
|
||||
log_http_headers: 'false',
|
||||
write_meta_data: true,
|
||||
log_http_headers: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'google',
|
||||
compress: 'false', // compress
|
||||
debug: 'false',
|
||||
verbose: 'false',
|
||||
compress: false, // compress
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: ['test'],
|
||||
// whether to start the browser in headless mode
|
||||
headless: true,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: '',
|
||||
};
|
||||
|
||||
for (var key in config) {
|
||||
@ -44,7 +48,7 @@ exports.scrape = function(config, callback) {
|
||||
}
|
||||
}
|
||||
|
||||
handler.handler(event, undefined, callback );
|
||||
await handler.handler(event, undefined, callback );
|
||||
};
|
||||
|
||||
function read_keywords_from_file(fname) {
|
||||
@ -55,10 +59,3 @@ function read_keywords_from_file(fname) {
|
||||
});
|
||||
return kws;
|
||||
}
|
||||
|
||||
/**
 * Asynchronously write scrape results to a file.
 *
 * @param {string} fname - destination path; a falsy value falls back to
 *     'results.json'.
 * @param {string} data - already-serialized payload to write.
 *
 * The write is fire-and-forget: a failure is rethrown from inside the
 * completion callback, and success is reported on stdout.
 */
function write_results(fname, data) {
    const target = fname ? fname : 'results.json';

    const onWritten = (err) => {
        if (err) {
            throw err;
        }
        console.log('Results written to file');
    };

    fs.writeFile(target, data, onWritten);
}
|
6
jformat.py
Executable file
6
jformat.py
Executable file
@ -0,0 +1,6 @@
|
||||
import pprint
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Pretty-print the JSON document named by the single command-line argument.
# With any other number of arguments the script does nothing.
if len(sys.argv) == 2:
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(sys.argv[1]) as json_file:
        parsed = json.load(json_file)
    print(pprint.pformat(parsed))
|
@ -1,2 +1,2 @@
|
||||
GOOGL
|
||||
AAPL
|
||||
test
|
||||
water is blue
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.0.5",
|
||||
"version": "1.0.6",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
|
File diff suppressed because one or more lines are too long
14
run.js
14
run.js
@ -6,22 +6,24 @@ let config = {
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
// get meta data of scraping in return object
|
||||
write_meta_data: 'true',
|
||||
write_meta_data: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
// is drawn before every request. empty string for no sleeping.
|
||||
sleep_range: '[1,1]',
|
||||
// which search engine to scrape
|
||||
search_engine: 'yahoo_news',
|
||||
search_engine: 'google',
|
||||
// whether debug information should be printed
|
||||
debug: 'true',
|
||||
debug: true,
|
||||
// whether verbose program output should be printed
|
||||
verbose: 'false',
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['GOOGL', ],
|
||||
keywords: ['scrapeulous.com', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: './keywords.txt',
|
||||
keyword_file: '',
|
||||
// whether to start the browser in headless mode
|
||||
headless: false,
|
||||
// path to output file, data will be stored in JSON
|
||||
output_file: 'data.json',
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
|
@ -1,5 +1,6 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
const zlib = require('zlib');
|
||||
var fs = require('fs');
|
||||
|
||||
// local module imports
|
||||
const google = require('./modules/google.js');
|
||||
@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js');
|
||||
const duckduckgo = require('./modules/duckduckgo.js');
|
||||
const tickersearch = require('./modules/ticker_search.js');
|
||||
|
||||
|
||||
/**
 * Synchronously write serialized scrape results to disk.
 *
 * @param {string} fname - path of the output file.
 * @param {string} data - already-serialized payload (e.g. a JSON string).
 * @throws {Error} whatever `fs.writeFileSync` throws when the write fails.
 */
function write_results(fname, data) {
    // fs.writeFileSync has no completion callback: its third parameter is an
    // options value, and a function passed there is ignored. Failures are
    // thrown synchronously, so success is reported right after the call.
    fs.writeFileSync(fname, data);
    console.log(`Results written to file ${fname}`);
}
|
||||
|
||||
module.exports.handler = async function handler (event, context, callback) {
|
||||
|
||||
try {
|
||||
@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) {
|
||||
}
|
||||
}
|
||||
|
||||
if (event.output_file) {
|
||||
write_results(event.output_file, JSON.stringify(results));
|
||||
}
|
||||
|
||||
let response = {
|
||||
headers: {
|
||||
'Content-Type': 'text/json',
|
||||
|
Loading…
Reference in New Issue
Block a user