Mirror of https://github.com/NikolaiT/se-scraper.git (synced 2024-11-25 00:53:46 +01:00)

Commit b354e6918d (parent bab902e80a)
@@ -47,22 +47,24 @@ let config = {
     // if random_user_agent is set to True, a random user agent is chosen
     random_user_agent: false,
     // get meta data of scraping in return object
-    write_meta_data: 'true',
+    write_meta_data: true,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
     sleep_range: '[1,1]',
     // which search engine to scrape
     search_engine: 'yahoo_news',
     // whether debug information should be printed
-    debug: 'true',
+    debug: true,
     // whether verbose program output should be printed
-    verbose: 'false',
+    verbose: false,
     // an array of keywords to scrape
     keywords: ['GOOGL', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
     keyword_file: './keywords.txt',
     // whether to start the browser in headless mode
     headless: false,
+    // path to output file, data will be stored in JSON
+    output_file: 'results.json',
 };
 
 se_scraper.scrape(config, (err, response) => {
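Note: this hunk converts write_meta_data, debug, and verbose from the string literals 'true'/'false' to real booleans, and introduces the new output_file option. A minimal usage sketch of the updated config (the package name comes from package.json below; the bundled examples require '../index.js' instead):

const se_scraper = require('se-scraper');

let config = {
    search_engine: 'yahoo_news',
    keywords: ['GOOGL'],
    debug: true,
    // new in this commit: results are also written to this JSON file
    output_file: 'results.json',
};

se_scraper.scrape(config, (err, response) => {
    if (err) console.error(err);
    console.dir(response, { depth: null, colors: true });
});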
data.json (new file)
@@ -0,0 +1 @@
+{"scrapeulous.com":{"time":"Sat, 26 Jan 2019 19:05:15 GMT","num_results":"Ungefähr 171 Ergebnisse (0,25 Sekunden) ","no_results":false,"effective_query":"","results":[{"link":"https://scrapeulous.com/","title":"Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. Whether you need to analyze your competitors market ...","visible_link":"https://scrapeulous.com/","date":"","rank":1},{"link":"https://scrapeulous.com/about/","title":"About - Scrapeulous","snippet":"Scrapeulous.com allows you to scrape various search engines automatically and in large quantities. The business requirement to scrape information from ...","visible_link":"https://scrapeulous.com/about/","date":"","rank":2},{"link":"https://scrapeulous.com/contact/","title":"Contact - Scrapeulous","snippet":"Contact scrapeulous.com. Your email address. Valid email address where we are going to contact you. We will not send spam mail. Your inquiry.","visible_link":"https://scrapeulous.com/contact/","date":"","rank":3},{"link":"https://scrapeulous.com/howto/","title":"Howto - Scrapeulous","snippet":"We offer scraping large amounts of keywords for the Google Search Engine. Large means any number of keywords between 40 and 50000. Additionally, we ...","visible_link":"https://scrapeulous.com/howto/","date":"","rank":4},{"link":"https://www.scrapeulous.com/faq/","title":"Frequently Asked Questions - Scrapeulous","snippet":"31.10.2018 - Frequently Asked Questions. What is our hourly rate for customized scraping services? The minimal rate for creating custom scrapers is set to ...","visible_link":"https://www.scrapeulous.com/faq/","date":"31.10.2018 - ","rank":5},{"link":"https://www.scrapeulous.com/news/","title":"News Api for MSCI World ETF - Scrapeulous","snippet":"News Api for MSCI World ETF. Scrapeulous.com News Api allows you to query the most recent world news for an index composed of developed market equities.","visible_link":"https://www.scrapeulous.com/news/","date":"","rank":6},{"link":"https://scrapeulous.com/advanced/","title":"Advanced Scraping Services - Scrapeulous","snippet":"Advanced Scraping Services. If you have special requirements for your scraping/crawling projects, you can write us an email to this contact mail and we will ...","visible_link":"https://scrapeulous.com/advanced/","date":"","rank":7},{"link":"https://incolumitas.com/","title":"Coding, Learning and Business Ideas","snippet":"About · Contact · GoogleScraper · Lichess Autoplay-Bot · Projects · Scrapeulous.com · Site Notice · SVGCaptcha · Home Archives Categories Tags Atom ...","visible_link":"https://incolumitas.com/","date":"","rank":8},{"link":"https://incolumitas.com/pages/scrapeulous/","title":"Coding, Learning and Business Ideas – Scrapeulous.com - Incolumitas","snippet":"In autumn 2018, I created a scraping service called scrapeulous.com. There you can purchase scrape jobs that allow you to upload a keyword file which in turn ...","visible_link":"https://incolumitas.com/pages/scrapeulous/","date":"","rank":9},{"link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","title":"scrapeulous intro - YouTube","snippet":"Introduction for https://scrapeulous.com.","visible_link":"https://www.youtube.com/watch?v=a6xn6rc9GbI","date":"","rank":10}]}}
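The new data.json holds one scrape run as a single JSON line, keyed by keyword; each value carries run metadata (time, num_results, no_results, effective_query) and a results array of { link, title, snippet, visible_link, date, rank } objects. A short sketch of reading it back in Node:

const fs = require('fs');

const data = JSON.parse(fs.readFileSync('data.json', 'utf8'));
// each keyword maps to metadata plus a ranked results array
console.log(data['scrapeulous.com'].results[0].link); // https://scrapeulous.com/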
examples/multiple_search_engines.js (new file)
@@ -0,0 +1,35 @@
+const se_scraper = require('../index.js');
+
+async function multiple_search_engines() {
+
+    var searchEnginesList = ['google', 'bing'];
+
+    for (let index = 0; index < searchEnginesList.length; index++) {
+        const searchEngine = searchEnginesList[index];
+        let config = {
+            random_user_agent: true,
+            write_meta_data: true,
+            sleep_range: '[1,1]',
+            search_engine: searchEngine,
+            debug: false,
+            verbose: false,
+            // the list of keywords to scrape
+            keywords: ['scrapeulous.com',],
+            // whether to start the browser in headless mode
+            headless: true,
+            output_file: `${searchEngine}.json`
+        };
+
+        await se_scraper.scrape(config, (err, response) => {
+            if (err) {
+                console.error(err)
+            }
+            console.dir(response.results, {
+                depth: null,
+                colors: true
+            });
+        });
+    }
+}
+
+multiple_search_engines();
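Because exports.scrape becomes async in this commit (see the index.js hunk below), the example can await each engine in turn, so the 'google' and 'bing' runs execute sequentially instead of racing each other for the browser. A slightly more idiomatic sketch of the same loop, assuming config is declared once outside it:

for (const searchEngine of ['google', 'bing']) {
    config.search_engine = searchEngine;
    config.output_file = `${searchEngine}.json`;
    // wait for one engine to finish before starting the next
    await se_scraper.scrape(config, (err, response) => {
        if (err) console.error(err);
    });
}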
index.js
@@ -2,7 +2,7 @@ const handler = require('./src/node_scraper.js');
 var fs = require('fs');
 var os = require("os");
 
-exports.scrape = function(config, callback) {
+exports.scrape = async function(config, callback) {
     // options for scraping
     event = {
         // the user agent to scrape with
@@ -10,19 +10,23 @@ exports.scrape = function(config, callback) {
         // if random_user_agent is set to True, a random user agent is chosen
         random_user_agent: false,
         // whether to select manual settings in visible mode
-        set_manual_settings: 'false',
+        set_manual_settings: false,
         // get meta data of scraping in return object
-        write_meta_data: 'true',
-        log_http_headers: 'false',
+        write_meta_data: true,
+        log_http_headers: false,
         // how long to sleep between requests. a random sleep interval within the range [a,b]
         // is drawn before every request. empty string for no sleeping.
         sleep_range: '[1,1]',
         // which search engine to scrape
         search_engine: 'google',
-        compress: 'false', // compress
-        debug: 'false',
-        verbose: 'false',
+        compress: false, // compress
+        debug: false,
+        verbose: false,
         keywords: ['test'],
+        // whether to start the browser in headless mode
+        headless: true,
+        // path to output file, data will be stored in JSON
+        output_file: '',
     };
 
     for (var key in config) {
@@ -44,7 +48,7 @@ exports.scrape = function(config, callback) {
         }
     }
 
-    handler.handler(event, undefined, callback );
+    await handler.handler(event, undefined, callback );
 };
 
 function read_keywords_from_file(fname) {
@@ -55,10 +59,3 @@ function read_keywords_from_file(fname) {
     });
     return kws;
 }
 
-function write_results(fname, data) {
-    fs.writeFile(fname || 'results.json', data, (err) => {
-        if (err) throw err;
-        console.log('Results written to file');
-    });
-}
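With scrape now declared async and awaiting handler.handler, callers can reliably sequence scrapes or detect completion; a minimal sketch:

const se_scraper = require('./index.js');

(async () => {
    await se_scraper.scrape({ search_engine: 'google', keywords: ['test'] }, (err, response) => {
        if (err) console.error(err);
    });
    console.log('scrape finished'); // runs only after the handler resolves
})();

The removed write_results helper is not dropped outright: it reappears in src/node_scraper.js below, where the handler has direct access to the collected results.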
jformat.py (new executable file)
@@ -0,0 +1,6 @@
+import pprint
+import sys
+import json
+
+if len(sys.argv) == 2:
+    print(pprint.pformat(json.load(open(sys.argv[1]))))
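jformat.py is a small helper for inspecting the single-line JSON files the scraper writes. The file carries no shebang, so invoke it through the interpreter, e.g.:

python3 jformat.py data.json

which parses the file and prints it pretty-formatted via pprint.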
@@ -1,2 +1,2 @@
-GOOGL
-AAPL
+test
+water is blue
package.json
@@ -1,6 +1,6 @@
 {
     "name": "se-scraper",
-    "version": "1.0.5",
+    "version": "1.0.6",
     "description": "A simple module which uses puppeteer to scrape several search engines.",
     "main": "index.js",
     "scripts": {
File diff suppressed because one or more lines are too long
run.js
@@ -6,22 +6,24 @@ let config = {
     // if random_user_agent is set to True, a random user agent is chosen
     random_user_agent: false,
     // get meta data of scraping in return object
-    write_meta_data: 'true',
+    write_meta_data: false,
     // how long to sleep between requests. a random sleep interval within the range [a,b]
     // is drawn before every request. empty string for no sleeping.
     sleep_range: '[1,1]',
     // which search engine to scrape
-    search_engine: 'yahoo_news',
+    search_engine: 'google',
     // whether debug information should be printed
-    debug: 'true',
+    debug: true,
     // whether verbose program output should be printed
-    verbose: 'false',
+    verbose: false,
     // an array of keywords to scrape
-    keywords: ['GOOGL', ],
+    keywords: ['scrapeulous.com', ],
     // alternatively you can specify a keyword_file. this overwrites the keywords array
-    keyword_file: './keywords.txt',
+    keyword_file: '',
     // whether to start the browser in headless mode
     headless: false,
+    // path to output file, data will be stored in JSON
+    output_file: 'data.json',
 };
 
 se_scraper.scrape(config, (err, response) => {
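With these defaults, node run.js scrapes Google for 'scrapeulous.com' and writes the results to data.json, the file added at the top of this commit; keyword_file is cleared so the keywords array is no longer overridden.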
src/node_scraper.js
@@ -1,5 +1,6 @@
 const puppeteer = require('puppeteer');
 const zlib = require('zlib');
+var fs = require('fs');
 
 // local module imports
 const google = require('./modules/google.js');
@@ -12,6 +13,14 @@ const meta = require('./modules/metadata.js');
 const duckduckgo = require('./modules/duckduckgo.js');
 const tickersearch = require('./modules/ticker_search.js');
 
+
+function write_results(fname, data) {
+    fs.writeFileSync(fname, data, (err) => {
+        if (err) throw err;
+        console.log(`Results written to file ${fname}`);
+    });
+}
+
 module.exports.handler = async function handler (event, context, callback) {
 
     try {
@@ -127,6 +136,10 @@ module.exports.handler = async function handler (event, context, callback) {
         }
     }
 
+    if (event.output_file) {
+        write_results(event.output_file, JSON.stringify(results));
+    }
+
     let response = {
         headers: {
             'Content-Type': 'text/json',
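One caveat: fs.writeFileSync is synchronous and ignores a function passed as its options argument, so the (err) => { ... } callback above never runs and the success message never prints; write errors surface as thrown exceptions instead. A corrected sketch of the helper (a suggested fix, not part of this commit):

function write_results(fname, data) {
    // writeFileSync throws on failure, so no callback is needed
    fs.writeFileSync(fname, data);
    console.log(`Results written to file ${fname}`);
}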