mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-02-17 00:50:48 +01:00
added handle_results
This commit is contained in:
parent
86a66a09fd
commit
a20b6c79a9
@ -1 +1 @@
|
||||
{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 18:07:33 GMT","num_results":"Ungefähr 101 Ergebnisse","no_results":false,"effective_query":"","results":[]}}
|
||||
{"incolumitas.com news":{"time":"Sun, 27 Jan 2019 19:07:41 GMT","num_results":"Ungefähr 691.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/url?q=https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggUMAA&usg=AOvVaw1PmFDpPlIFYilxfQb1ym1W","title":"Coding, Learning and Business Ideas – Tutorial ... - Incolumitas","snippet":"29 Oct 2018 ... If you are a scientist, you might be interested in the spreading of fake news for \nexample. You want to monitor the sources of fake news.","visible_link":"","date":"","rank":1},{"link":"/url?q=https://incolumitas.com/2018/11/18/introduction-machine-learning-2019/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggaMAE&usg=AOvVaw37PW4eBuCGv2zkvXt0sZkw","title":"Coding, Learning and Business Ideas – Introduction to ... - Incolumitas","snippet":"18 Nov 2018 ... I want to recognize news articles soley based on one input: A link to the \ndocument. The algorithm should automatically recognize whether the ...","visible_link":"","date":"","rank":2},{"link":"/url?q=https://incolumitas.com/pages/about/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgggMAI&usg=AOvVaw2-XXZq6XPgh5orgEd_2rr6","title":"Coding, Learning and Business Ideas – About - Incolumitas","snippet":"As far as I know, this security vulnerability is still unfixed in late 2018. See my \noriginal blog post here. An news article from Heise and an article from ars \ntechnica.","visible_link":"","date":"","rank":3},{"link":"/url?q=https://incolumitas.com/uploads/2013/12/links.txt&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgglMAM&usg=AOvVaw0cAkft0dN7WcT6DCDbDYZM","title":"http://www.urbandictionary.com/define.php?term=holy%20shit http ...","snippet":"... 
http://www.tumblr.com/tagged/holy-shit http://pitchfork.com/news/52370-watch-\nholy-shit-perform-at-ryan-mcginley-gallery-opening-in-san-francisco/ ...","visible_link":"","date":"","rank":4},{"link":"/url?q=https://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggqMAQ&usg=AOvVaw0I45HMQSEjMeuKozexkwcI","title":"Coding, Learning and Business Ideas – Scraping and ... - Incolumitas","snippet":"12 Nov 2014 ... The supported search types. For instance, Google supports Video Search, Image \nSearch, News search search_types = [] def __init__(self, html, ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get-search-results/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggwMAU&usg=AOvVaw39UsbJVwdj3lCe2hAGZlgq","title":"Coding, Learning and Business Ideas ... - Incolumitas","snippet":"18 Feb 2014 ... UPDATE on 18th February 2014: This python module has now its own github \nrepository! The plugin can extract All links Link titles The ...","visible_link":"","date":"","rank":6},{"link":"/url?q=https://www.npmjs.com/package/se-scraper&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg2MAY&usg=AOvVaw2qrTw_GOSWhggUYNgRdAPy","title":"se-scraper - npm","snippet":"3 days ago ... Google; Google News; Google News New (https://news.google.com) .... 'https://\nincolumitas.com/2018/10/29/youtube-puppeteer-scraping/',. title:.","visible_link":"","date":"","rank":7},{"link":"/url?q=https://pypi.org/project/CountryGoogleScraper/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg8MAc&usg=AOvVaw039KcMaYiUHgES64a5tW9F","title":"CountryGoogleScraper · PyPI","snippet":"News GoogleScraper becomes **finally mature!** In the last months I didn't ..... 
[4\n]: http://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get- ...","visible_link":"","date":"","rank":8},{"link":"/url?q=https://pycoders.com/issues/69&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghBMAg&usg=AOvVaw1sJXhzn4R2ihjximQOaA_W","title":"PyCoder's Weekly | Issue #69","snippet":"7 Jun 2013 ... To the keep up with all the breaking Python news follow @pycoders. Support us \non Gittip -- ... News and Developments. ... (incolumitas.com).","visible_link":"","date":"","rank":9},{"link":"/url?q=https://news.ycombinator.com/item%3Fid%3D11925325&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghHMAk&usg=AOvVaw1F_732iyOahhNsWaJyvYM2","title":"Typosquatting in Programming Language Package ... - Hacker News","snippet":"Typosquatting in Programming Language Package Managers [pdf] (incolumitas.\ncom). 3 points by henry_flower on June 17, 2016 | hide | past | web | favorite ...","visible_link":"","date":"","rank":10}]}}
|
@ -27,7 +27,11 @@ module.exports = class Pluggable {
|
||||
}
|
||||
|
||||
// Hook for persisting scraping metadata. This implementation is an empty
// stub: `args` is accepted but never read, and the returned promise resolves
// to undefined.
// NOTE(review): the shape of `args` is not visible in this view — confirm
// against the caller before relying on any particular fields.
async handle_metadata(args) {
|
||||
// silence
|
||||
// store scraping metadata somewhere
|
||||
}
|
||||
|
||||
// Hook for persisting scrape results. This implementation is an empty stub:
// `args` is accepted but never read, and the returned promise resolves to
// undefined.
// The handler hunk visible elsewhere in this diff awaits
// `pluggable.handle_results({ config, results })` when the method exists.
// NOTE(review): that same hunk shows `results` being reassigned to a
// deflated, base64-encoded string inside a conditional whose test is not
// visible here — presumably implementations must cope with both forms; confirm.
async handle_results(args) {
|
||||
// store the results somewhere
|
||||
}
|
||||
|
||||
async start_browser(args={}) {
|
||||
@ -35,11 +39,16 @@ module.exports = class Pluggable {
|
||||
|
||||
let launch_args = {
|
||||
args: args.chromeFlags || this.chromeFlags,
|
||||
headless: args.headless || this.headless,
|
||||
headless: args.headless,
|
||||
};
|
||||
|
||||
if (launch_args.headless === undefined) {
|
||||
launch_args.headless = this.headless;
|
||||
}
|
||||
|
||||
this.browser = await puppeteer.launch(launch_args);
|
||||
console.log('Loaded custom function get_browser()');
|
||||
console.log(launch_args);
|
||||
|
||||
return this.browser;
|
||||
}
|
||||
|
2
index.js
2
index.js
@ -33,7 +33,7 @@ exports.scrape = async function(config, callback) {
|
||||
// path to js module that extends functionality
|
||||
// this module should export the functions:
|
||||
// start_browser, close_browser, handle_metadata, handle_results
|
||||
custom_func: 'examples/pluggable.js',
|
||||
custom_func: '',
|
||||
};
|
||||
|
||||
for (var key in config) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "se-scraper",
|
||||
"version": "1.1.1",
|
||||
"version": "1.1.3",
|
||||
"description": "A simple module which uses puppeteer to scrape several search engines.",
|
||||
"homepage": "https://scrapeulous.com/",
|
||||
"main": "index.js",
|
||||
|
10
run.js
10
run.js
@ -5,7 +5,7 @@ let config = {
|
||||
// the user agent to scrape with
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
// if random_user_agent is set to true, a random user agent is chosen
|
||||
random_user_agent: false,
|
||||
random_user_agent: true,
|
||||
// get meta data of scraping in return object
|
||||
write_meta_data: false,
|
||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||
@ -20,7 +20,7 @@ let config = {
|
||||
// this output is informational
|
||||
verbose: false,
|
||||
// an array of keywords to scrape
|
||||
keywords: ['scrapeulous.com', ],
|
||||
keywords: ['incolumitas.com news', ],
|
||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||
keyword_file: '',
|
||||
// whether to start the browser in headless mode
|
||||
@ -38,7 +38,7 @@ let config = {
|
||||
custom_func: resolve('examples/pluggable.js'),
|
||||
};
|
||||
|
||||
se_scraper.scrape(config, (err, response) => {
|
||||
function callback(err, response) {
|
||||
if (err) { console.error(err) }
|
||||
|
||||
/* response object has the following properties:
|
||||
@ -49,4 +49,6 @@ se_scraper.scrape(config, (err, response) => {
|
||||
*/
|
||||
|
||||
console.dir(response.results, {depth: null, colors: true});
|
||||
});
|
||||
}
|
||||
|
||||
se_scraper.scrape(config, callback);
|
||||
|
@ -47,7 +47,7 @@ async function scrape_google_pup(page, event, context) {
|
||||
}
|
||||
|
||||
await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
|
||||
await sfunctions.sleep(100);
|
||||
await sfunctions.sleep(500);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}.`);
|
||||
@ -175,7 +175,7 @@ function parse_google_results(html) {
|
||||
$('#center_col .g').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.r a').attr('href'),
|
||||
title: $(link).find('.r a h3').text(),
|
||||
title: $(link).find('.r a').text(),
|
||||
snippet: $(link).find('span.st').text(),
|
||||
visible_link: $(link).find('.r cite').text(),
|
||||
date: $(link).find('span.f').text() || '',
|
||||
|
@ -68,7 +68,7 @@ module.exports.handler = async function handler (config, context, callback) {
|
||||
|
||||
let launch_args = {
|
||||
args: ADDITIONAL_CHROME_FLAGS,
|
||||
headless: config.headless !== false,
|
||||
headless: config.headless,
|
||||
};
|
||||
|
||||
if (config.debug === true) {
|
||||
@ -149,6 +149,13 @@ module.exports.handler = async function handler (config, context, callback) {
|
||||
results = zlib.deflateSync(results).toString('base64');
|
||||
}
|
||||
|
||||
if (pluggable && pluggable.handle_results) {
|
||||
await pluggable.handle_results({
|
||||
config: config,
|
||||
results: results,
|
||||
});
|
||||
}
|
||||
|
||||
if (config.write_meta_data === true) {
|
||||
metadata.id = `${config.job_name} ${config.chunk_lines}`;
|
||||
metadata.chunk_lines = config.chunk_lines;
|
||||
|
Loading…
Reference in New Issue
Block a user