added handle_results

2025-08-16 08:37:54 +02:00 · 2019-01-27 20:08:09 +01:00
parent 86a66a09fd
commit a20b6c79a9
7 changed files with 30 additions and 12 deletions
--- a/data.json
+++ b/data.json
@@ -1 +1 @@
-{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 18:07:33 GMT","num_results":"Ungefähr 101 Ergebnisse","no_results":false,"effective_query":"","results":[]}}
+{"incolumitas.com news":{"time":"Sun, 27 Jan 2019 19:07:41 GMT","num_results":"Ungefähr 691.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/url?q=https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggUMAA&usg=AOvVaw1PmFDpPlIFYilxfQb1ym1W","title":"Coding, Learning and Business Ideas – Tutorial ... - Incolumitas","snippet":"29 Oct 2018 ... If you are a scientist, you might be interested in the spreading of fake news for \nexample. You want to monitor the sources of fake news.","visible_link":"","date":"","rank":1},{"link":"/url?q=https://incolumitas.com/2018/11/18/introduction-machine-learning-2019/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggaMAE&usg=AOvVaw37PW4eBuCGv2zkvXt0sZkw","title":"Coding, Learning and Business Ideas – Introduction to ... - Incolumitas","snippet":"18 Nov 2018 ... I want to recognize news articles soley based on one input: A link to the \ndocument. The algorithm should automatically recognize whether the ...","visible_link":"","date":"","rank":2},{"link":"/url?q=https://incolumitas.com/pages/about/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgggMAI&usg=AOvVaw2-XXZq6XPgh5orgEd_2rr6","title":"Coding, Learning and Business Ideas – About - Incolumitas","snippet":"As far as I know, this security vulnerability is still unfixed in late 2018. See my \noriginal blog post here. An news article from Heise and an article from ars \ntechnica.","visible_link":"","date":"","rank":3},{"link":"/url?q=https://incolumitas.com/uploads/2013/12/links.txt&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgglMAM&usg=AOvVaw0cAkft0dN7WcT6DCDbDYZM","title":"http://www.urbandictionary.com/define.php?term=holy%20shit http ...","snippet":"... 
http://www.tumblr.com/tagged/holy-shit http://pitchfork.com/news/52370-watch-\nholy-shit-perform-at-ryan-mcginley-gallery-opening-in-san-francisco/ ...","visible_link":"","date":"","rank":4},{"link":"/url?q=https://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggqMAQ&usg=AOvVaw0I45HMQSEjMeuKozexkwcI","title":"Coding, Learning and Business Ideas – Scraping and ... - Incolumitas","snippet":"12 Nov 2014 ... The supported search types. For instance, Google supports Video Search, Image \nSearch, News search search_types = [] def __init__(self, html, ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get-search-results/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggwMAU&usg=AOvVaw39UsbJVwdj3lCe2hAGZlgq","title":"Coding, Learning and Business Ideas ... - Incolumitas","snippet":"18 Feb 2014 ... UPDATE on 18th February 2014: This python module has now its own github \nrepository! The plugin can extract All links Link titles The ...","visible_link":"","date":"","rank":6},{"link":"/url?q=https://www.npmjs.com/package/se-scraper&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg2MAY&usg=AOvVaw2qrTw_GOSWhggUYNgRdAPy","title":"se-scraper - npm","snippet":"3 days ago ... Google; Google News; Google News New (https://news.google.com) .... 'https://\nincolumitas.com/2018/10/29/youtube-puppeteer-scraping/',. title:.","visible_link":"","date":"","rank":7},{"link":"/url?q=https://pypi.org/project/CountryGoogleScraper/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg8MAc&usg=AOvVaw039KcMaYiUHgES64a5tW9F","title":"CountryGoogleScraper · PyPI","snippet":"News GoogleScraper becomes **finally mature!** In the last months I didn't ..... 
[4\n]: http://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get- ...","visible_link":"","date":"","rank":8},{"link":"/url?q=https://pycoders.com/issues/69&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghBMAg&usg=AOvVaw1sJXhzn4R2ihjximQOaA_W","title":"PyCoder's Weekly | Issue #69","snippet":"7 Jun 2013 ... To the keep up with all the breaking Python news follow @pycoders. Support us \non Gittip -- ... News and Developments. ... (incolumitas.com).","visible_link":"","date":"","rank":9},{"link":"/url?q=https://news.ycombinator.com/item%3Fid%3D11925325&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghHMAk&usg=AOvVaw1F_732iyOahhNsWaJyvYM2","title":"Typosquatting in Programming Language Package ... - Hacker News","snippet":"Typosquatting in Programming Language Package Managers [pdf] (incolumitas.\ncom). 3 points by henry_flower on June 17, 2016 | hide | past | web | favorite ...","visible_link":"","date":"","rank":10}]}}
--- a/examples/pluggable.js
+++ b/examples/pluggable.js
@@ -27,7 +27,11 @@ module.exports = class Pluggable {
    }

    async handle_metadata(args) {
-        // silence
+        // store scraping metadata somewhere
+    }
+
+    async handle_results(args) {
+        // store the results somewhere
    }

    async start_browser(args={}) {
@@ -35,11 +39,16 @@ module.exports = class Pluggable {

        let launch_args = {
            args: args.chromeFlags || this.chromeFlags,
-            headless: args.headless || this.headless,
+            headless: args.headless,
        };

+        if (launch_args.headless === undefined) {
+            launch_args.headless = this.headless;
+        }
+
        this.browser = await puppeteer.launch(launch_args);
        console.log('Loaded custom function get_browser()');
+        console.log(launch_args);

        return this.browser;
    }
--- a/index.js
+++ b/index.js
@@ -33,7 +33,7 @@ exports.scrape = async function(config, callback) {
 		// path to js module that extends functionality
 		// this module should export the functions:
 		// get_browser, handle_metadata, close_browser
-		custom_func: 'examples/pluggable.js',
+		custom_func: '',
 	};

 	for (var key in config) {
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "se-scraper",
-  "version": "1.1.1",
+  "version": "1.1.3",
  "description": "A simple module which uses puppeteer to scrape several search engines.",
  "homepage": "https://scrapeulous.com/",
  "main": "index.js",
--- a/run.js
+++ b/run.js
@@ -5,7 +5,7 @@ let config = {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to True, a random user agent is chosen
-    random_user_agent: false,
+    random_user_agent: true,
    // get meta data of scraping in return object
    write_meta_data: false,
    // how long to sleep between requests. a random sleep interval within the range [a,b]
@@ -20,7 +20,7 @@ let config = {
    // this output is informational
    verbose: false,
    // an array of keywords to scrape
-    keywords: ['scrapeulous.com', ],
+    keywords: ['incolumitas.com news', ],
    // alternatively you can specify a keyword_file. this overwrites the keywords array
    keyword_file: '',
    // whether to start the browser in headless mode
@@ -38,7 +38,7 @@ let config = {
    custom_func: resolve('examples/pluggable.js'),
 };

-se_scraper.scrape(config, (err, response) => {
+function callback(err, response) {
    if (err) { console.error(err) }

    /* response object has the following properties:
@@ -49,4 +49,6 @@ se_scraper.scrape(config, (err, response) => {
     */

    console.dir(response.results, {depth: null, colors: true});
-});
+}
+
+se_scraper.scrape(config, callback);
--- a/src/modules/google.js
+++ b/src/modules/google.js
@@ -47,7 +47,7 @@ async function scrape_google_pup(page, event, context) {
            }

 			await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT });
-            await sfunctions.sleep(100);
+            await sfunctions.sleep(500);

 		} catch (e) {
 			console.error(`Problem with scraping ${keyword}.`);
@@ -175,7 +175,7 @@ function parse_google_results(html) {
 	$('#center_col .g').each((i, link) => {
 		results.push({
 		  link: $(link).find('.r a').attr('href'),
-		  title: $(link).find('.r a h3').text(),
+		  title: $(link).find('.r a').text(),
 		  snippet: $(link).find('span.st').text(),
 		  visible_link: $(link).find('.r cite').text(),
 		  date: $(link).find('span.f').text() || '',
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -68,7 +68,7 @@ module.exports.handler = async function handler (config, context, callback) {

        let launch_args = {
 			args: ADDITIONAL_CHROME_FLAGS,
-			headless: config.headless !== false,
+			headless: config.headless,
 		};

 		if (config.debug === true) {
@@ -149,6 +149,13 @@ module.exports.handler = async function handler (config, context, callback) {
 			results = zlib.deflateSync(results).toString('base64');
 		}

+		if (pluggable && pluggable.handle_results) {
+			await pluggable.handle_results({
+				config: config,
+				results: results,
+			});
+		}
+
 		if (config.write_meta_data === true) {
            metadata.id = `${config.job_name} ${config.chunk_lines}`;
 			metadata.chunk_lines = config.chunk_lines;