Merge branch 'problame/zfs-command-logging-and-status' into problame/holds-release-and-hold-leak-fix-v2

2024-11-22 08:23:50 +01:00 · 2020-03-29 19:04:21 +02:00 · 2020-03-29 19:04:21 +02:00 · bc291e622f
commit bc291e622f
parent 8bfaba1665 deeca76aa0
6 changed files with 410 additions and 2 deletions
--- a/zfs/zfscmd/zfscmd-logging-scraper/README.md
+++ b/zfs/zfscmd/zfscmd-logging-scraper/README.md
@ -0,0 +1,11 @@
+The tool in this package (`go run . -h`) scrapes log lines produced by the `github.com/zrepl/zrepl/zfs/zfscmd` package
+into a stream of JSON objects.
+
+The `analysis.ipynb` then runs some basic analysis on the collected log output.
+
+## Deps for the `scrape_graylog_csv.bash` script
+
+```
+pip install --upgrade git+https://github.com/lk-jeffpeck/csvfilter.git@ec433f14330fbbf5d41f56febfeedac22868a949
+```
+
--- a/zfs/zfscmd/zfscmd-logging-scraper/analysis.ipynb
+++ b/zfs/zfscmd/zfscmd-logging-scraper/analysis.ipynb
@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import re\n",
+    "\n",
+    "%matplotlib notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ./parsed.json is the stdout of the scraper tool in this directory\n",
+    "df = pd.read_json(\"./parsed.json\", lines=True)\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_ds(entity):\n",
+    "    m = re.search(r\"(?P<dataset>[^@#]*)([@#].+)?\", entity)\n",
+    "    return m.group(\"dataset\")\n",
+    "    \n",
+    "def parse_cmd(row):\n",
+    "    cmd  = row.Cmd\n",
+    "    binary, verb, *tail = re.split(r\"\\s+\", cmd) # NOTE whitespace in dataset names => don't use comp\n",
+    "    \n",
+    "    dataset = None\n",
+    "    if binary == \"zfs\":\n",
+    "        if verb == \"send\":            \n",
+    "            dataset = parse_ds(tail[-1])\n",
+    "            if \"-n\" in tail:\n",
+    "                verb = \"send-dry\"\n",
+    "        elif verb == \"recv\" or verb == \"receive\":\n",
+    "            verb = \"receive\"\n",
+    "            if len(tail) > 0:\n",
+    "                dataset = parse_ds(tail[-1])\n",
+    "            else:\n",
+    "                verb = \"receive-CLI-test\"\n",
+    "        elif verb == \"get\":\n",
+    "            dataset = parse_ds(tail[-1])\n",
+    "        elif verb == \"list\":\n",
+    "            if \"-r\" in tail and \"-d\" in tail and \"1\" in tail:\n",
+    "                dataset = parse_ds(tail[-1])\n",
+    "                verb = \"list-single-dataset\"\n",
+    "            else:\n",
+    "                dataset = \"!ALL_POOLS!\"\n",
+    "                verb = \"list-all-filesystems\"\n",
+    "        elif verb == \"bookmark\":\n",
+    "            dataset = parse_ds(tail[-2])\n",
+    "        elif verb == \"hold\":\n",
+    "            dataset = parse_ds(tail[-1])\n",
+    "        elif verb == \"snapshot\":\n",
+    "            dataset = parse_ds(tail[-1])\n",
+    "        elif verb == \"release\":\n",
+    "            dss = tail[-1].split(\",\")\n",
+    "            if len(dss) > 1:\n",
+    "                raise Exception(\"cannot handle batch-release\")\n",
+    "            dataset = parse_ds(dss[0])\n",
+    "        elif verb == \"holds\" and \"-H\" in tail:\n",
+    "            dss = tail[-1].split(\",\")\n",
+    "            if len(dss) > 1:\n",
+    "                raise Exception(\"cannot handle batch-holds\")\n",
+    "            dataset = parse_ds(dss[0])\n",
+    "        elif verb == \"destroy\":\n",
+    "            dss = tail[-1].split(\",\")\n",
+    "            if len(dss) > 1:\n",
+    "                raise Exception(\"cannot handle batch-destroy\")\n",
+    "            dataset = parse_ds(dss[0])\n",
+    "    \n",
+    "    return {'action':binary + \"-\" + verb, 'dataset': dataset }\n",
+    "    \n",
+    "    \n",
+    "res = df.apply(parse_cmd, axis='columns', result_type='expand')\n",
+    "res = pd.concat([df, res], axis='columns')\n",
+    "for cat in [\"action\", \"dataset\"]:\n",
+    "    res[cat] = res[cat].astype('category')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res[\"OtherTime\"] = res.TotalTime - res.Usertime - res.Systime\n",
+    "x = res.melt(id_vars=[\"action\", \"dataset\"], value_vars=[\"TotalTime\", \"OtherTime\", \"Usertime\", \"Systime\"])\n",
+    "x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"commands with NaN values\")\n",
+    "set(x[x.isna().any(axis=1)].action.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# (~x.action.astype('str').isin([\"zfs-send\", \"zfs-recv\"]))\n",
+    "totaltimes = x[(x.variable == \"TotalTime\")].groupby([\"action\", \"dataset\"]).sum().reset_index()\n",
+    "display(totaltimes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "totaltimes_by_action = totaltimes.groupby(\"action\").sum().sort_values(by=\"value\")\n",
+    "totaltimes_by_action.plot.barh()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "totaltimes.groupby(\"dataset\").sum().plot.barh(fontsize=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "most_expensive_action = totaltimes_by_action.idxmax().value\n",
+    "display(most_expensive_action)\n",
+    "most_expensive_action_by_dataset = totaltimes[totaltimes.action == most_expensive_action].groupby(\"dataset\").sum().sort_values(by=\"value\")\n",
+    "most_expensive_action_by_dataset.plot.barh(rot=50, fontsize=5, figsize=(10, 20))\n",
+    "plt.savefig('most-expensive-command.pdf')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/zfs/zfscmd/zfscmd-logging-scraper/scrape_graylog_csv.bash
+++ b/zfs/zfscmd/zfscmd-logging-scraper/scrape_graylog_csv.bash
@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# This script converts output that was produced by zrepl and captured by Graylog
+# back to something that the scraper in this package's main can understand.
+# It reads CSV from stdin and passes the rewritten lines to the scraper,
+# which is started via `go run .` in the current directory.
+#
+# Intended for zrepl's syslog outlet with the human format, i.e. this zrepl config:
+# logging:
+# - type: syslog
+#   level: debug
+#   format: human
+
+
+# Pipeline stages:
+# 1. csvfilter: skip the first input line (presumably the CSV header) and keep
+#    fields 0 and 2 (presumably timestamp and message -- verify against the export).
+# 2. sed: in each line, replace the ", " that follows the first field with
+#    " [LEVEL]". The scraper's line regex expects two bracketed fields in front
+#    of "[zfs.cmd]"; NOTE(review): presumably the syslog output lacks the level
+#    field, so a placeholder is inserted here -- confirm.
+# 3. scraper: run verbosely (-v) with a date regex that captures everything up
+#    to the first " [" as the date, and the timestamp layout given in --dateFormat.
+csvfilter --skip 1 -f 0,2 -q '"' --out-quotechar=' ' /dev/stdin | sed -E 's/^\s*([^,]*), /\1 [LEVEL]/' | \
+    go run . -v \
+    --dateRE '^([^\[]+) (\[.*)' \
+    --dateFormat '2006-01-02T15:04:05.999999999Z'
+
--- a/zfs/zfscmd/zfscmd-logging-scraper/zfscmd_logging_scraper.go
+++ b/zfs/zfscmd/zfscmd-logging-scraper/zfscmd_logging_scraper.go
@ -0,0 +1,122 @@
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"os"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/go-logfmt/logfmt"
+	"github.com/pkg/errors"
+	"github.com/spf13/pflag"
+	"github.com/zrepl/zrepl/daemon/logging"
+)
+
+// RuntimeLine is the parsed form of one "command exited" log line.
+// main encodes every successfully parsed RuntimeLine as one JSON object.
+type RuntimeLine struct {
+	// LogTime is the timestamp parsed from the beginning of the log line.
+	LogTime time.Time
+	// Cmd is the value of the "cmd" key.
+	Cmd string
+	// TotalTime is the value of the "total_time_s" key.
+	TotalTime time.Duration
+	// Usertime is the value of the "usertime_s" key.
+	Usertime time.Duration
+	// Systime is the value of the "systemtime_s" key.
+	Systime time.Duration
+	// Error is the value of the "err" key; it stays empty if the key is absent.
+	Error string
+}
+
+// humanFormatterLineRE matches a log line (without its leading date) that
+// starts with exactly two bracketed fields followed by "[zfs.cmd]:" and the
+// message "command exited with error" or "command exited without error".
+//
+// Capture groups: 1 = the last of the two leading bracketed fields,
+// 2 = "with" or "without", 3 = the remainder of the line, which
+// parseHumanFormatterNodate decodes as logfmt key/value pairs.
+//
+// NOTE(review): the "." in "zfs.cmd" is not escaped and therefore matches any character.
+var humanFormatterLineRE = regexp.MustCompile(`^(\[[^\]]+\]){2}\[zfs.cmd\]:\s+command\s+exited\s+(with|without)\s+error\s+(.+)`)
+
+// parseSecs interprets s as a number of seconds by appending the unit "s"
+// and handing the result to time.ParseDuration.
+// On failure it returns a zero duration and the wrapped parse error.
+func parseSecs(s string) (time.Duration, error) {
+	parsed, parseErr := time.ParseDuration(s + "s")
+	if parseErr == nil {
+		return parsed, nil
+	}
+	return 0, errors.Wrapf(parseErr, "parse duration %q", s)
+}
+
+// parseHumanFormatterNodate parses a "command exited" log line whose leading
+// date has already been removed.
+//
+// The line must match humanFormatterLineRE; the text after the message is
+// decoded as logfmt key/value pairs and mapped onto the RuntimeLine fields.
+// The "invocation" key is accepted but not stored.
+//
+// An error is returned if the line does not match the regex, if an unknown
+// key is encountered, if one of the duration values cannot be parsed, or if
+// the logfmt decoder reports an error. On error the returned RuntimeLine may
+// be partially filled and must not be used.
+func parseHumanFormatterNodate(line string) (l RuntimeLine, err error) {
+	m := humanFormatterLineRE.FindStringSubmatch(line)
+	if m == nil {
+		return l, errors.New("human formatter regex does not match")
+	}
+
+	d := logfmt.NewDecoder(strings.NewReader(m[3]))
+	for d.ScanRecord() {
+		for d.ScanKeyval() {
+			k := string(d.Key())
+			v := string(d.Value())
+			switch k {
+			case "cmd":
+				l.Cmd = v
+			case "total_time_s":
+				l.TotalTime, err = parseSecs(v)
+			case "usertime_s":
+				l.Usertime, err = parseSecs(v)
+			case "systemtime_s":
+				l.Systime, err = parseSecs(v)
+			case "err":
+				l.Error = v
+			case "invocation":
+				// known key that is not part of RuntimeLine
+			default:
+				return l, errors.Errorf("unknown key %q", k)
+			}
+			// Report an unparsable duration instead of silently emitting
+			// a record with a zero value for that field.
+			if err != nil {
+				return l, errors.Wrapf(err, "value of key %q", k)
+			}
+		}
+	}
+	if d.Err() != nil {
+		return l, errors.Wrap(d.Err(), "decode key value pairs")
+	}
+	return l, nil
+}
+
+// parseLogLine parses one complete log line.
+//
+// dateRegex must produce exactly two capture groups: the date and the rest
+// of the line. The date is parsed using dateFormat, the rest is handed to
+// parseHumanFormatterNodate (both after trimming surrounding whitespace).
+//
+// An error is returned if the regex does not match, if the date cannot be
+// parsed, or if the rest of the line cannot be parsed. A date that fails to
+// parse is reported as an error rather than a panic: the date regex can match
+// lines that are not zrepl log lines at all, and the caller skips lines for
+// which an error is returned.
+func parseLogLine(line string) (l RuntimeLine, err error) {
+	m := dateRegex.FindStringSubmatch(line)
+	if len(m) != 3 {
+		return l, errors.Errorf("invalid date regex match %v", m)
+	}
+	date, err := time.Parse(dateFormat, strings.TrimSpace(m[1]))
+	if err != nil {
+		return l, errors.Wrapf(err, "cannot parse date %q", m[1])
+	}
+
+	l, err = parseHumanFormatterNodate(strings.TrimSpace(m[2]))
+	l.LogTime = date
+	return l, err
+}
+
+// Command-line configuration; the values are set in main.
+var (
+	// verbose makes main report skipped lines on stderr.
+	verbose bool
+	// dateRegexArg is the date regex as given on the command line,
+	// dateRegex is its compiled form.
+	dateRegexArg string
+	dateRegex    *regexp.Regexp
+	// dateFormat is the Go time layout used to parse the date of a log line.
+	dateFormat string
+)
+
+// main reads log lines from stdin, parses each of them with parseLogLine and
+// writes every successfully parsed line to stdout as a JSON object.
+// Lines that cannot be parsed are skipped; in verbose mode the error and the
+// offending line are printed to stderr.
+// Failures to encode a record or to read stdin cause a panic.
+func main() {
+	pflag.StringVarP(&dateRegexArg, "dateRE", "d", `^([^\[]+)(.*)`, "date regex")
+	pflag.StringVar(&dateFormat, "dateFormat", logging.HumanFormatterDateFormat, "go date format")
+	pflag.BoolVarP(&verbose, "verbose", "v", false, "verbose")
+	pflag.Parse()
+
+	dateRegex = regexp.MustCompile(dateRegexArg)
+
+	scanner := bufio.NewScanner(os.Stdin)
+	scanner.Split(bufio.ScanLines)
+	encoder := json.NewEncoder(os.Stdout)
+
+	for scanner.Scan() {
+		text := scanner.Text()
+		record, parseErr := parseLogLine(text)
+		if parseErr != nil {
+			if verbose {
+				fmt.Fprintf(os.Stderr, "ignoring line after error %v\n", parseErr)
+				fmt.Fprintf(os.Stderr, "offending line was: %s\n", text)
+			}
+			continue
+		}
+		if encErr := encoder.Encode(record); encErr != nil {
+			panic(encErr)
+		}
+	}
+
+	if scanErr := scanner.Err(); scanErr != nil {
+		panic(scanErr)
+	}
+}
--- a/zfs/zfscmd/zfscmd-logging-scraper/zfscmd_logging_scrapter_test.go
+++ b/zfs/zfscmd/zfscmd-logging-scraper/zfscmd_logging_scrapter_test.go
@ -0,0 +1,78 @@
+package main
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestParseHumanFormatterNodate checks parseHumanFormatterNodate against
+// date-less log lines.
+//
+// humanFormatterLineRE requires two bracketed fields in front of "[zfs.cmd]",
+// so every positive input starts with a level field followed by the job
+// field. A line with only the job field is expected to be rejected.
+func TestParseHumanFormatterNodate(t *testing.T) {
+
+	type testCase struct {
+		Name      string
+		Input     string
+		Expect    *RuntimeLine // expected result; mutually exclusive with ExpectErr
+		ExpectErr string       // expected error message; mutually exclusive with Expect
+	}
+
+	secs := func(s string) time.Duration {
+		d, err := parseSecs(s)
+		require.NoError(t, err)
+		return d
+	}
+
+	tcs := []testCase{
+		{
+			Name:  "human-formatter-noerror",
+			Input: `[INFO][jobname][zfs.cmd]: command exited without error usertime_s="0.008445" cmd="zfs list -H -p -o name -r -t filesystem,volume" systemtime_s="0.033783" invocation="84" total_time_s="0.037828619"`,
+			Expect: &RuntimeLine{
+				Cmd:       "zfs list -H -p -o name -r -t filesystem,volume",
+				TotalTime: secs("0.037828619"),
+				Usertime:  secs("0.008445"),
+				Systime:   secs("0.033783"),
+				Error:     "",
+			},
+		},
+		{
+			Name:  "human-formatter-witherror",
+			Input: `[INFO][jobname][zfs.cmd]: command exited with error usertime_s="0.008445" cmd="zfs list -H -p -o name -r -t filesystem,volume" systemtime_s="0.033783" invocation="84" total_time_s="0.037828619" err="some error"`,
+			Expect: &RuntimeLine{
+				Cmd:       "zfs list -H -p -o name -r -t filesystem,volume",
+				TotalTime: secs("0.037828619"),
+				Usertime:  secs("0.008445"),
+				Systime:   secs("0.033783"),
+				Error:     "some error",
+			},
+		},
+		{
+			Name:  "from graylog",
+			Input: `[INFO][csnas][zfs.cmd]:  command  exited  without  error  usertime_s="0"  cmd="zfs  send  -i  zroot/ezjail/synapse-12@zrepl_20200329_095518_000  zroot/ezjail/synapse-12@zrepl_20200329_102454_000"  total_time_s="0.101598591"  invocation="85"  systemtime_s="0.041581"`,
+			Expect: &RuntimeLine{
+				Cmd:       "zfs  send  -i  zroot/ezjail/synapse-12@zrepl_20200329_095518_000  zroot/ezjail/synapse-12@zrepl_20200329_102454_000",
+				TotalTime: secs("0.101598591"),
+				Systime:   secs("0.041581"),
+				Usertime:  secs("0"),
+				Error:     "",
+			},
+		},
+		{
+			Name:      "missing-level-field",
+			Input:     `[jobname][zfs.cmd]: command exited without error cmd="zfs list"`,
+			ExpectErr: "human formatter regex does not match",
+		},
+	}
+
+	for _, c := range tcs {
+		t.Run(c.Name, func(t *testing.T) {
+			l, err := parseHumanFormatterNodate(c.Input)
+			t.Logf("l=%v", l)
+			t.Logf("err=%T %v", err, err)
+			if (c.Expect != nil && c.ExpectErr != "") || (c.Expect == nil && c.ExpectErr == "") {
+				t.Fatal("bad test case", c)
+			}
+			if c.Expect != nil {
+				// A parse error must not go unnoticed on the success path.
+				require.NoError(t, err)
+				require.Equal(t, *c.Expect, l)
+			}
+			if c.ExpectErr != "" {
+				require.EqualError(t, err, c.ExpectErr)
+			}
+		})
+	}
+
+}
--- a/zfs/zfscmd/zfscmd_logging.go
+++ b/zfs/zfscmd/zfscmd_logging.go
@ -8,7 +8,8 @@ import (
 //
 // Pre-events logged with debug
 // Post-event without error logged with info
-// Post-events with error logged at error level
+// Post-events with error _also_ logged with info
+// (Not all errors we observe at this layer are actual errors in higher-level layers.)

 func startPreLogging(c *Cmd, now time.Time) {
 	c.log().Debug("starting command")
@ -35,6 +36,6 @@ func waitPostLogging(c *Cmd, err error, now time.Time) {
 	if err == nil {
 		log.Info("command exited without error")
 	} else {
-		log.WithError(err).Error("command exited with error")
+		log.WithError(err).Info("command exited with error")
 	}
 }