In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re


In [None]:
# Load the scraper output. ./parsed.json is the stdout of the scraper tool in
# this directory; lines=True makes pandas read one JSON record per line.
df = pd.read_json(
    "./parsed.json",
    lines=True,
)
df

In [None]:
def parse_ds(entity):
    """Return the part of ``entity`` that precedes the first ``@`` or ``#``.

    A string containing neither character is returned unchanged, e.g.
    ``"pool/fs@snap"`` -> ``"pool/fs"`` and ``"pool/fs"`` -> ``"pool/fs"``.
    """
    # Split at most once: only the text before the first delimiter matters.
    dataset, *_ = re.split(r"[@#]", entity, maxsplit=1)
    return dataset
    
def _single_dataset(arg, verb):
    """Return the dataset named by ``arg``; reject comma-separated batch arguments."""
    if "," in arg:
        raise Exception("cannot handle batch-" + verb)
    return parse_ds(arg)


def parse_cmd(row):
    """Classify one logged command line into an action label and a dataset.

    Parameters
    ----------
    row
        Object with a ``Cmd`` attribute holding the command line as a string
        (this function is applied to the rows of the scraped DataFrame).

    Returns
    -------
    dict
        ``{'action': "<binary>-<verb>", 'dataset': <str or None>}``.  For the
        ``zfs`` binary the verb may be refined (``send-dry``,
        ``send-feature-test``, ``receive-CLI-test``, ``list-single-dataset``,
        ``list-all-filesystems``) and ``recv`` is normalised to ``receive``.
        ``dataset`` is ``None`` when the command is not a recognised ``zfs``
        verb or carries no dataset argument; ``zfs list`` without
        ``-r -d 1`` reports the placeholder ``"!ALL_POOLS!"``.

    Raises
    ------
    ValueError
        If the command line has fewer than two whitespace-separated tokens.
    Exception
        If ``release``, ``holds -H`` or ``destroy`` is given a
        comma-separated (batch) argument, which is not supported here.
    """
    cmd = row.Cmd
    # NOTE: tokens are split on whitespace, so a dataset name that itself
    # contains whitespace is broken into several tokens and mis-parsed.
    # Empty tokens produced by leading/trailing whitespace are discarded so
    # they cannot be mistaken for the binary or for the dataset argument.
    tokens = [tok for tok in re.split(r"\s+", cmd) if tok]
    if len(tokens) < 2:
        raise ValueError("expected '<binary> <verb> ...', got {!r}".format(cmd))
    binary, verb, *tail = tokens

    dataset = None
    if binary == "zfs":
        # Most verbs take the dataset as their last argument.
        last = tail[-1] if tail else None

        if verb == "send":
            if not tail:
                # A bare `zfs send` carries no dataset.
                verb = "send-feature-test"
            else:
                dataset = parse_ds(last)
                if "-n" in tail:
                    verb = "send-dry"
        elif verb in ("recv", "receive"):
            if tail:
                verb = "receive"
                dataset = parse_ds(last)
            else:
                verb = "receive-CLI-test"
        elif verb in ("get", "hold", "snapshot"):
            if tail:
                dataset = parse_ds(last)
        elif verb == "list":
            if "-r" in tail and "-d" in tail and "1" in tail:
                dataset = parse_ds(last)
                verb = "list-single-dataset"
            else:
                dataset = "!ALL_POOLS!"
                verb = "list-all-filesystems"
        elif verb == "bookmark":
            # The dataset is taken from the second-to-last argument.
            if len(tail) >= 2:
                dataset = parse_ds(tail[-2])
        elif verb in ("release", "destroy") or (verb == "holds" and "-H" in tail):
            if tail:
                dataset = _single_dataset(last, verb)

    return {'action': binary + "-" + verb, 'dataset': dataset}
    
    
# Run parse_cmd over every row (result_type='expand' spreads the returned dict
# into the columns "action" and "dataset") and put those columns next to the
# scraped ones.
res = pd.concat(
    [df, df.apply(parse_cmd, axis='columns', result_type='expand')],
    axis='columns',
)

# Store both parsed columns as categoricals.
res = res.astype({cat: 'category' for cat in ("action", "dataset")})

# Parse LogTime into a datetime column.
res["LogTimeUnix"] = pd.to_datetime(res["LogTime"])

In [None]:
# Whatever part of TotalTime is not covered by Usertime and Systime.
res["OtherTime"] = res["TotalTime"] - res["Usertime"] - res["Systime"]

# Long format: one row per command and timing column, with the column name in
# "variable" and its content in "value".
x = pd.melt(
    res,
    id_vars=["action", "dataset"],
    value_vars=["TotalTime", "OtherTime", "Usertime", "Systime"],
)
x

In [None]:
# Collect the distinct actions that appear in at least one melted row holding
# a NaN in any column.
print("commands with NaN values")
set(x.loc[x.isna().any(axis="columns"), "action"].values)

In [None]:
# Disabled row filter, kept from the original cell:
# (~x.action.astype('str').isin(["zfs-send", "zfs-recv"]))

# Sum TotalTime per (action, dataset) pair.
#  * [["value"]] restricts the sum to the numeric column, so the string column
#    "variable" is not summed (pandas >= 2.0 no longer drops such columns
#    silently).
#  * observed=False spells out the historical default for categorical keys:
#    every action/dataset combination appears, unobserved ones with a sum of 0.
# NOTE(review): groupby leaves out rows whose key is NaN, so commands for which
# parse_cmd found no dataset do not contribute here -- confirm this is intended.
totaltimes = (
    x[x.variable == "TotalTime"]
    .groupby(["action", "dataset"], observed=False)[["value"]]
    .sum()
    .reset_index()
)
display(totaltimes)

In [None]:
# Total time per action, smallest first.  Only "value" is summed: the
# categorical "dataset" column cannot be summed on pandas >= 2.0.
# observed=False keeps every action category in the result.
totaltimes_by_action = (
    totaltimes.groupby("action", observed=False)[["value"]]
    .sum()
    .sort_values(by="value")
)
totaltimes_by_action.plot.barh()

In [None]:
# Total time per dataset.  Only "value" is summed: the categorical "action"
# column cannot be summed on pandas >= 2.0.  observed=False keeps every
# dataset category in the plot.
totaltimes.groupby("dataset", observed=False)[["value"]].sum().plot.barh(fontsize=5)

In [None]:
# Action with the largest summed time.
most_expensive_action = totaltimes_by_action["value"].idxmax()
display(most_expensive_action)

# Break that action down per dataset, smallest first.  Only "value" is summed:
# the categorical "action" column cannot be summed on pandas >= 2.0.
# observed=False keeps every dataset category in the result.
most_expensive_action_by_dataset = (
    totaltimes[totaltimes.action == most_expensive_action]
    .groupby("dataset", observed=False)[["value"]]
    .sum()
    .sort_values(by="value")
)
most_expensive_action_by_dataset.plot.barh(rot=50, fontsize=5, figsize=(10, 20))
# Saves the current figure, i.e. the bar chart drawn on the line above.
plt.savefig('most-expensive-command.pdf')

In [None]:
# Display the combined frame built in the cells above (scraped columns plus
# action, dataset, LogTimeUnix and OtherTime).
res

In [None]:
# Disabled scratch experiments, kept for reference:
# %matplotlib notebook 

# res.index = res.LogTimeUnix

# resampled = res.pivot(columns='action', values='TotalTime').resample("1s").sum()
# resampled.cumsum().plot()
# res["BeginTime"] = res.LogTimeUnix.dt.total_seconds()
# holds = res[res.action == "zfs-holds"]
# sns.stripplot(x="LogTimeUnix", y="action", data=res)
# res["LogTimeUnix"].resample("20min").sum()
# res[res.action == "zfs-holds"].plot.scatter(x="LogTimeUnix", y="TotalTime")

#res[res.action == "zfs-holds"].pivot(columns='action', values=['TotalTime', 'Systime', "Usertime"]).resample("1s").sum().cumsum().plot()

# One row per LogTimeUnix and one column per (timing column, action) pair.
# Rows sharing the same timestamp and action are aggregated with pivot_table's
# default aggfunc (the mean); pairs without any row are NaN.
pivoted = res.reset_index(drop=True).pivot_table(
    index="LogTimeUnix",
    columns="action",
    values=['TotalTime', 'Systime', "Usertime"],
)
pivoted

In [None]:
# Cumulative TotalTime / Systime / Usertime of zfs-holds over time.
# fillna(0) first: cumsum() keeps NaN at every timestamp where the action did
# not run, and line plots leave gaps at NaN, so the curves would otherwise be
# mostly invisible.  The values at timestamps where the action did run are
# unchanged by the fill.
pivoted.fillna(0).cumsum()[
    [("TotalTime", "zfs-holds"), ("Systime", "zfs-holds"), ("Usertime", "zfs-holds")]
].plot()

In [None]:
# NOTE: this rebinds `pivoted` to a TotalTime-only pivot (one row per
# LogTimeUnix, one column per ("TotalTime", action) pair).
pivoted = res.reset_index(drop=True).pivot_table(values=['TotalTime'], index="LogTimeUnix", columns="action")

# A cell is non-NaN exactly when the action has a TotalTime at that timestamp,
# so the running count of invocations is the cumulative sum of notna().
# (isna() would instead count the timestamps at which the action did NOT run.)
# NOTE(review): rows sharing the same timestamp and action collapse into one
# pivot cell and are therefore counted once.
cum_invocation_counts_per_action = pivoted.notna().astype(int).cumsum()

In [None]:
# Display the frame computed in the previous cell.
cum_invocation_counts_per_action

In [None]:
# Plot the zfs-holds column of cum_invocation_counts_per_action next to the
# zfs-get column, which serves as the reference value.
cum_invocation_counts_per_action.loc[
    :, [("TotalTime", "zfs-holds"), ("TotalTime", "zfs-get")]
].plot()