[#307] add package trace, integrate it with logging, and adopt it throughout zrepl
package trace:

- introduce the concept of tasks and spans, tracked as a linked list within ctx
  - see the package-level docs for an overview of the concepts
- **main feature 1**: unique stack of task and span IDs
  - makes it easy to follow a series of log entries in concurrent code
- **main feature 2**: ability to produce a chrome://tracing-compatible trace file
  - either via an env variable or a `zrepl pprof` subcommand
  - this is not a CPU profile, we already have go pprof for that
  - but it is very useful to visually inspect where the replication / snapshotter / pruner spends its time (fixes #307)

usage in package daemon/logging:

- goal: every log entry should have a trace field with the ID stack from package trace
- make `logging.GetLogger(ctx, Subsys)` the authoritative `logger.Logger` factory function
  - the context carries a linked list of injected fields which `logging.GetLogger` adds to the logger it returns
  - `logging.GetLogger` also uses package `trace` to get the task-and-span stack and injects it into the returned logger's fields
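The sketch below shows how these pieces are meant to compose. It is illustrative, not zrepl code: `trace.WithTask`, `trace.WithSpanFromStackUpdateCtx`, and `logging.GetLogger` are the helpers this commit introduces, while the task name and the subsystem constant are assumed placeholders.

```go
// Minimal sketch (not actual zrepl code) of how tasks, spans, and
// logging.GetLogger compose after this commit.
package example

import (
	"context"

	"github.com/zrepl/zrepl/daemon/logging"
	"github.com/zrepl/zrepl/daemon/logging/trace"
)

func run(ctx context.Context) {
	// a task roots a fresh ID stack, e.g. one per job or per goroutine;
	// the task name "example-job" is a placeholder
	ctx, endTask := trace.WithTask(ctx, "example-job")
	defer endTask()

	work(ctx)
}

func work(ctx context.Context) {
	// open a span, update ctx in place, and end the span when the function
	// returns; the helper's name suggests the span annotation is derived
	// from the call stack
	defer trace.WithSpanFromStackUpdateCtx(&ctx)()

	// the returned logger carries the task-and-span ID stack as a log
	// field; the subsystem constant is assumed for illustration
	log := logging.GetLogger(ctx, logging.SubsysEndpoint)
	log.Debug("this entry can be correlated across concurrent tasks")
}
```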
@@ -10,6 +10,7 @@ import (
     "sync"
 
     "github.com/pkg/errors"
+    "github.com/zrepl/zrepl/daemon/logging/trace"
 
     "github.com/zrepl/zrepl/replication/logic/pdu"
     "github.com/zrepl/zrepl/util/chainedio"
@@ -73,6 +74,8 @@ func (s *Sender) filterCheckFS(fs string) (*zfs.DatasetPath, error) {
 }
 
 func (s *Sender) ListFilesystems(ctx context.Context, r *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     fss, err := zfs.ZFSListMapping(ctx, s.FSFilter)
     if err != nil {
         return nil, err
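This deferred one-liner is the pattern every endpoint method below adopts: passing `&ctx` lets the helper replace the function's `ctx` variable with a child context carrying the new span, and the trailing `()` immediately produces the end-of-span callback that `defer` runs on return. A roughly equivalent longhand, assuming a hypothetical `trace.WithSpan` variant that takes an explicit span name (the stack-based helper presumably derives the name from its caller):

```go
// hypothetical longhand equivalent of
// `defer trace.WithSpanFromStackUpdateCtx(&ctx)()`; a trace.WithSpan
// that takes an explicit name is assumed here for illustration
func listFilesystems(ctx context.Context) error {
	ctx, endSpan := trace.WithSpan(ctx, "Sender.ListFilesystems")
	defer endSpan() // span ends when the function returns

	// ... method body runs with the span-carrying ctx ...
	return nil
}
```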
@@ -95,6 +98,8 @@ func (s *Sender) ListFilesystems(ctx context.Context, r *pdu.ListFilesystemReq)
 }
 
 func (s *Sender) ListFilesystemVersions(ctx context.Context, r *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     lp, err := s.filterCheckFS(r.GetFilesystem())
     if err != nil {
         return nil, err
@@ -113,6 +118,7 @@ func (s *Sender) ListFilesystemVersions(ctx context.Context, r *pdu.ListFilesyst
 }
 
 func (p *Sender) HintMostRecentCommonAncestor(ctx context.Context, r *pdu.HintMostRecentCommonAncestorReq) (*pdu.HintMostRecentCommonAncestorRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
 
     fsp, err := p.filterCheckFS(r.GetFilesystem())
     if err != nil {
@@ -245,6 +251,7 @@ func sendArgsFromPDUAndValidateExistsAndGetVersion(ctx context.Context, fs strin
 }
 
 func (s *Sender) Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, io.ReadCloser, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
 
     _, err := s.filterCheckFS(r.Filesystem)
     if err != nil {
@@ -350,6 +357,7 @@ func (s *Sender) Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, io.Rea
 }
 
 func (p *Sender) SendCompleted(ctx context.Context, r *pdu.SendCompletedReq) (*pdu.SendCompletedRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
 
     orig := r.GetOriginalReq() // may be nil, always use proto getters
     fsp, err := p.filterCheckFS(orig.GetFilesystem())
@@ -371,27 +379,30 @@ func (p *Sender) SendCompleted(ctx context.Context, r *pdu.SendCompletedReq) (*p
         return nil, errors.Wrap(err, "validate `to` exists")
     }
 
-    log := getLogger(ctx).WithField("to_guid", to.Guid).
-        WithField("fs", fs).
-        WithField("to", to.RelName)
-    if from != nil {
-        log = log.WithField("from", from.RelName).WithField("from_guid", from.Guid)
+    log := func(ctx context.Context) Logger {
+        log := getLogger(ctx).WithField("to_guid", to.Guid).
+            WithField("fs", fs).
+            WithField("to", to.RelName)
+        if from != nil {
+            log = log.WithField("from", from.RelName).WithField("from_guid", from.Guid)
+        }
+        return log
     }
 
-    log.Debug("move replication cursor to most recent common version")
+    log(ctx).Debug("move replication cursor to most recent common version")
     destroyedCursors, err := MoveReplicationCursor(ctx, fs, to, p.jobId)
     if err != nil {
         if err == zfs.ErrBookmarkCloningNotSupported {
-            log.Debug("not setting replication cursor, bookmark cloning not supported")
+            log(ctx).Debug("not setting replication cursor, bookmark cloning not supported")
         } else {
             msg := "cannot move replication cursor, keeping hold on `to` until successful"
-            log.WithError(err).Error(msg)
+            log(ctx).WithError(err).Error(msg)
             err = errors.Wrap(err, msg)
             // it is correct to not release the hold if we can't move the cursor!
             return &pdu.SendCompletedRes{}, err
         }
     } else {
-        log.Info("successfully moved replication cursor")
+        log(ctx).Info("successfully moved replication cursor")
     }
 
     // kick off releasing of step holds / bookmarks
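The hunk above is the subtle half of the logging integration: `log` changes from a logger value into a closure that takes ctx as a parameter. The reason, implied rather than stated by the commit message, is that the trace-ID field is injected from the context, so a logger captured once would bake in the request's span stack, while the goroutines in the next hunk fork tasks of their own. A minimal sketch of the idiom, reusing identifiers from the surrounding diff (`getLogger`, `Logger`, `fs`); the task name is illustrative:

```go
// rebuild the logger from whatever ctx is current, so its trace field
// reflects that ctx's task/span ID stack
log := func(ctx context.Context) Logger {
	return getLogger(ctx).WithField("fs", fs)
}

log(ctx).Debug("logged under the request's ID stack")

go func() {
	ctx, endTask := trace.WithTask(ctx, "background-work") // name illustrative
	defer endTask()
	log(ctx).Info("same fields, but this goroutine's own ID stack")
}()
```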
@@ -401,21 +412,27 @@ func (p *Sender) SendCompleted(ctx context.Context, r *pdu.SendCompletedReq) (*p
     wg.Add(2)
     go func() {
         defer wg.Done()
-        log.Debug("release step-hold of or step-bookmark on `to`")
+        ctx, endTask := trace.WithTask(ctx, "release-step-hold-to")
+        defer endTask()
+
+        log(ctx).Debug("release step-hold of or step-bookmark on `to`")
         err = ReleaseStep(ctx, fs, to, p.jobId)
         if err != nil {
-            log.WithError(err).Error("cannot release step-holds on or destroy step-bookmark of `to`")
+            log(ctx).WithError(err).Error("cannot release step-holds on or destroy step-bookmark of `to`")
         } else {
-            log.Info("successfully released step-holds on or destroyed step-bookmark of `to`")
+            log(ctx).Info("successfully released step-holds on or destroyed step-bookmark of `to`")
         }
 
     }()
     go func() {
         defer wg.Done()
+        ctx, endTask := trace.WithTask(ctx, "release-step-hold-from")
+        defer endTask()
+
         if from == nil {
             return
         }
-        log.Debug("release step-hold of or step-bookmark on `from`")
+        log(ctx).Debug("release step-hold of or step-bookmark on `from`")
         err := ReleaseStep(ctx, fs, *from, p.jobId)
         if err != nil {
             if dne, ok := err.(*zfs.DatasetDoesNotExist); ok {
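Note that both goroutines above call `trace.WithTask` instead of opening another span. Under the task/span model from the commit message, spans form a stack within one task, and two concurrent goroutines cannot both push onto their parent's stack, so each forked goroutine roots its own task. An illustrative contrast (not zrepl code; task name assumed):

```go
// sequential code nests spans; forked concurrency roots a fresh task
func parent(ctx context.Context) {
	defer trace.WithSpanFromStackUpdateCtx(&ctx)() // fine: sequential nesting

	go func() {
		// a concurrent goroutine gets its own task, and hence its own
		// ID stack; the task name is illustrative
		ctx, endTask := trace.WithTask(ctx, "concurrent-child")
		defer endTask()

		defer trace.WithSpanFromStackUpdateCtx(&ctx)() // spans nest under the new task
	}()
	// (synchronization with the goroutine elided for brevity)
}
```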
@@ -424,15 +441,15 @@ func (p *Sender) SendCompleted(ctx context.Context, r *pdu.SendCompletedReq) (*p
                 // In that case, nonexistence of `from` is not an error, otherwise it is.
                 for _, c := range destroyedCursors {
                     if c.GetFullPath() == dne.Path {
-                        log.Info("`from` was a replication cursor and has already been destroyed")
+                        log(ctx).Info("`from` was a replication cursor and has already been destroyed")
                         return
                     }
                 }
                 // fallthrough
             }
-            log.WithError(err).Error("cannot release step-holds on or destroy step-bookmark of `from`")
+            log(ctx).WithError(err).Error("cannot release step-holds on or destroy step-bookmark of `from`")
         } else {
-            log.Info("successfully released step-holds on or destroyed step-bookmark of `from`")
+            log(ctx).Info("successfully released step-holds on or destroyed step-bookmark of `from`")
         }
     }()
     wg.Wait()
@@ -441,6 +458,8 @@ func (p *Sender) SendCompleted(ctx context.Context, r *pdu.SendCompletedReq) (*p
 }
 
 func (p *Sender) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     dp, err := p.filterCheckFS(req.Filesystem)
     if err != nil {
         return nil, err
@@ -449,6 +468,8 @@ func (p *Sender) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshots
 }
 
 func (p *Sender) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     res := pdu.PingRes{
         Echo: req.GetMessage(),
     }
@@ -456,14 +477,20 @@ func (p *Sender) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, erro
 }
 
 func (p *Sender) PingDataconn(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     return p.Ping(ctx, req)
 }
 
 func (p *Sender) WaitForConnectivity(ctx context.Context) error {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     return nil
 }
 
 func (p *Sender) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     dp, err := p.filterCheckFS(req.Filesystem)
     if err != nil {
         return nil, err
@@ -594,6 +621,8 @@ func (f subroot) MapToLocal(fs string) (*zfs.DatasetPath, error) {
 }
 
 func (s *Receiver) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     root := s.clientRootFromCtx(ctx)
     filtered, err := zfs.ZFSListMapping(ctx, subroot{root})
     if err != nil {
@@ -644,6 +673,8 @@ func (s *Receiver) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemR
 }
 
 func (s *Receiver) ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     root := s.clientRootFromCtx(ctx)
     lp, err := subroot{root}.MapToLocal(req.GetFilesystem())
     if err != nil {
@@ -665,6 +696,8 @@ func (s *Receiver) ListFilesystemVersions(ctx context.Context, req *pdu.ListFile
 }
 
 func (s *Receiver) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     res := pdu.PingRes{
         Echo: req.GetMessage(),
     }
@@ -672,24 +705,30 @@ func (s *Receiver) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, er
 }
 
 func (s *Receiver) PingDataconn(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
     return s.Ping(ctx, req)
 }
 
 func (s *Receiver) WaitForConnectivity(ctx context.Context) error {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
     return nil
 }
 
-func (s *Receiver) ReplicationCursor(context.Context, *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
+func (s *Receiver) ReplicationCursor(ctx context.Context, _ *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
     return nil, fmt.Errorf("ReplicationCursor not implemented for Receiver")
 }
 
 func (s *Receiver) Send(ctx context.Context, req *pdu.SendReq) (*pdu.SendRes, io.ReadCloser, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
     return nil, nil, fmt.Errorf("receiver does not implement Send()")
 }
 
 var maxConcurrentZFSRecvSemaphore = semaphore.New(envconst.Int64("ZREPL_ENDPOINT_MAX_CONCURRENT_RECV", 10))
 
 func (s *Receiver) Receive(ctx context.Context, req *pdu.ReceiveReq, receive io.ReadCloser) (*pdu.ReceiveRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     getLogger(ctx).Debug("incoming Receive")
     defer receive.Close()
 
@@ -918,6 +957,8 @@ func (s *Receiver) Receive(ctx context.Context, req *pdu.ReceiveReq, receive io.
 }
 
 func (s *Receiver) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     root := s.clientRootFromCtx(ctx)
     lp, err := subroot{root}.MapToLocal(req.Filesystem)
     if err != nil {
@@ -927,6 +968,7 @@ func (s *Receiver) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapsho
 }
 
 func (p *Receiver) HintMostRecentCommonAncestor(ctx context.Context, r *pdu.HintMostRecentCommonAncestorReq) (*pdu.HintMostRecentCommonAncestorRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
     // we don't move last-received-hold as part of this hint
     // because that wouldn't give us any benefit wrt resumability.
     //
@@ -935,7 +977,9 @@ func (p *Receiver) HintMostRecentCommonAncestor(ctx context.Context, r *pdu.Hint
     return &pdu.HintMostRecentCommonAncestorRes{}, nil
 }
 
-func (p *Receiver) SendCompleted(context.Context, *pdu.SendCompletedReq) (*pdu.SendCompletedRes, error) {
+func (p *Receiver) SendCompleted(ctx context.Context, _ *pdu.SendCompletedReq) (*pdu.SendCompletedRes, error) {
+    defer trace.WithSpanFromStackUpdateCtx(&ctx)()
+
     return &pdu.SendCompletedRes{}, nil
 }
 
@@ -957,7 +1001,7 @@ func doDestroySnapshots(ctx context.Context, lp *zfs.DatasetPath, snaps []*pdu.F
             ErrOut: &errs[i],
         }
     }
-    zfs.ZFSDestroyFilesystemVersions(reqs)
+    zfs.ZFSDestroyFilesystemVersions(ctx, reqs)
     for i := range reqs {
         if errs[i] != nil {
             if de, ok := errs[i].(*zfs.DestroySnapshotsError); ok && len(de.Reason) == 1 {