From 17add553d3e459c182cc74fd1bbb60488461cf0b Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Sat, 1 Feb 2020 15:28:52 +0100
Subject: [PATCH] WIP runtime-controllable concurrency for replication

Changes done so far:

- signal to route concurrency request up to the stepQueue
  - pretty hacky, no status reporting yet
- stepQueue upsizing (confirmed that it works, no intermediary commit)

Stuck at: stepQueue downsizing

- idea was to have the stepQueue signal to the activated step that it should suspend
- ideally, we'd just kill everything associated with the step and track it as restartable
- the driver model doesn't allow for that though within an attempt
- would need to start a separate run for the step
- less perfect: tell the downsized steps to stop copying, but leave all zfs sends + rpc conns open
  - doesn't give back the resources that a step acquired before being selected as downsize victim (open zfs processes + TCP conns, some memory in the pipe)
  - looks weird to the user in ps aux
  + re-waking a step is easy: just tell it to proceed with copying
- (the impl would likely pass a check function down into Step.do and have the step check that function periodically. the suspend should be acknowledged, and stepQueue should only remove the step from the active queue _after_ that step has acknowledged that it is suspended)
---
 client/signal.go                              |  13 +-
 daemon/control.go                             |  12 ++
 daemon/daemon.go                              |  12 ++
 daemon/job/active.go                          |  36 ++++-
 daemon/job/job.go                             |   1 +
 daemon/job/passive.go                         |   2 +
 daemon/job/snapjob.go                         |   2 +
 daemon/prometheus.go                          |   3 +
 replication/driver/replication_driver.go      |  67 ++++++--
 replication/driver/replication_driver_test.go |   2 +-
 replication/driver/replication_stepqueue.go   | 152 ++++++++++++------
 .../driver/replication_stepqueue_test.go      |  12 +-
 replication/replication.go                    |   4 +-
 replication/report/replication_report.go      |   1 +
 14 files changed, 244 insertions(+), 75 deletions(-)

diff --git a/client/signal.go b/client/signal.go
index 472c7ed..f51d793 100644
--- a/client/signal.go
+++ b/client/signal.go
@@ -9,7 +9,7 @@ import (
 )
 
 var SignalCmd = &cli.Subcommand{
-    Use:   "signal [wakeup|reset] JOB",
+    Use:   "signal [wakeup|reset|set-concurrency] JOB [DATA]",
     Short: "wake up a job from wait state or abort its current invocation",
     Run: func(subcommand *cli.Subcommand, args []string) error {
         return runSignalCmd(subcommand.Config(), args)
@@ -17,8 +17,13 @@ var SignalCmd = &cli.Subcommand{
 }
 
 func runSignalCmd(config *config.Config, args []string) error {
-    if len(args) != 2 {
-        return errors.Errorf("Expected 2 arguments: [wakeup|reset] JOB")
+    if len(args) < 2 || len(args) > 3 {
+        return errors.Errorf("Expected 2 or 3 arguments: [wakeup|reset|set-concurrency] JOB [DATA]")
+    }
+
+    var data string
+    if len(args) == 3 {
+        data = args[2]
     }
 
     httpc, err := controlHttpClient(config.Global.Control.SockPath)
@@ -30,9 +35,11 @@ func runSignalCmd(config *config.Config, args []string) error {
         struct {
             Name string
             Op   string
+            Data string
         }{
             Name: args[1],
             Op:   args[0],
+            Data: data,
         },
         struct{}{},
     )
diff --git a/daemon/control.go b/daemon/control.go
index 5cb498d..04ce7fd 100644
--- a/daemon/control.go
+++ b/daemon/control.go
@@ -8,6 +8,7 @@ import (
     "io"
     "net"
     "net/http"
+    "strconv"
     "time"
 
     "github.com/pkg/errors"
@@ -47,6 +48,8 @@ func (j *controlJob) OwnedDatasetSubtreeRoot() (p *zfs.DatasetPath, ok bool) { return nil, false }
 
 func (j *controlJob) SenderConfig() *endpoint.SenderConfig { return nil }
 
+func (j *controlJob) SetConcurrency(concurrency int) error { return errors.Errorf("not supported") }
+
 var promControl struct {
     requestBegin    *prometheus.CounterVec
     requestFinished *prometheus.HistogramVec
@@ -126,6 +129,7 @@ func (j *controlJob) Run(ctx context.Context) {
             type reqT struct {
                 Name string
                 Op   string
+                Data string
             }
             var req reqT
             if decoder(&req) != nil {
@@ -138,6 +142,14 @@ func (j *controlJob) Run(ctx context.Context) {
                 err = j.jobs.wakeup(req.Name)
             case "reset":
                 err = j.jobs.reset(req.Name)
+            case "set-concurrency":
+                var concurrency int
+                concurrency, err = strconv.Atoi(req.Data)
+                if err != nil {
+                    // leave err set; it is reported to the client below
+                } else {
+                    err = j.jobs.setConcurrency(req.Name, concurrency)
+                }
             default:
                 err = fmt.Errorf("operation %q is invalid", req.Op)
             }
diff --git a/daemon/daemon.go b/daemon/daemon.go
index 265ddc4..246eb99 100644
--- a/daemon/daemon.go
+++ b/daemon/daemon.go
@@ -176,6 +176,18 @@ func (s *jobs) reset(job string) error {
     return wu()
 }
 
+func (s *jobs) setConcurrency(jobName string, concurrency int) error {
+    s.m.RLock()
+    defer s.m.RUnlock()
+
+    job, ok := s.jobs[jobName]
+    if !ok {
+        return errors.Errorf("Job %q does not exist", jobName)
+    }
+
+    return job.SetConcurrency(concurrency)
+}
+
 const (
     jobNamePrometheus = "_prometheus"
     jobNameControl    = "_control"
diff --git a/daemon/job/active.go b/daemon/job/active.go
index 4a784df..9994a0a 100644
--- a/daemon/job/active.go
+++ b/daemon/job/active.go
@@ -55,9 +55,12 @@ const (
 type activeSideTasks struct {
     state ActiveSideState
 
+    concurrency int
+
     // valid for state ActiveSideReplicating, ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
-    replicationReport driver.ReportFunc
-    replicationCancel context.CancelFunc
+    replicationReport         driver.ReportFunc
+    replicationCancel         context.CancelFunc
+    replicationSetConcurrency driver.SetConcurrencyFunc
 
     // valid for state ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
     prunerSender, prunerReceiver *pruner.Pruner
@@ -278,6 +281,8 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
         return nil, errors.Wrap(err, "invalid job name")
     }
 
+    j.tasks.concurrency = 1 // FIXME should come from job config
+
     switch v := configJob.(type) {
     case *config.PushJob:
         j.mode, err = modePushFromConfig(g, v, j.name) // shadow
@@ -375,6 +380,22 @@ func (j *ActiveSide) SenderConfig() *endpoint.SenderConfig {
     return push.senderConfig
 }
 
+func (j *ActiveSide) SetConcurrency(concurrency int) (err error) {
+    j.updateTasks(func(tasks *activeSideTasks) {
+        if tasks.replicationSetConcurrency != nil {
+            err = tasks.replicationSetConcurrency(concurrency) // no shadow
+            if err == nil {
+                tasks.concurrency = concurrency
+            }
+        } else {
+            // FIXME this is not great, we should always be able to set it
+            err = errors.Errorf("cannot set concurrency while not replicating")
+        }
+    })
+    return err
+}
+
 func (j *ActiveSide) Run(ctx context.Context) {
     log := GetLogger(ctx)
     ctx = logging.WithSubsystemLoggers(ctx, log)
@@ -436,11 +457,14 @@ func (j *ActiveSide) do(ctx context.Context) {
     ctx, repCancel := context.WithCancel(ctx)
     var repWait driver.WaitFunc
     j.updateTasks(func(tasks *activeSideTasks) {
-        // reset it
-        *tasks = activeSideTasks{}
+        // reset it (almost): carry the configured concurrency over to the new invocation
+        old := *tasks
+        *tasks = activeSideTasks{
+            concurrency: old.concurrency,
+        }
         tasks.replicationCancel = repCancel
-        tasks.replicationReport, repWait = replication.Do(
-            ctx, logic.NewPlanner(j.promRepStateSecs, j.promBytesReplicated, sender, receiver, j.mode.PlannerPolicy()),
+        tasks.replicationReport, repWait, tasks.replicationSetConcurrency = replication.Do(
+            ctx, tasks.concurrency, logic.NewPlanner(j.promRepStateSecs, j.promBytesReplicated, sender, receiver, j.mode.PlannerPolicy()),
         )
         tasks.state = ActiveSideReplicating
     })
diff --git a/daemon/job/job.go b/daemon/job/job.go
index cba89e1..c133090 100644
--- a/daemon/job/job.go
+++ b/daemon/job/job.go
@@ -40,6 +40,7 @@ type Job interface {
     // must return the root of that subtree as rfs and ok = true
     OwnedDatasetSubtreeRoot() (rfs *zfs.DatasetPath, ok bool)
     SenderConfig() *endpoint.SenderConfig
+    SetConcurrency(concurrency int) error
 }
 
 type Type string
diff --git a/daemon/job/passive.go b/daemon/job/passive.go
index ce3bafe..5189e0e 100644
--- a/daemon/job/passive.go
+++ b/daemon/job/passive.go
@@ -163,6 +163,8 @@ func (j *PassiveSide) SenderConfig() *endpoint.SenderConfig {
 
 func (*PassiveSide) RegisterMetrics(registerer prometheus.Registerer) {}
 
+func (*PassiveSide) SetConcurrency(concurrency int) error { return errors.Errorf("not supported") }
+
 func (j *PassiveSide) Run(ctx context.Context) {
 
     log := GetLogger(ctx)
diff --git a/daemon/job/snapjob.go b/daemon/job/snapjob.go
index ef554a4..222a5e3 100644
--- a/daemon/job/snapjob.go
+++ b/daemon/job/snapjob.go
@@ -117,6 +117,8 @@ outer:
     }
 }
 
+func (*SnapJob) SetConcurrency(concurrency int) error { return errors.Errorf("not supported") }
+
 // Adaptor that implements pruner.History around a pruner.Target.
 // The ReplicationCursor method is Get-op only and always returns
 // the filesystem's most recent version's GUID.
diff --git a/daemon/prometheus.go b/daemon/prometheus.go
index 7e43e8a..c159067 100644
--- a/daemon/prometheus.go
+++ b/daemon/prometheus.go
@@ -5,6 +5,7 @@ import (
     "net"
     "net/http"
 
+    "github.com/pkg/errors"
     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/client_golang/prometheus/promhttp"
 
@@ -51,6 +52,8 @@ func (j *prometheusJob) OwnedDatasetSubtreeRoot() (p *zfs.DatasetPath, ok bool)
 
 func (j *prometheusJob) SenderConfig() *endpoint.SenderConfig { return nil }
 
+func (j *prometheusJob) SetConcurrency(concurrency int) error { return errors.Errorf("not supported") }
+
 func (j *prometheusJob) RegisterMetrics(registerer prometheus.Registerer) {}
 
 func (j *prometheusJob) Run(ctx context.Context) {
diff --git a/replication/driver/replication_driver.go b/replication/driver/replication_driver.go
index a9c2ce7..8d2b314 100644
--- a/replication/driver/replication_driver.go
+++ b/replication/driver/replication_driver.go
@@ -89,6 +89,8 @@ type attempt struct {
     // if both are nil, it must be assumed that Planner.Plan is active
     planErr *timedError
     fss     []*fs
+
+    concurrency int
 }
 
 type timedError struct {
@@ -170,11 +172,12 @@ type step struct {
 
 type ReportFunc func() *report.Report
 type WaitFunc func(block bool) (done bool)
+type SetConcurrencyFunc func(concurrency int) error
 
 var maxAttempts = envconst.Int64("ZREPL_REPLICATION_MAX_ATTEMPTS", 3)
 var reconnectHardFailTimeout = envconst.Duration("ZREPL_REPLICATION_RECONNECT_HARD_FAIL_TIMEOUT", 10*time.Minute)
 
-func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
+func Do(ctx context.Context, initialConcurrency int, planner Planner) (ReportFunc, WaitFunc, SetConcurrencyFunc) {
     log := getLog(ctx)
     l := chainlock.New()
     run := &run{
@@ -182,6 +185,8 @@ func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
         startedAt: time.Now(),
     }
 
+    concurrencyChanges := make(chan concurrencyChange)
+
     done := make(chan struct{})
     go func() {
         defer close(done)
@@ -198,15 +203,21 @@ func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
             run.waitReconnect.SetZero()
             run.waitReconnectError = nil
 
+            prevConcurrency := initialConcurrency // FIXME default concurrency
+            if prev != nil {
+                prevConcurrency = prev.concurrency
+            }
+
             // do current attempt
             cur := &attempt{
-                l:         l,
-                startedAt: time.Now(),
-                planner:   planner,
+                l:           l,
+                startedAt:   time.Now(),
+                planner:     planner,
+                concurrency: prevConcurrency,
             }
             run.attempts = append(run.attempts, cur)
             run.l.DropWhile(func() {
-                cur.do(ctx, prev)
+                cur.do(ctx, prev, concurrencyChanges)
             })
             prev = cur
             if ctx.Err() != nil {
@@ -277,10 +288,25 @@ func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
         defer run.l.Lock().Unlock()
         return run.report()
     }
-    return report, wait
+    setConcurrency := func(concurrency int) (reterr error) {
+        var wg sync.WaitGroup
+        wg.Add(1)
+        concurrencyChanges <- concurrencyChange{concurrency, func(err error) {
+            defer wg.Done()
+            reterr = err
+        }}
+        wg.Wait()
+        return reterr
+    }
+    return report, wait, setConcurrency
 }
 
-func (a *attempt) do(ctx context.Context, prev *attempt) {
+type concurrencyChange struct {
+    value          int
+    resultCallback func(error)
+}
+
+func (a *attempt) do(ctx context.Context, prev *attempt, setConcurrency <-chan concurrencyChange) {
     pfss, err := a.planner.Plan(ctx)
     errTime := time.Now()
     defer a.l.Lock().Unlock()
@@ -354,8 +380,8 @@ func (a *attempt) do(ctx context.Context, prev *attempt) {
     }
     // invariant: prevs contains an entry for each unambiguous correspondence
 
-    stepQueue := newStepQueue()
-    defer stepQueue.Start(1)() // TODO parallel replication
+    stepQueue := newStepQueue(a.concurrency)
+    defer stepQueue.Start()()
     var fssesDone sync.WaitGroup
     for _, f := range a.fss {
         fssesDone.Add(1)
@@ -364,8 +390,27 @@ func (a *attempt) do(ctx context.Context, prev *attempt) {
             f.do(ctx, stepQueue, prevs[f])
         }(f)
     }
+    changeConcurrencyDone := make(chan struct{})
+    go func() {
+        for {
+            select {
+            case change := <-setConcurrency:
+                err := stepQueue.SetConcurrency(change.value)
+                go change.resultCallback(err)
+                if err == nil {
+                    a.l.Lock()
+                    a.concurrency = change.value
+                    a.l.Unlock()
+                }
+            case <-changeConcurrencyDone:
+                // not waiting for ctx.Done here; the fs goroutines are the main job
+                return
+            }
+        }
+    }()
     a.l.DropWhile(func() {
         fssesDone.Wait()
+        close(changeConcurrencyDone)
     })
     a.finishedAt = time.Now()
 }
@@ -461,7 +506,8 @@ func (fs *fs) do(ctx context.Context, pq *stepQueue, prev *fs) {
         // lock must not be held while executing step in order for reporting to work
         fs.l.DropWhile(func() {
             targetDate := s.step.TargetDate()
-            defer pq.WaitReady(fs, targetDate)()
+            stepDone := pq.WaitReady(fs, targetDate)
+            defer stepDone()
             err = s.step.Step(ctx) // no shadow
             errTime = time.Now()   // no shadow
         })
@@ -482,6 +528,7 @@ func (r *run) report() *report.Report {
         WaitReconnectSince: r.waitReconnect.begin,
         WaitReconnectUntil: r.waitReconnect.end,
         WaitReconnectError: r.waitReconnectError.IntoReportError(),
+        // Concurrency: r.concurrency, // FIXME no status reporting yet
     }
     for i := range report.Attempts {
         report.Attempts[i] = r.attempts[i].report()
diff --git a/replication/driver/replication_driver_test.go b/replication/driver/replication_driver_test.go
index abf39f1..3250a5c 100644
--- a/replication/driver/replication_driver_test.go
+++ b/replication/driver/replication_driver_test.go
@@ -151,7 +151,7 @@ func TestReplication(t *testing.T) {
     ctx := context.Background()
 
     mp := &mockPlanner{}
-    getReport, wait := Do(ctx, mp)
+    getReport, wait, _ := Do(ctx, 1, mp)
     begin := time.Now()
     fireAt := []time.Duration{
         // the following values are relative to the start
diff --git a/replication/driver/replication_stepqueue.go b/replication/driver/replication_stepqueue.go
index a6486c0..31ab068 100644
--- a/replication/driver/replication_stepqueue.go
+++ b/replication/driver/replication_stepqueue.go
@@ -2,6 +2,8 @@ package driver
 
 import (
     "container/heap"
+    "fmt"
+    "sync"
     "time"
 
     "github.com/zrepl/zrepl/util/chainlock"
@@ -11,51 +13,88 @@ type stepQueueRec struct {
     ident      interface{}
     targetDate time.Time
     wakeup     chan StepCompletedFunc
+    // set when the step becomes active; called by setConcurrencyLocked on
+    // downsize victims (WIP: suspend/acknowledge protocol not implemented yet)
+    cancelDueToConcurrencyDownsize func()
 }
 
 type stepQueue struct {
     stop chan struct{}
     reqs chan stepQueueRec
+
+    // l protects all members except the channels above
+    l           *chainlock.L
+    pendingCond *sync.Cond
+
+    pending *stepQueueHeap
+    active  *stepQueueHeap
+    // ident => queueItem, for tracking used idents in both pending and active
+    queueItems map[interface{}]*stepQueueHeapItem
+
+    // stopped is used for cancellation of the "wake" goroutine
+    stopped bool
+
+    concurrency int
 }
 
 type stepQueueHeapItem struct {
     idx int
-    req stepQueueRec
+    req *stepQueueRec
+}
+
+type stepQueueHeap struct {
+    items   []*stepQueueHeapItem
+    reverse bool // never change after pushing the first element
 }
-type stepQueueHeap []*stepQueueHeapItem
 
 func (h stepQueueHeap) Less(i, j int) bool {
-    return h[i].req.targetDate.Before(h[j].req.targetDate)
+    res := h.items[i].req.targetDate.Before(h.items[j].req.targetDate)
+    if h.reverse {
+        return !res
+    }
+    return res
 }
 
 func (h stepQueueHeap) Swap(i, j int) {
-    h[i], h[j] = h[j], h[i]
-    h[i].idx = i
-    h[j].idx = j
+    h.items[i], h.items[j] = h.items[j], h.items[i]
+    h.items[i].idx = i
+    h.items[j].idx = j
 }
 
 func (h stepQueueHeap) Len() int {
-    return len(h)
+    return len(h.items)
 }
 
 func (h *stepQueueHeap) Push(elem interface{}) {
     hitem := elem.(*stepQueueHeapItem)
     hitem.idx = h.Len()
-    *h = append(*h, hitem)
+    h.items = append(h.items, hitem)
 }
 
 func (h *stepQueueHeap) Pop() interface{} {
-    elem := (*h)[h.Len()-1]
+    elem := h.items[h.Len()-1]
     elem.idx = -1
-    *h = (*h)[:h.Len()-1]
+    h.items = h.items[:h.Len()-1]
     return elem
 }
 
 // returned stepQueue must be closed with method Close
-func newStepQueue() *stepQueue {
+func newStepQueue(concurrency int) *stepQueue {
+    l := chainlock.New()
     q := &stepQueue{
-        stop: make(chan struct{}),
-        reqs: make(chan stepQueueRec),
+        stop:        make(chan struct{}),
+        reqs:        make(chan stepQueueRec),
+        l:           l,
+        pendingCond: l.NewCond(),
+        // priority queues: pending is a min-heap (earliest targetDate first),
+        // active a max-heap so that the latest-targetDate step is the first downsize victim
+        pending: &stepQueueHeap{reverse: false},
+        active:  &stepQueueHeap{reverse: true},
+        // ident => queueItem
+        queueItems: make(map[interface{}]*stepQueueHeapItem),
+        // stopped is used for cancellation of the "wake" goroutine
+        stopped: false,
+    }
+    // no locking needed here: q is not shared yet
+    err := q.setConcurrencyLocked(concurrency)
+    if err != nil {
+        panic(err)
     }
     return q
 }
@@ -65,25 +104,12 @@ func newStepQueue() *stepQueue {
 //
 // No WaitReady calls must be active at the time done is called
 // The behavior of calling WaitReady after done was called is undefined
-func (q *stepQueue) Start(concurrency int) (done func()) {
-    if concurrency < 1 {
-        panic("concurrency must be >= 1")
-    }
-    // l protects pending and queueItems
-    l := chainlock.New()
-    pendingCond := l.NewCond()
-    // priority queue
-    pending := &stepQueueHeap{}
-    // ident => queueItem
-    queueItems := make(map[interface{}]*stepQueueHeapItem)
-    // stopped is used for cancellation of "wake" goroutine
-    stopped := false
-    active := 0
+func (q *stepQueue) Start() (done func()) {
     go func() { // "stopper" goroutine
         <-q.stop
-        defer l.Lock().Unlock()
-        stopped = true
-        pendingCond.Broadcast()
+        defer q.l.Lock().Unlock()
+        q.stopped = true
+        q.pendingCond.Broadcast()
     }()
     go func() { // "reqs" goroutine
         for {
@@ -97,41 +123,52 @@ func (q *stepQueue) Start(concurrency int) (done func()) {
                 }
             case req := <-q.reqs:
                 func() {
-                    defer l.Lock().Unlock()
-                    if _, ok := queueItems[req.ident]; ok {
+                    defer q.l.Lock().Unlock()
+                    if _, ok := q.queueItems[req.ident]; ok {
                         panic("WaitReady must not be called twice for the same ident")
                     }
                     qitem := &stepQueueHeapItem{
-                        req: req,
+                        req: &req,
                     }
-                    queueItems[req.ident] = qitem
-                    heap.Push(pending, qitem)
-                    pendingCond.Broadcast()
+                    q.queueItems[req.ident] = qitem
+                    heap.Push(q.pending, qitem)
+                    q.pendingCond.Broadcast()
                 }()
             }
         }
     }()
     go func() { // "wake" goroutine
-        defer l.Lock().Unlock()
+        defer q.l.Lock().Unlock()
         for {
-            for !stopped && (active >= concurrency || pending.Len() == 0) {
-                pendingCond.Wait()
+            for !q.stopped && (q.active.Len() >= q.concurrency || q.pending.Len() == 0) {
+                q.pendingCond.Wait()
             }
-            if stopped {
+            if q.stopped {
                 return
             }
-            if pending.Len() <= 0 {
+            if q.pending.Len() <= 0 {
                 return
             }
-            active++
-            next := heap.Pop(pending).(*stepQueueHeapItem).req
-            delete(queueItems, next.ident)
-            next.wakeup <- func() {
-                defer l.Lock().Unlock()
-                active--
-                pendingCond.Broadcast()
+            // move the highest-priority pending step into the active heap;
+            // it stays in queueItems until it has completed
+            next := heap.Pop(q.pending).(*stepQueueHeapItem)
+
+            // WIP placeholder: the suspend/acknowledge protocol is not
+            // implemented yet, see "Stuck at" in the commit message
+            next.req.cancelDueToConcurrencyDownsize = func() {}
+
+            heap.Push(q.active, next)
+
+            next.req.wakeup <- func() {
+                defer q.l.Lock().Unlock()
+                // step completed: free its active slot and its ident
+                // FIXME WIP: assumes next is still in the active heap, which
+                // does not hold if it was re-queued by a downsize
+                heap.Remove(q.active, next.idx)
+                delete(q.queueItems, next.req.ident)
+                q.pendingCond.Broadcast()
             }
         }
     }()
@@ -161,3 +198,24 @@ func (q *stepQueue) WaitReady(ident interface{}, targetDate time.Time) StepCompletedFunc {
     }
     return q.sendAndWaitForWakeup(ident, targetDate)
 }
+
+// caller must hold lock
+func (q *stepQueue) setConcurrencyLocked(newConcurrency int) error {
+    if newConcurrency < 1 {
+        return fmt.Errorf("concurrency must be >= 1 but requested %v", newConcurrency)
+    }
+    q.concurrency = newConcurrency
+    q.pendingCond.Broadcast() // wake up waiters who could make progress
+
+    for q.active.Len() > q.concurrency {
+        item := heap.Pop(q.active).(*stepQueueHeapItem)
+        item.req.cancelDueToConcurrencyDownsize()
+        heap.Push(q.pending, item)
+    }
+    return nil
+}
+
+func (q *stepQueue) SetConcurrency(newConcurrency int) error {
+    defer q.l.Lock().Unlock()
+    return q.setConcurrencyLocked(newConcurrency)
+}
diff --git a/replication/driver/replication_stepqueue_test.go b/replication/driver/replication_stepqueue_test.go
index 73f877a..c00bc59 100644
--- a/replication/driver/replication_stepqueue_test.go
+++ b/replication/driver/replication_stepqueue_test.go
@@ -17,7 +17,7 @@ import (
 // (relies on scheduler responsivity of < 500ms)
 func TestPqNotconcurrent(t *testing.T) {
     var ctr uint32
-    q := newStepQueue()
+    q := newStepQueue(1)
     var wg sync.WaitGroup
     wg.Add(4)
     go func() {
@@ -29,7 +29,7 @@ func TestPqNotconcurrent(t *testing.T) {
     }()
 
     // give goroutine "1" 500ms to enter queue, get the active slot and enter time.Sleep
-    defer q.Start(1)()
+    defer q.Start()()
     time.Sleep(500 * time.Millisecond)
 
     // while "1" is still running, queue in "2", "3" and "4"
@@ -77,8 +77,9 @@ func (r record) String() string {
 // Hence, perform some statistics on the wakeup times and assert that the mean wakeup
 // times for each step are close together.
 func TestPqConcurrent(t *testing.T) {
-
-    q := newStepQueue()
+
+    concurrency := 5
+    q := newStepQueue(concurrency)
     var wg sync.WaitGroup
     filesystems := 100
     stepsPerFS := 20
@@ -104,8 +105,7 @@ func TestPqConcurrent(t *testing.T) {
             records <- recs
         }(fs)
     }
-    concurrency := 5
-    defer q.Start(concurrency)()
+    defer q.Start()()
     wg.Wait()
     close(records)
     t.Logf("loop done")
diff --git a/replication/replication.go b/replication/replication.go
index 7f9e35b..bd9a7f2 100644
--- a/replication/replication.go
+++ b/replication/replication.go
@@ -8,6 +8,6 @@ import (
     "github.com/zrepl/zrepl/replication/driver"
 )
 
-func Do(ctx context.Context, planner driver.Planner) (driver.ReportFunc, driver.WaitFunc) {
-    return driver.Do(ctx, planner)
+func Do(ctx context.Context, initialConcurrency int, planner driver.Planner) (driver.ReportFunc, driver.WaitFunc, driver.SetConcurrencyFunc) {
+    return driver.Do(ctx, initialConcurrency, planner)
 }
diff --git a/replication/report/replication_report.go b/replication/report/replication_report.go
index 5a64a3a..3643bc2 100644
--- a/replication/report/replication_report.go
+++ b/replication/report/replication_report.go
@@ -10,6 +10,7 @@ type Report struct {
     WaitReconnectSince, WaitReconnectUntil time.Time
     WaitReconnectError                     *TimedError
     Attempts                               []*AttemptReport
+    Concurrency                            int
 }
 
 var _, _ = json.Marshal(&Report{})
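
Postscript: one possible shape for the suspend/acknowledge protocol the
commit message is stuck on. This is a standalone sketch, not part of the
patch — suspendController, RequestSuspend, and CheckSuspend are hypothetical
names; in the real code the check function would be passed down into Step.do
and RequestSuspend would be wired into the stepQueue's downsize path.

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // suspendController would be handed down into Step.do.
    // Single suspend/resume cycle only, for brevity; a real implementation
    // would reset state between cycles and guard against double close.
    type suspendController struct {
        mu        sync.Mutex
        requested bool
        acked     chan struct{} // closed once the step has acknowledged suspension
        resume    chan struct{} // closed when the step may proceed with copying
    }

    func newSuspendController() *suspendController {
        return &suspendController{
            acked:  make(chan struct{}),
            resume: make(chan struct{}),
        }
    }

    // RequestSuspend is what the stepQueue would call on a downsize victim.
    // The returned channel is closed once the step has acknowledged; only
    // then would the queue remove the step from the active heap.
    func (c *suspendController) RequestSuspend() <-chan struct{} {
        c.mu.Lock()
        defer c.mu.Unlock()
        c.requested = true
        return c.acked
    }

    // CheckSuspend is the "check function" that Step.do would poll
    // periodically, e.g. between copied chunks. It blocks while suspended.
    func (c *suspendController) CheckSuspend() {
        c.mu.Lock()
        requested := c.requested
        c.requested = false // single-shot: don't re-acknowledge after resuming
        c.mu.Unlock()
        if !requested {
            return
        }
        close(c.acked) // acknowledge: the queue may now free the slot
        <-c.resume     // block until re-woken
    }

    func main() {
        c := newSuspendController()
        stepDone := make(chan struct{})
        go func() { // stand-in for a step's copy loop
            defer close(stepDone)
            for i := 0; i < 5; i++ {
                time.Sleep(50 * time.Millisecond) // copy one chunk
                c.CheckSuspend()
            }
        }()
        <-c.RequestSuspend() // returns only after the step has acknowledged
        fmt.Println("step acknowledged suspension; active slot can be reused")
        close(c.resume) // re-waking is easy: tell it to proceed with copying
        <-stepDone
    }

The property this buys is exactly the one the commit message asks for: the
queue treats an active slot as free only after the <-acked receive, i.e.
after the step has verifiably stopped copying, and re-waking is a single
close(resume). It does not solve the resource-retention problem (the zfs
send processes and TCP conns of a suspended step stay open).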