Mirror of https://github.com/zrepl/zrepl.git, synced 2025-08-13 16:57:04 +02:00
hotfix: limit concurrency of zfs send & recv commands
ATM, the replication logic sends all dry-run requests in parallel, which might overwhelm the ZFS pool on the sending side. Since we use rpc/dataconn for dry sends, this also opens one TCP connection per dry-run request.

Use a semaphore to limit the degree of concurrency where we know it is a problem ATM. As indicated by the comments, the cleaner solution would involve some kind of 'resource exhaustion' error code.

refs #161
refs #164
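The fix follows the usual bounded fan-out pattern: each size-estimate goroutine must hold a semaphore slot before issuing its dry-run send, so at most N requests (and hence at most N rpc/dataconn TCP connections) are in flight at a time. A minimal, self-contained sketch of that pattern, using golang.org/x/sync/semaphore and a made-up dryRunSizeEstimate stand-in instead of zrepl's internal util/semaphore package:

package main

import (
    "context"
    "fmt"
    "sync"
    "time"

    "golang.org/x/sync/semaphore"
)

// dryRunSizeEstimate is a stand-in for the per-step dry-run send request.
func dryRunSizeEstimate(ctx context.Context, fs string) (int64, error) {
    select {
    case <-time.After(100 * time.Millisecond): // pretend to talk to the sender
        return 1 << 20, nil
    case <-ctx.Done():
        return 0, ctx.Err()
    }
}

func main() {
    ctx := context.Background()
    filesystems := []string{"tank/a", "tank/b", "tank/c", "tank/d", "tank/e"}

    // At most 4 dry-run requests in flight, mirroring the default of
    // ZREPL_REPLICATION_MAX_CONCURRENT_SIZE_ESTIMATE.
    sem := semaphore.NewWeighted(4)

    var wg sync.WaitGroup
    for _, fs := range filesystems {
        wg.Add(1)
        go func(fs string) {
            defer wg.Done()
            if err := sem.Acquire(ctx, 1); err != nil {
                return // context cancelled while waiting for a slot
            }
            defer sem.Release(1)
            n, err := dryRunSizeEstimate(ctx, fs)
            if err != nil {
                fmt.Println(fs, "error:", err)
                return
            }
            fmt.Println(fs, "estimated bytes:", n)
        }(fs)
    }
    wg.Wait()
}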
@@ -14,6 +14,8 @@ import (
     "github.com/zrepl/zrepl/replication/logic/pdu"
     "github.com/zrepl/zrepl/replication/report"
     "github.com/zrepl/zrepl/util/bytecounter"
+    "github.com/zrepl/zrepl/util/envconst"
+    "github.com/zrepl/zrepl/util/semaphore"
     "github.com/zrepl/zrepl/zfs"
 )
 
@@ -110,6 +112,8 @@ type Filesystem struct {
     Path                string             // compat
     receiverFS          *pdu.Filesystem
     promBytesReplicated prometheus.Counter // compat
+
+    sizeEstimateRequestSem *semaphore.S
 }
 
 func (f *Filesystem) EqualToPreviousAttempt(other driver.FS) bool {
@@ -234,6 +238,8 @@ func (p *Planner) doPlanning(ctx context.Context) ([]*Filesystem, error) {
     }
     rfss := rlfssres.GetFilesystems()
 
+    sizeEstimateRequestSem := semaphore.New(envconst.Int64("ZREPL_REPLICATION_MAX_CONCURRENT_SIZE_ESTIMATE", 4))
+
     q := make([]*Filesystem, 0, len(sfss))
     for _, fs := range sfss {
 
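The concurrency limit is read once at planning time via envconst.Int64, with 4 as the default. The util/envconst implementation is not part of this diff; as an assumption only, an Int64(key, default) helper of this shape could look roughly like:

// Minimal sketch of an envconst.Int64-style lookup; not zrepl's actual
// util/envconst implementation, which this diff does not show.
package envconst

import (
    "os"
    "strconv"
)

// Int64 returns the value of the environment variable `key` parsed as int64,
// or `def` if the variable is unset or cannot be parsed.
func Int64(key string, def int64) int64 {
    s, ok := os.LookupEnv(key)
    if !ok {
        return def
    }
    v, err := strconv.ParseInt(s, 10, 64)
    if err != nil {
        return def
    }
    return v
}

With the default of 4, setting ZREPL_REPLICATION_MAX_CONCURRENT_SIZE_ESTIMATE=8 in the daemon's environment would allow twice as many concurrent dry-run size-estimate requests.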
@@ -247,11 +253,12 @@ func (p *Planner) doPlanning(ctx context.Context) ([]*Filesystem, error) {
         ctr := p.promBytesReplicated.WithLabelValues(fs.Path)
 
         q = append(q, &Filesystem{
-            sender:              p.sender,
-            receiver:            p.receiver,
-            Path:                fs.Path,
-            receiverFS:          receiverFS,
-            promBytesReplicated: ctr,
+            sender:                 p.sender,
+            receiver:               p.receiver,
+            Path:                   fs.Path,
+            receiverFS:             receiverFS,
+            promBytesReplicated:    ctr,
+            sizeEstimateRequestSem: sizeEstimateRequestSem,
         })
     }
 
@@ -336,7 +343,17 @@ func (fs *Filesystem) doPlanning(ctx context.Context) ([]*Step, error) {
         wg.Add(1)
         go func(step *Step) {
             defer wg.Done()
-            err := step.updateSizeEstimate(fanOutCtx)
+
+            // TODO instead of the semaphore, rely on resource-exhaustion signaled by the remote endpoint to limit size-estimate requests
+            // Send is handled over rpc/dataconn ATM, which doesn't support the resource exhaustion status codes that gRPC defines
+            guard, err := fs.sizeEstimateRequestSem.Acquire(fanOutCtx)
+            if err != nil {
+                fanOutCancel()
+                return
+            }
+            defer guard.Release()
+
+            err = step.updateSizeEstimate(fanOutCtx)
             if err != nil {
                 log.WithError(err).WithField("step", step).Error("error computing size estimate")
                 fanOutCancel()
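The diff only shows how util/semaphore is used: New(max) builds the semaphore, Acquire(ctx) blocks until a slot is free or the context is cancelled, and the returned guard's Release() frees the slot. As a rough mental model (an assumption, not zrepl's actual implementation; the AcquireGuard name is a placeholder), a guard-style semaphore with that shape can be built on a buffered channel:

// Sketch of a guard-style semaphore matching the calls in the diff above;
// not zrepl's actual util/semaphore package.
package semaphore

import "context"

type S struct {
    slots chan struct{}
}

// New returns a semaphore that admits at most max concurrent holders.
func New(max int64) *S {
    return &S{slots: make(chan struct{}, max)}
}

type AcquireGuard struct {
    s *S
}

// Acquire blocks until a slot is free or the context is cancelled.
func (s *S) Acquire(ctx context.Context) (*AcquireGuard, error) {
    select {
    case s.slots <- struct{}{}:
        return &AcquireGuard{s: s}, nil
    case <-ctx.Done():
        return nil, ctx.Err()
    }
}

// Release frees the slot taken by Acquire.
func (g *AcquireGuard) Release() {
    <-g.s.slots
}

Cancellation matters here: when one step's estimate fails, fanOutCancel() cancels fanOutCtx, which also unblocks any goroutines still waiting inside Acquire.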