mirror of
https://github.com/zrepl/zrepl.git
synced 2025-01-13 09:49:32 +01:00
Merge branch 'problame/replication_refactor' into InsanePrawn-master
This commit is contained in:
commit
17818439a0
79
.circleci/config.yml
Normal file
79
.circleci/config.yml
Normal file
@ -0,0 +1,79 @@
|
||||
version: 2.0
|
||||
workflows:
|
||||
version: 2
|
||||
build:
|
||||
jobs:
|
||||
- build-1.11
|
||||
- build-1.12
|
||||
- build-latest
|
||||
jobs:
|
||||
# build-latest serves as the template
|
||||
# we use YAML anchors & aliases to exchange the docker image (and hence Go version used for the build)
|
||||
build-latest: &build-latest
|
||||
description: Builds zrepl
|
||||
parameters:
|
||||
image:
|
||||
description: "the docker image that the job should use"
|
||||
type: string
|
||||
docker:
|
||||
- image: circleci/golang:latest
|
||||
environment:
|
||||
# required by lazy.sh
|
||||
TERM: xterm
|
||||
working_directory: /go/src/github.com/zrepl/zrepl
|
||||
steps:
|
||||
- run:
|
||||
name: Setup environment variables
|
||||
command: |
|
||||
# used by pip (for docs)
|
||||
echo 'export PATH="$HOME/.local/bin:$PATH"' >> $BASH_ENV
|
||||
|
||||
- restore_cache:
|
||||
keys:
|
||||
- source
|
||||
- vendor
|
||||
- protobuf
|
||||
|
||||
- checkout
|
||||
|
||||
- save_cache:
|
||||
key: source
|
||||
paths:
|
||||
- ".git"
|
||||
|
||||
# install deps
|
||||
- run: wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip
|
||||
- run: echo "6003de742ea3fcf703cfec1cd4a3380fd143081a2eb0e559065563496af27807 protoc-3.6.1-linux-x86_64.zip" | sha256sum -c
|
||||
- run: sudo unzip -d /usr protoc-3.6.1-linux-x86_64.zip
|
||||
- save_cache:
|
||||
key: protobuf
|
||||
paths:
|
||||
- "/usr/include/google/protobuf"
|
||||
|
||||
- run: sudo apt install python3 python3-pip libgirepository1.0-dev
|
||||
- run: ./lazy.sh devsetup
|
||||
|
||||
- run: make vendordeps
|
||||
- save_cache:
|
||||
key: vendor
|
||||
paths:
|
||||
- "./vendor"
|
||||
|
||||
- run: make
|
||||
- run: make vet
|
||||
- run: make test
|
||||
- run: make release
|
||||
|
||||
- store_artifacts:
|
||||
path: ./artifacts/release
|
||||
when: always
|
||||
|
||||
|
||||
build-1.11:
|
||||
<<: *build-latest
|
||||
docker:
|
||||
- image: circleci/golang:1.11
|
||||
build-1.12:
|
||||
<<: *build-latest
|
||||
docker:
|
||||
- image: circleci/golang:1.12
|
84
.travis.yml
84
.travis.yml
@ -2,6 +2,7 @@ dist: xenial
|
||||
services:
|
||||
- docker
|
||||
|
||||
env: # for allow_failures: https://docs.travis-ci.com/user/customizing-the-build/
|
||||
matrix:
|
||||
include:
|
||||
|
||||
@ -15,45 +16,35 @@ matrix:
|
||||
--user "$(id -u):$(id -g)" \
|
||||
zrepl_build make vendordeps release
|
||||
|
||||
# all go entries vary only by go version
|
||||
- language: go
|
||||
- &zrepl_build_template
|
||||
language: go
|
||||
go_import_path: github.com/zrepl/zrepl
|
||||
before_install:
|
||||
- wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip
|
||||
- echo "6003de742ea3fcf703cfec1cd4a3380fd143081a2eb0e559065563496af27807 protoc-3.6.1-linux-x86_64.zip" | sha256sum -c
|
||||
- sudo unzip -d /usr protoc-3.6.1-linux-x86_64.zip
|
||||
- ./lazy.sh godep
|
||||
- make vendordeps
|
||||
script:
|
||||
- make
|
||||
- make vet
|
||||
- make test
|
||||
- make artifacts/zrepl-freebsd-amd64
|
||||
- make artifacts/zrepl-linux-amd64
|
||||
- make artifacts/zrepl-darwin-amd64
|
||||
go:
|
||||
- "1.11"
|
||||
go_import_path: github.com/zrepl/zrepl
|
||||
before_install:
|
||||
- wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip
|
||||
- echo "6003de742ea3fcf703cfec1cd4a3380fd143081a2eb0e559065563496af27807 protoc-3.6.1-linux-x86_64.zip" | sha256sum -c
|
||||
- sudo unzip -d /usr protoc-3.6.1-linux-x86_64.zip
|
||||
- ./lazy.sh godep
|
||||
- make vendordeps
|
||||
script:
|
||||
- make
|
||||
- make vet
|
||||
- make test
|
||||
- make artifacts/zrepl-freebsd-amd64
|
||||
- make artifacts/zrepl-linux-amd64
|
||||
- make artifacts/zrepl-darwin-amd64
|
||||
|
||||
- language: go
|
||||
- <<: *zrepl_build_template
|
||||
go:
|
||||
- "1.12"
|
||||
|
||||
- <<: *zrepl_build_template
|
||||
go:
|
||||
- "master"
|
||||
go_import_path: github.com/zrepl/zrepl
|
||||
before_install:
|
||||
- wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip
|
||||
- echo "6003de742ea3fcf703cfec1cd4a3380fd143081a2eb0e559065563496af27807 protoc-3.6.1-linux-x86_64.zip" | sha256sum -c
|
||||
- sudo unzip -d /usr protoc-3.6.1-linux-x86_64.zip
|
||||
- ./lazy.sh godep
|
||||
- make vendordeps
|
||||
script:
|
||||
- make
|
||||
- make vet
|
||||
- make test
|
||||
- make artifacts/zrepl-freebsd-amd64
|
||||
- make artifacts/zrepl-linux-amd64
|
||||
- make artifacts/zrepl-darwin-amd64
|
||||
|
||||
# all python entries vary only by python version
|
||||
- language: python
|
||||
- &zrepl_docs_template
|
||||
language: python
|
||||
python:
|
||||
- "3.4"
|
||||
install:
|
||||
@ -61,29 +52,18 @@ matrix:
|
||||
- pip install -r docs/requirements.txt
|
||||
script:
|
||||
- make docs
|
||||
- language: python
|
||||
- <<: *zrepl_docs_template
|
||||
python:
|
||||
- "3.5"
|
||||
install:
|
||||
- sudo apt-get install libgirepository1.0-dev
|
||||
- pip install -r docs/requirements.txt
|
||||
script:
|
||||
- make docs
|
||||
- language: python
|
||||
- <<: *zrepl_docs_template
|
||||
python:
|
||||
- "3.6"
|
||||
install:
|
||||
- sudo apt-get install libgirepository1.0-dev
|
||||
- pip install -r docs/requirements.txt
|
||||
script:
|
||||
- make docs
|
||||
- language: python
|
||||
- <<: *zrepl_docs_template
|
||||
python:
|
||||
- "3.7"
|
||||
install:
|
||||
- sudo apt-get install libgirepository1.0-dev
|
||||
- pip install -r docs/requirements.txt
|
||||
script:
|
||||
- make docs
|
||||
|
||||
|
||||
|
||||
allow_failures:
|
||||
- <<: *zrepl_build_template
|
||||
go:
|
||||
- "master"
|
||||
|
49
Gopkg.lock
generated
49
Gopkg.lock
generated
@ -89,6 +89,14 @@
|
||||
revision = "aa810b61a9c79d51363740d207bb46cf8e620ed5"
|
||||
version = "v1.2.0"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:ad92aa49f34cbc3546063c7eb2cabb55ee2278b72842eda80e2a20a8a06a8d73"
|
||||
name = "github.com/google/uuid"
|
||||
packages = ["."]
|
||||
pruneopts = ""
|
||||
revision = "0cd6bf5da1e1c83f8b45653022c74f71af0538a4"
|
||||
version = "v1.1.1"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:cb09475f771b9167fb9333629f5d6a7161572602ea040f1094602b0dc8709878"
|
||||
@ -161,6 +169,14 @@
|
||||
revision = "3247c84500bff8d9fb6d579d800f20b3e091582c"
|
||||
version = "v1.0.0"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:4ff67dde814694496d7aa31be44b900f9717a10c8bc9136b13f49c8ef97f439a"
|
||||
name = "github.com/montanaflynn/stats"
|
||||
packages = ["."]
|
||||
pruneopts = ""
|
||||
revision = "63fbb2597b7a13043b453a4b819945badb8f8926"
|
||||
version = "v0.5.0"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:f60ff065b58bd53e641112b38bbda9d2684deb828393c7ffb89c69a1ee301d17"
|
||||
@ -245,6 +261,14 @@
|
||||
pruneopts = ""
|
||||
revision = "8b1c2da0d56deffdbb9e48d4414b4e674bd8083e"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:3962f553b77bf6c03fc07cd687a22dd3b00fe11aa14d31194f5505f5bb65cdc8"
|
||||
name = "github.com/sergi/go-diff"
|
||||
packages = ["diffmatchpatch"]
|
||||
pruneopts = ""
|
||||
revision = "1744e2970ca51c86172c8190fadad617561ed6e7"
|
||||
version = "v1.0.0"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:146327ce93be37e68bd3ff8541090d96da8cb3adc9e35d57570e9170a29f6bf6"
|
||||
@ -280,6 +304,25 @@
|
||||
revision = "93babf24513d0e8277635da8169fcc5a46ae3f6a"
|
||||
version = "v1.11.0"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:529ed3f98838f69e13761788d0cc71b44e130058fab13bae2ce09f7a176bced4"
|
||||
name = "github.com/yudai/gojsondiff"
|
||||
packages = [
|
||||
".",
|
||||
"formatter",
|
||||
]
|
||||
pruneopts = ""
|
||||
revision = "7b1b7adf999dab73a6eb02669c3d82dbb27a3dd6"
|
||||
version = "1.0.0"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:9857bb2293f372b2181004d8b62179bbdb4ab0982ec6f762abe6cf2bfedaff85"
|
||||
name = "github.com/yudai/golcs"
|
||||
packages = ["."]
|
||||
pruneopts = ""
|
||||
revision = "ecda9a501e8220fae3b4b600c3db4b0ba22cfc68"
|
||||
|
||||
[[projects]]
|
||||
branch = "v2"
|
||||
digest = "1:6b8a6afafde7ed31cd0c577ba40d88ce39e8f1c5eb76d7836be7d5b74f1c534a"
|
||||
@ -403,9 +446,11 @@
|
||||
"github.com/go-logfmt/logfmt",
|
||||
"github.com/golang/protobuf/proto",
|
||||
"github.com/golang/protobuf/protoc-gen-go",
|
||||
"github.com/google/uuid",
|
||||
"github.com/jinzhu/copier",
|
||||
"github.com/kr/pretty",
|
||||
"github.com/mattn/go-isatty",
|
||||
"github.com/montanaflynn/stats",
|
||||
"github.com/pkg/errors",
|
||||
"github.com/pkg/profile",
|
||||
"github.com/problame/go-netssh",
|
||||
@ -415,14 +460,18 @@
|
||||
"github.com/spf13/pflag",
|
||||
"github.com/stretchr/testify/assert",
|
||||
"github.com/stretchr/testify/require",
|
||||
"github.com/yudai/gojsondiff",
|
||||
"github.com/yudai/gojsondiff/formatter",
|
||||
"github.com/zrepl/yaml-config",
|
||||
"golang.org/x/net/context",
|
||||
"golang.org/x/sys/unix",
|
||||
"golang.org/x/tools/cmd/stringer",
|
||||
"google.golang.org/grpc",
|
||||
"google.golang.org/grpc/codes",
|
||||
"google.golang.org/grpc/credentials",
|
||||
"google.golang.org/grpc/keepalive",
|
||||
"google.golang.org/grpc/peer",
|
||||
"google.golang.org/grpc/status",
|
||||
]
|
||||
solver-name = "gps-cdcl"
|
||||
solver-version = 1
|
||||
|
@ -59,3 +59,7 @@ required = [
|
||||
[[constraint]]
|
||||
name = "google.golang.org/grpc"
|
||||
version = "1"
|
||||
|
||||
[[constraint]]
|
||||
version = "1.1.0"
|
||||
name = "github.com/google/uuid"
|
||||
|
2
Makefile
2
Makefile
@ -27,7 +27,7 @@ vendordeps:
|
||||
dep ensure -v -vendor-only
|
||||
|
||||
generate: #not part of the build, must do that manually
|
||||
protoc -I=replication/pdu --go_out=plugins=grpc:replication/pdu replication/pdu/pdu.proto
|
||||
protoc -I=replication/logic/pdu --go_out=plugins=grpc:replication/logic/pdu replication/logic/pdu/pdu.proto
|
||||
go generate -x ./...
|
||||
|
||||
build:
|
||||
|
212
client/status.go
212
client/status.go
@ -10,8 +10,7 @@ import (
|
||||
"github.com/zrepl/zrepl/daemon"
|
||||
"github.com/zrepl/zrepl/daemon/job"
|
||||
"github.com/zrepl/zrepl/daemon/pruner"
|
||||
"github.com/zrepl/zrepl/replication"
|
||||
"github.com/zrepl/zrepl/replication/fsrep"
|
||||
"github.com/zrepl/zrepl/replication/report"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
@ -122,7 +121,7 @@ func wrap(s string, width int) string {
|
||||
if idx := strings.IndexAny(s, "\n\r"); idx != -1 && idx < rem {
|
||||
rem = idx+1
|
||||
}
|
||||
untilNewline := strings.TrimSpace(s[:rem])
|
||||
untilNewline := strings.TrimRight(s[:rem], "\n\r")
|
||||
s = s[rem:]
|
||||
if len(untilNewline) == 0 {
|
||||
continue
|
||||
@ -130,7 +129,7 @@ func wrap(s string, width int) string {
|
||||
b.WriteString(untilNewline)
|
||||
b.WriteString("\n")
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
return strings.TrimRight(b.String(), "\n\r")
|
||||
}
|
||||
|
||||
func (t *tui) printfDrawIndentedAndWrappedIfMultiline(format string, a ...interface{}) {
|
||||
@ -353,74 +352,91 @@ func (t *tui) draw() {
|
||||
termbox.Flush()
|
||||
}
|
||||
|
||||
func (t *tui) renderReplicationReport(rep *replication.Report, history *bytesProgressHistory) {
|
||||
func (t *tui) renderReplicationReport(rep *report.Report, history *bytesProgressHistory) {
|
||||
if rep == nil {
|
||||
t.printf("...\n")
|
||||
return
|
||||
}
|
||||
|
||||
all := make([]*fsrep.Report, 0, len(rep.Completed)+len(rep.Pending) + 1)
|
||||
all = append(all, rep.Completed...)
|
||||
all = append(all, rep.Pending...)
|
||||
if rep.Active != nil {
|
||||
all = append(all, rep.Active)
|
||||
if rep.WaitReconnectError != nil {
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: %s", rep.WaitReconnectError)
|
||||
t.newline()
|
||||
}
|
||||
sort.Slice(all, func(i, j int) bool {
|
||||
return all[i].Filesystem < all[j].Filesystem
|
||||
if !rep.WaitReconnectSince.IsZero() {
|
||||
delta := rep.WaitReconnectUntil.Sub(time.Now()).Round(time.Second)
|
||||
if rep.WaitReconnectUntil.IsZero() || delta > 0 {
|
||||
var until string
|
||||
if rep.WaitReconnectUntil.IsZero() {
|
||||
until = "waiting indefinitely"
|
||||
} else {
|
||||
until = fmt.Sprintf("hard fail in %s @ %s", delta, rep.WaitReconnectUntil)
|
||||
}
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: reconnecting with exponential backoff (since %s) (%s)",
|
||||
rep.WaitReconnectSince, until)
|
||||
} else {
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: reconnects reached hard-fail timeout @ %s", rep.WaitReconnectUntil)
|
||||
}
|
||||
t.newline()
|
||||
}
|
||||
|
||||
// TODO visualize more than the latest attempt by folding all attempts into one
|
||||
if len(rep.Attempts) == 0 {
|
||||
t.printf("no attempts made yet")
|
||||
return
|
||||
} else {
|
||||
t.printf("Attempt #%d", len(rep.Attempts))
|
||||
if len(rep.Attempts) > 1 {
|
||||
t.printf(". Previous attempts failed with the follwing statuses:")
|
||||
t.newline()
|
||||
t.addIndent(1)
|
||||
for i, a := range rep.Attempts[:len(rep.Attempts)-1] {
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("#%d: %s (failed at %s) (ran %s)", i + 1, a.State, a.FinishAt, a.FinishAt.Sub(a.StartAt))
|
||||
t.newline()
|
||||
}
|
||||
t.addIndent(-1)
|
||||
} else {
|
||||
t.newline()
|
||||
}
|
||||
}
|
||||
|
||||
latest := rep.Attempts[len(rep.Attempts)-1]
|
||||
sort.Slice(latest.Filesystems, func(i, j int) bool {
|
||||
return latest.Filesystems[i].Info.Name < latest.Filesystems[j].Info.Name
|
||||
})
|
||||
|
||||
state, err := replication.StateString(rep.Status)
|
||||
if err != nil {
|
||||
t.printf("Status: %q (parse error: %q)\n", rep.Status, err)
|
||||
return
|
||||
}
|
||||
|
||||
t.printf("Status: %s", state)
|
||||
t.printf("Status: %s", latest.State)
|
||||
t.newline()
|
||||
if rep.Problem != "" {
|
||||
if latest.State == report.AttemptPlanningError {
|
||||
t.printf("Problem: ")
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("%s", rep.Problem)
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("%s", latest.PlanError)
|
||||
t.newline()
|
||||
} else if latest.State == report.AttemptFanOutError {
|
||||
t.printf("Problem: one or more of the filesystems encountered errors")
|
||||
t.newline()
|
||||
}
|
||||
if rep.SleepUntil.After(time.Now()) && !state.IsTerminal() {
|
||||
t.printf("Sleeping until %s (%s left)\n", rep.SleepUntil, rep.SleepUntil.Sub(time.Now()))
|
||||
}
|
||||
|
||||
if state != replication.Planning && state != replication.PlanningError {
|
||||
if latest.State != report.AttemptPlanning && latest.State != report.AttemptPlanningError {
|
||||
// Draw global progress bar
|
||||
// Progress: [---------------]
|
||||
sumUpFSRep := func(rep *fsrep.Report) (transferred, total int64) {
|
||||
for _, s := range rep.Pending {
|
||||
transferred += s.Bytes
|
||||
total += s.ExpectedBytes
|
||||
}
|
||||
for _, s := range rep.Completed {
|
||||
transferred += s.Bytes
|
||||
total += s.ExpectedBytes
|
||||
}
|
||||
return
|
||||
}
|
||||
var transferred, total int64
|
||||
for _, fs := range all {
|
||||
fstx, fstotal := sumUpFSRep(fs)
|
||||
transferred += fstx
|
||||
total += fstotal
|
||||
}
|
||||
rate, changeCount := history.Update(transferred)
|
||||
expected, replicated := latest.BytesSum()
|
||||
rate, changeCount := history.Update(replicated)
|
||||
t.write("Progress: ")
|
||||
t.drawBar(50, transferred, total, changeCount)
|
||||
t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(transferred), ByteCountBinary(total), ByteCountBinary(rate)))
|
||||
t.drawBar(50, replicated, expected, changeCount)
|
||||
t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(replicated), ByteCountBinary(expected), ByteCountBinary(rate)))
|
||||
t.newline()
|
||||
|
||||
var maxFSLen int
|
||||
for _, fs := range latest.Filesystems {
|
||||
if len(fs.Info.Name) > maxFSLen {
|
||||
maxFSLen = len(fs.Info.Name)
|
||||
}
|
||||
}
|
||||
for _, fs := range latest.Filesystems {
|
||||
t.printFilesystemStatus(fs, false, maxFSLen) // FIXME bring 'active' flag back
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
var maxFSLen int
|
||||
for _, fs := range all {
|
||||
if len(fs.Filesystem) > maxFSLen {
|
||||
maxFSLen = len(fs.Filesystem)
|
||||
}
|
||||
}
|
||||
for _, fs := range all {
|
||||
t.printFilesystemStatus(fs, fs == rep.Active, maxFSLen)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *tui) renderPrunerReport(r *pruner.Report) {
|
||||
@ -441,9 +457,6 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {
|
||||
if r.Error != "" {
|
||||
t.printf("Error: %s\n", r.Error)
|
||||
}
|
||||
if r.SleepUntil.After(time.Now()) {
|
||||
t.printf("Sleeping until %s (%s left)\n", r.SleepUntil, r.SleepUntil.Sub(time.Now()))
|
||||
}
|
||||
|
||||
type commonFS struct {
|
||||
*pruner.FSReport
|
||||
@ -459,8 +472,7 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {
|
||||
|
||||
switch state {
|
||||
case pruner.Plan: fallthrough
|
||||
case pruner.PlanWait: fallthrough
|
||||
case pruner.ErrPerm:
|
||||
case pruner.PlanErr:
|
||||
return
|
||||
}
|
||||
|
||||
@ -500,8 +512,18 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {
|
||||
for _, fs := range all {
|
||||
t.write(rightPad(fs.Filesystem, maxFSname, " "))
|
||||
t.write(" ")
|
||||
if !fs.SkipReason.NotSkipped() {
|
||||
t.printf("skipped: %s\n", fs.SkipReason)
|
||||
continue
|
||||
}
|
||||
if fs.LastError != "" {
|
||||
t.printf("ERROR (%d): %s\n", fs.ErrorCount, fs.LastError) // whitespace is padding
|
||||
if strings.ContainsAny(fs.LastError, "\r\n") {
|
||||
t.printf("ERROR:")
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("%s\n", fs.LastError)
|
||||
} else {
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("ERROR: %s\n", fs.LastError)
|
||||
}
|
||||
t.newline()
|
||||
continue
|
||||
}
|
||||
|
||||
@ -524,25 +546,6 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {
|
||||
|
||||
}
|
||||
|
||||
const snapshotIndent = 1
|
||||
func calculateMaxFSLength(all []*fsrep.Report) (maxFS, maxStatus int) {
|
||||
for _, e := range all {
|
||||
if len(e.Filesystem) > maxFS {
|
||||
maxFS = len(e.Filesystem)
|
||||
}
|
||||
all2 := make([]*fsrep.StepReport, 0, len(e.Pending) + len(e.Completed))
|
||||
all2 = append(all2, e.Pending...)
|
||||
all2 = append(all2, e.Completed...)
|
||||
for _, e2 := range all2 {
|
||||
elen := len(e2.Problem) + len(e2.From) + len(e2.To) + 60 // random spacing, units, labels, etc
|
||||
if elen > maxStatus {
|
||||
maxStatus = elen
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func times(str string, n int) (out string) {
|
||||
for i := 0; i < n; i++ {
|
||||
out += str
|
||||
@ -586,35 +589,13 @@ func (t *tui) drawBar(length int, bytes, totalBytes int64, changeCount int) {
|
||||
t.write("]")
|
||||
}
|
||||
|
||||
func StringStepState(s fsrep.StepState) string {
|
||||
switch s {
|
||||
case fsrep.StepReplicationReady: return "Ready"
|
||||
case fsrep.StepMarkReplicatedReady: return "MarkReady"
|
||||
case fsrep.StepCompleted: return "Completed"
|
||||
default:
|
||||
return fmt.Sprintf("UNKNOWN %d", s)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
|
||||
|
||||
bytes := int64(0)
|
||||
totalBytes := int64(0)
|
||||
for _, s := range rep.Pending {
|
||||
bytes += s.Bytes
|
||||
totalBytes += s.ExpectedBytes
|
||||
}
|
||||
for _, s := range rep.Completed {
|
||||
bytes += s.Bytes
|
||||
totalBytes += s.ExpectedBytes
|
||||
}
|
||||
|
||||
func (t *tui) printFilesystemStatus(rep *report.FilesystemReport, active bool, maxFS int) {
|
||||
|
||||
expected, replicated := rep.BytesSum()
|
||||
status := fmt.Sprintf("%s (step %d/%d, %s/%s)",
|
||||
rep.Status,
|
||||
len(rep.Completed), len(rep.Pending) + len(rep.Completed),
|
||||
ByteCountBinary(bytes), ByteCountBinary(totalBytes),
|
||||
|
||||
strings.ToUpper(string(rep.State)),
|
||||
rep.CurrentStep, len(rep.Steps),
|
||||
ByteCountBinary(replicated), ByteCountBinary(expected),
|
||||
)
|
||||
|
||||
activeIndicator := " "
|
||||
@ -623,18 +604,23 @@ func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
|
||||
}
|
||||
t.printf("%s %s %s ",
|
||||
activeIndicator,
|
||||
rightPad(rep.Filesystem, maxFS, " "),
|
||||
rightPad(rep.Info.Name, maxFS, " "),
|
||||
status)
|
||||
|
||||
next := ""
|
||||
if rep.Problem != "" {
|
||||
next = rep.Problem
|
||||
} else if len(rep.Pending) > 0 {
|
||||
if rep.Pending[0].From != "" {
|
||||
next = fmt.Sprintf("next: %s => %s", rep.Pending[0].From, rep.Pending[0].To)
|
||||
if err := rep.Error(); err != nil {
|
||||
next = err.Err
|
||||
} else if rep.State != report.FilesystemDone {
|
||||
if nextStep := rep.NextStep(); nextStep != nil {
|
||||
if nextStep.IsIncremental() {
|
||||
next = fmt.Sprintf("next: %s => %s", nextStep.Info.From, nextStep.Info.To)
|
||||
} else {
|
||||
next = fmt.Sprintf("next: %s (full)", nextStep.Info.To)
|
||||
}
|
||||
} else {
|
||||
next = fmt.Sprintf("next: %s (full)", rep.Pending[0].To)
|
||||
next = "" // individual FSes may still be in planning state
|
||||
}
|
||||
|
||||
}
|
||||
t.printfDrawIndentedAndWrappedIfMultiline("%s", next)
|
||||
|
||||
|
@ -78,7 +78,38 @@ type PushJob struct {
|
||||
type PullJob struct {
|
||||
ActiveJob `yaml:",inline"`
|
||||
RootFS string `yaml:"root_fs"`
|
||||
Interval time.Duration `yaml:"interval,positive"`
|
||||
Interval PositiveDurationOrManual `yaml:"interval"`
|
||||
}
|
||||
|
||||
type PositiveDurationOrManual struct {
|
||||
Interval time.Duration
|
||||
Manual bool
|
||||
}
|
||||
|
||||
var _ yaml.Unmarshaler = (*PositiveDurationOrManual)(nil)
|
||||
|
||||
func (i *PositiveDurationOrManual) UnmarshalYAML(u func(interface{}, bool) error) (err error) {
|
||||
var s string
|
||||
if err := u(&s, true); err != nil {
|
||||
return err
|
||||
}
|
||||
switch s {
|
||||
case "manual":
|
||||
i.Manual = true
|
||||
i.Interval = 0
|
||||
case "":
|
||||
return fmt.Errorf("value must not be empty")
|
||||
default:
|
||||
i.Manual = false
|
||||
i.Interval, err = time.ParseDuration(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if i.Interval <= 0 {
|
||||
return fmt.Errorf("value must be a positive duration, got %q", s)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type SinkJob struct {
|
||||
|
41
config/config_positiveintervalormanual_test.go
Normal file
41
config/config_positiveintervalormanual_test.go
Normal file
@ -0,0 +1,41 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/zrepl/yaml-config"
|
||||
)
|
||||
|
||||
func TestPositiveDurationOrManual(t *testing.T) {
|
||||
cases := []struct {
|
||||
Comment, Input string
|
||||
Result *PositiveDurationOrManual
|
||||
}{
|
||||
{"empty is error", "", nil},
|
||||
{"negative is error", "-1s", nil},
|
||||
{"zero seconds is error", "0s", nil},
|
||||
{"zero is error", "0", nil},
|
||||
{"non-manual is error", "something", nil},
|
||||
{"positive seconds works", "1s", &PositiveDurationOrManual{Manual: false, Interval: 1 * time.Second}},
|
||||
{"manual works", "manual", &PositiveDurationOrManual{Manual: true, Interval: 0}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.Comment, func(t *testing.T) {
|
||||
var out struct {
|
||||
FieldName PositiveDurationOrManual `yaml:"fieldname"`
|
||||
}
|
||||
input := fmt.Sprintf("\nfieldname: %s\n", tc.Input)
|
||||
err := yaml.UnmarshalStrict([]byte(input), &out)
|
||||
if tc.Result == nil {
|
||||
assert.Error(t, err)
|
||||
t.Logf("%#v", out)
|
||||
} else {
|
||||
assert.Equal(t, *tc.Result, out.FieldName)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
@ -17,10 +17,12 @@ import (
|
||||
"github.com/zrepl/zrepl/daemon/snapper"
|
||||
"github.com/zrepl/zrepl/endpoint"
|
||||
"github.com/zrepl/zrepl/replication"
|
||||
"github.com/zrepl/zrepl/replication/driver"
|
||||
"github.com/zrepl/zrepl/replication/logic"
|
||||
"github.com/zrepl/zrepl/replication/report"
|
||||
"github.com/zrepl/zrepl/rpc"
|
||||
"github.com/zrepl/zrepl/transport"
|
||||
"github.com/zrepl/zrepl/transport/fromconfig"
|
||||
"github.com/zrepl/zrepl/util/envconst"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
)
|
||||
|
||||
@ -53,7 +55,7 @@ type activeSideTasks struct {
|
||||
state ActiveSideState
|
||||
|
||||
// valid for state ActiveSideReplicating, ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
|
||||
replication *replication.Replication
|
||||
replicationReport driver.ReportFunc
|
||||
replicationCancel context.CancelFunc
|
||||
|
||||
// valid for state ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
|
||||
@ -79,7 +81,7 @@ func (a *ActiveSide) updateTasks(u func(*activeSideTasks)) activeSideTasks {
|
||||
type activeMode interface {
|
||||
ConnectEndpoints(rpcLoggers rpc.Loggers, connecter transport.Connecter)
|
||||
DisconnectEndpoints()
|
||||
SenderReceiver() (replication.Sender, replication.Receiver)
|
||||
SenderReceiver() (logic.Sender, logic.Receiver)
|
||||
Type() Type
|
||||
RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{})
|
||||
ResetConnectBackoff()
|
||||
@ -111,7 +113,7 @@ func (m *modePush) DisconnectEndpoints() {
|
||||
m.receiver = nil
|
||||
}
|
||||
|
||||
func (m *modePush) SenderReceiver() (replication.Sender, replication.Receiver) {
|
||||
func (m *modePush) SenderReceiver() (logic.Sender, logic.Receiver) {
|
||||
m.setupMtx.Lock()
|
||||
defer m.setupMtx.Unlock()
|
||||
return m.sender, m.receiver
|
||||
@ -151,7 +153,7 @@ type modePull struct {
|
||||
receiver *endpoint.Receiver
|
||||
sender *rpc.Client
|
||||
rootFS *zfs.DatasetPath
|
||||
interval time.Duration
|
||||
interval config.PositiveDurationOrManual
|
||||
}
|
||||
|
||||
func (m *modePull) ConnectEndpoints(loggers rpc.Loggers, connecter transport.Connecter) {
|
||||
@ -172,7 +174,7 @@ func (m *modePull) DisconnectEndpoints() {
|
||||
m.receiver = nil
|
||||
}
|
||||
|
||||
func (m *modePull) SenderReceiver() (replication.Sender, replication.Receiver) {
|
||||
func (m *modePull) SenderReceiver() (logic.Sender, logic.Receiver) {
|
||||
m.setupMtx.Lock()
|
||||
defer m.setupMtx.Unlock()
|
||||
return m.sender, m.receiver
|
||||
@ -181,7 +183,12 @@ func (m *modePull) SenderReceiver() (replication.Sender, replication.Receiver) {
|
||||
func (*modePull) Type() Type { return TypePull }
|
||||
|
||||
func (m *modePull) RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{}) {
|
||||
t := time.NewTicker(m.interval)
|
||||
if m.interval.Manual {
|
||||
GetLogger(ctx).Info("manual pull configured, periodic pull disabled")
|
||||
// "waiting for wakeups" is printed in common ActiveSide.do
|
||||
return
|
||||
}
|
||||
t := time.NewTicker(m.interval.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
@ -210,9 +217,6 @@ func (m *modePull) ResetConnectBackoff() {
|
||||
|
||||
func modePullFromConfig(g *config.Global, in *config.PullJob) (m *modePull, err error) {
|
||||
m = &modePull{}
|
||||
if in.Interval <= 0 {
|
||||
return nil, errors.New("interval must be positive")
|
||||
}
|
||||
m.interval = in.Interval
|
||||
|
||||
m.rootFS, err = zfs.NewDatasetPath(in.RootFS)
|
||||
@ -274,7 +278,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
||||
func (j *ActiveSide) Name() string { return j.name }
|
||||
|
||||
type ActiveSideStatus struct {
|
||||
Replication *replication.Report
|
||||
Replication *report.Report
|
||||
PruningSender, PruningReceiver *pruner.Report
|
||||
}
|
||||
|
||||
@ -283,8 +287,8 @@ func (j *ActiveSide) Status() *Status {
|
||||
|
||||
s := &ActiveSideStatus{}
|
||||
t := j.mode.Type()
|
||||
if tasks.replication != nil {
|
||||
s.Replication = tasks.replication.Report()
|
||||
if tasks.replicationReport != nil {
|
||||
s.Replication = tasks.replicationReport()
|
||||
}
|
||||
if tasks.prunerSender != nil {
|
||||
s.PruningSender = tasks.prunerSender.Report()
|
||||
@ -345,78 +349,6 @@ func (j *ActiveSide) do(ctx context.Context) {
|
||||
}
|
||||
}()
|
||||
|
||||
// The code after this watchdog goroutine is sequential and transitions the state from
|
||||
// ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
|
||||
// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
|
||||
// cancel its context.
|
||||
// If the task is written to support context cancellation, it will return immediately (in permanent error state),
|
||||
// and the sequential code above transitions to the next state.
|
||||
go func() {
|
||||
|
||||
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
|
||||
jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
|
||||
// shadowing!
|
||||
log := log.WithField("watchdog_timeout", wdto.String())
|
||||
|
||||
log.Debug("starting watchdog")
|
||||
defer log.Debug("watchdog stopped")
|
||||
|
||||
t := time.NewTicker(wdto)
|
||||
defer t.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C: // fall
|
||||
}
|
||||
|
||||
j.updateTasks(func(tasks *activeSideTasks) {
|
||||
// Since cancelling a task will cause the sequential code to transition to the next state immediately,
|
||||
// we cannot check for its progress right then (no fallthrough).
|
||||
// Instead, we return (not continue because we are in a closure) and give the new state another
|
||||
// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
|
||||
|
||||
log.WithField("state", tasks.state).Debug("watchdog firing")
|
||||
|
||||
const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
|
||||
|
||||
switch tasks.state {
|
||||
case ActiveSideReplicating:
|
||||
log.WithField("replication_progress", tasks.replication.Progress.String()).
|
||||
Debug("check replication progress")
|
||||
if tasks.replication.Progress.CheckTimeout(wdto, jitter) {
|
||||
log.Error("replication did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
|
||||
tasks.replicationCancel()
|
||||
return
|
||||
}
|
||||
case ActiveSidePruneSender:
|
||||
log.WithField("prune_sender_progress", tasks.replication.Progress.String()).
|
||||
Debug("check pruner_sender progress")
|
||||
if tasks.prunerSender.Progress.CheckTimeout(wdto, jitter) {
|
||||
log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
|
||||
tasks.prunerSenderCancel()
|
||||
return
|
||||
}
|
||||
case ActiveSidePruneReceiver:
|
||||
log.WithField("prune_receiver_progress", tasks.replication.Progress.String()).
|
||||
Debug("check pruner_receiver progress")
|
||||
if tasks.prunerReceiver.Progress.CheckTimeout(wdto, jitter) {
|
||||
log.Error("pruner_receiver did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
|
||||
tasks.prunerReceiverCancel()
|
||||
return
|
||||
}
|
||||
case ActiveSideDone:
|
||||
// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
|
||||
default:
|
||||
log.WithField("state", tasks.state).
|
||||
Error("watchdog implementation error: unknown active side state")
|
||||
}
|
||||
})
|
||||
|
||||
}
|
||||
}()
|
||||
|
||||
sender, receiver := j.mode.SenderReceiver()
|
||||
|
||||
{
|
||||
@ -426,16 +358,19 @@ func (j *ActiveSide) do(ctx context.Context) {
|
||||
default:
|
||||
}
|
||||
ctx, repCancel := context.WithCancel(ctx)
|
||||
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
||||
var repWait driver.WaitFunc
|
||||
j.updateTasks(func(tasks *activeSideTasks) {
|
||||
// reset it
|
||||
*tasks = activeSideTasks{}
|
||||
tasks.replicationCancel = repCancel
|
||||
tasks.replication = replication.NewReplication(j.promRepStateSecs, j.promBytesReplicated)
|
||||
tasks.replicationReport, repWait = replication.Do(
|
||||
ctx, logic.NewPlanner(j.promRepStateSecs, j.promBytesReplicated, sender, receiver),
|
||||
)
|
||||
tasks.state = ActiveSideReplicating
|
||||
})
|
||||
log.Info("start replication")
|
||||
tasks.replication.Drive(ctx, sender, receiver)
|
||||
repCancel() // always cancel to free up context resources
|
||||
repWait(true) // wait blocking
|
||||
repCancel() // always cancel to free up context resources
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -15,7 +15,7 @@ import (
|
||||
"github.com/zrepl/zrepl/daemon/snapper"
|
||||
"github.com/zrepl/zrepl/endpoint"
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/replication"
|
||||
"github.com/zrepl/zrepl/replication/driver"
|
||||
"github.com/zrepl/zrepl/rpc"
|
||||
"github.com/zrepl/zrepl/rpc/transportmux"
|
||||
"github.com/zrepl/zrepl/tlsconf"
|
||||
@ -79,7 +79,7 @@ const (
|
||||
)
|
||||
|
||||
func WithSubsystemLoggers(ctx context.Context, log logger.Logger) context.Context {
|
||||
ctx = replication.WithLogger(ctx, log.WithField(SubsysField, SubsysReplication))
|
||||
ctx = driver.WithLogger(ctx, log.WithField(SubsysField, SubsysReplication))
|
||||
ctx = endpoint.WithLogger(ctx, log.WithField(SubsysField, SubsyEndpoint))
|
||||
ctx = pruner.WithLogger(ctx, log.WithField(SubsysField, SubsysPruning))
|
||||
ctx = snapper.WithLogger(ctx, log.WithField(SubsysField, SubsysSnapshot))
|
||||
|
@ -8,10 +8,9 @@ import (
|
||||
"github.com/zrepl/zrepl/config"
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/pruning"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/util/envconst"
|
||||
"github.com/zrepl/zrepl/util/watchdog"
|
||||
"net"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
@ -21,6 +20,7 @@ import (
|
||||
// Try to keep it compatible with gitub.com/zrepl/zrepl/endpoint.Endpoint
|
||||
type History interface {
|
||||
ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
|
||||
ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
|
||||
}
|
||||
|
||||
// Try to keep it compatible with gitub.com/zrepl/zrepl/endpoint.Endpoint
|
||||
@ -66,8 +66,7 @@ type Pruner struct {
|
||||
|
||||
state State
|
||||
|
||||
// State ErrWait|ErrPerm
|
||||
sleepUntil time.Time
|
||||
// State PlanErr
|
||||
err error
|
||||
|
||||
// State Exec
|
||||
@ -206,71 +205,43 @@ type State int
|
||||
|
||||
const (
|
||||
Plan State = 1 << iota
|
||||
PlanWait
|
||||
PlanErr
|
||||
Exec
|
||||
ExecWait
|
||||
ErrPerm
|
||||
ExecErr
|
||||
Done
|
||||
)
|
||||
|
||||
func (s State) statefunc() state {
|
||||
var statemap = map[State]state{
|
||||
Plan: statePlan,
|
||||
PlanWait: statePlanWait,
|
||||
Exec: stateExec,
|
||||
ExecWait: stateExecWait,
|
||||
ErrPerm: nil,
|
||||
Done: nil,
|
||||
}
|
||||
return statemap[s]
|
||||
}
|
||||
|
||||
func (s State) IsTerminal() bool {
|
||||
return s.statefunc() == nil
|
||||
}
|
||||
|
||||
type updater func(func(*Pruner)) State
|
||||
type state func(args *args, u updater) state
|
||||
type updater func(func(*Pruner))
|
||||
|
||||
func (p *Pruner) Prune() {
|
||||
p.prune(p.args)
|
||||
}
|
||||
|
||||
func (p *Pruner) prune(args args) {
|
||||
s := p.state.statefunc()
|
||||
for s != nil {
|
||||
pre := p.state
|
||||
s = s(&args, func(f func(*Pruner)) State {
|
||||
u := func(f func(*Pruner)) {
|
||||
p.mtx.Lock()
|
||||
defer p.mtx.Unlock()
|
||||
f(p)
|
||||
return p.state
|
||||
})
|
||||
post := p.state
|
||||
GetLogger(args.ctx).
|
||||
WithField("transition", fmt.Sprintf("%s=>%s", pre, post)).
|
||||
Debug("state transition")
|
||||
if err := p.Error(); err != nil {
|
||||
GetLogger(args.ctx).
|
||||
WithError(p.err).
|
||||
WithField("state", post.String()).
|
||||
Error("entering error state after error")
|
||||
}
|
||||
// TODO support automatic retries
|
||||
// It is advisable to merge this code with package replication/driver before
|
||||
// That will likely require re-modelling struct fs like replication/driver.attempt,
|
||||
// including figuring out how to resume a plan after being interrupted by network errors
|
||||
// The non-retrying code in this package should move straight to replication/logic.
|
||||
doOneAttempt(&args, u)
|
||||
}
|
||||
}
|
||||
|
||||
type Report struct {
|
||||
State string
|
||||
SleepUntil time.Time
|
||||
Error string
|
||||
State string
|
||||
Error string
|
||||
Pending, Completed []FSReport
|
||||
}
|
||||
|
||||
type FSReport struct {
|
||||
Filesystem string
|
||||
Filesystem string
|
||||
SnapshotList, DestroyList []SnapshotReport
|
||||
ErrorCount int
|
||||
LastError string
|
||||
SkipReason FSSkipReason
|
||||
LastError string
|
||||
}
|
||||
|
||||
type SnapshotReport struct {
|
||||
@ -285,14 +256,9 @@ func (p *Pruner) Report() *Report {
|
||||
|
||||
r := Report{State: p.state.String()}
|
||||
|
||||
if p.state & (PlanWait|ExecWait) != 0 {
|
||||
r.SleepUntil = p.sleepUntil
|
||||
}
|
||||
if p.state & (PlanWait|ExecWait|ErrPerm) != 0 {
|
||||
if p.err != nil {
|
||||
r.Error = p.err.Error()
|
||||
}
|
||||
}
|
||||
|
||||
if p.execQueue != nil {
|
||||
r.Pending, r.Completed = p.execQueue.Report()
|
||||
@ -307,20 +273,16 @@ func (p *Pruner) State() State {
|
||||
return p.state
|
||||
}
|
||||
|
||||
func (p *Pruner) Error() error {
|
||||
p.mtx.Lock()
|
||||
defer p.mtx.Unlock()
|
||||
if p.state & (PlanWait|ExecWait|ErrPerm) != 0 {
|
||||
return p.err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type fs struct {
|
||||
path string
|
||||
|
||||
// permanent error during planning
|
||||
planErr error
|
||||
planErr error
|
||||
planErrContext string
|
||||
|
||||
// if != "", the fs was skipped for planning and the field
|
||||
// contains the reason
|
||||
skipReason FSSkipReason
|
||||
|
||||
// snapshots presented by target
|
||||
// (type snapshot)
|
||||
@ -333,8 +295,18 @@ type fs struct {
|
||||
|
||||
// only during Exec state, also used by execQueue
|
||||
execErrLast error
|
||||
execErrCount int
|
||||
}
|
||||
|
||||
type FSSkipReason string
|
||||
|
||||
const (
|
||||
NotSkipped = ""
|
||||
SkipPlaceholder = "filesystem is placeholder"
|
||||
SkipNoCorrespondenceOnSender = "filesystem has no correspondence on sender"
|
||||
)
|
||||
|
||||
func (r FSSkipReason) NotSkipped() bool {
|
||||
return r == NotSkipped
|
||||
}
|
||||
|
||||
func (f *fs) Report() FSReport {
|
||||
@ -343,7 +315,11 @@ func (f *fs) Report() FSReport {
|
||||
|
||||
r := FSReport{}
|
||||
r.Filesystem = f.path
|
||||
r.ErrorCount = f.execErrCount
|
||||
r.SkipReason = f.skipReason
|
||||
if !r.SkipReason.NotSkipped() {
|
||||
return r
|
||||
}
|
||||
|
||||
if f.planErr != nil {
|
||||
r.LastError = f.planErr.Error()
|
||||
} else if f.execErrLast != nil {
|
||||
@ -385,39 +361,7 @@ func (s snapshot) Replicated() bool { return s.replicated }
|
||||
|
||||
func (s snapshot) Date() time.Time { return s.date }
|
||||
|
||||
type Error interface {
|
||||
error
|
||||
Temporary() bool
|
||||
}
|
||||
|
||||
var _ Error = net.Error(nil)
|
||||
|
||||
func shouldRetry(e error) bool {
|
||||
if neterr, ok := e.(net.Error); ok {
|
||||
return neterr.Temporary()
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func onErr(u updater, e error) state {
|
||||
return u(func(p *Pruner) {
|
||||
p.err = e
|
||||
if !shouldRetry(e) {
|
||||
p.state = ErrPerm
|
||||
return
|
||||
}
|
||||
switch p.state {
|
||||
case Plan:
|
||||
p.state = PlanWait
|
||||
case Exec:
|
||||
p.state = ExecWait
|
||||
default:
|
||||
panic(p.state)
|
||||
}
|
||||
}).statefunc()
|
||||
}
|
||||
|
||||
func statePlan(a *args, u updater) state {
|
||||
func doOneAttempt(a *args, u updater) {
|
||||
|
||||
ctx, target, receiver := a.ctx, a.target, a.receiver
|
||||
var ka *watchdog.KeepAlive
|
||||
@ -425,28 +369,62 @@ func statePlan(a *args, u updater) state {
|
||||
ka = &pruner.Progress
|
||||
})
|
||||
|
||||
sfssres, err := receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
u(func(p *Pruner) {
|
||||
p.state = PlanErr
|
||||
p.err = err
|
||||
})
|
||||
return
|
||||
}
|
||||
sfss := make(map[string]*pdu.Filesystem)
|
||||
for _, sfs := range sfssres.GetFilesystems() {
|
||||
sfss[sfs.GetPath()] = sfs
|
||||
}
|
||||
|
||||
tfssres, err := target.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
return onErr(u, err)
|
||||
u(func(p *Pruner) {
|
||||
p.state = PlanErr
|
||||
p.err = err
|
||||
})
|
||||
return
|
||||
}
|
||||
tfss := tfssres.GetFilesystems()
|
||||
|
||||
pfss := make([]*fs, len(tfss))
|
||||
tfss_loop:
|
||||
for i, tfs := range tfss {
|
||||
|
||||
l := GetLogger(ctx).WithField("fs", tfs.Path)
|
||||
l.Debug("plan filesystem")
|
||||
|
||||
|
||||
pfs := &fs{
|
||||
path: tfs.Path,
|
||||
path: tfs.Path,
|
||||
}
|
||||
pfss[i] = pfs
|
||||
|
||||
if tfs.GetIsPlaceholder() {
|
||||
pfs.skipReason = SkipPlaceholder
|
||||
l.WithField("skip_reason", pfs.skipReason).Debug("skipping filesystem")
|
||||
continue
|
||||
} else if sfs := sfss[tfs.GetPath()]; sfs == nil {
|
||||
pfs.skipReason = SkipNoCorrespondenceOnSender
|
||||
l.WithField("skip_reason", pfs.skipReason).WithField("sfs", sfs.GetPath()).Debug("skipping filesystem")
|
||||
continue
|
||||
}
|
||||
|
||||
pfsPlanErrAndLog := func(err error, message string) {
|
||||
t := fmt.Sprintf("%T", err)
|
||||
pfs.planErr = err
|
||||
pfs.planErrContext = message
|
||||
l.WithField("orig_err_type", t).WithError(err).Error(fmt.Sprintf("%s: plan error, skipping filesystem", message))
|
||||
}
|
||||
|
||||
tfsvsres, err := target.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: tfs.Path})
|
||||
if err != nil {
|
||||
l.WithError(err).Error("cannot list filesystem versions")
|
||||
return onErr(u, err)
|
||||
pfsPlanErrAndLog(err, "cannot list filesystem versions")
|
||||
continue tfss_loop
|
||||
}
|
||||
tfsvs := tfsvsres.GetVersions()
|
||||
// no progress here since we could run in a live-lock (must have used target AND receiver before progress)
|
||||
@ -455,24 +433,22 @@ func statePlan(a *args, u updater) state {
|
||||
|
||||
rcReq := &pdu.ReplicationCursorReq{
|
||||
Filesystem: tfs.Path,
|
||||
Op: &pdu.ReplicationCursorReq_Get{
|
||||
Op: &pdu.ReplicationCursorReq_Get{
|
||||
Get: &pdu.ReplicationCursorReq_GetOp{},
|
||||
},
|
||||
}
|
||||
rc, err := receiver.ReplicationCursor(ctx, rcReq)
|
||||
if err != nil {
|
||||
l.WithError(err).Error("cannot get replication cursor")
|
||||
return onErr(u, err)
|
||||
pfsPlanErrAndLog(err, "cannot get replication cursor bookmark")
|
||||
continue tfss_loop
|
||||
}
|
||||
ka.MadeProgress()
|
||||
if rc.GetNotexist() {
|
||||
l.Error("replication cursor does not exist, skipping")
|
||||
pfs.destroyList = []pruning.Snapshot{}
|
||||
pfs.planErr = fmt.Errorf("replication cursor bookmark does not exist (one successful replication is required before pruning works)")
|
||||
continue
|
||||
if rc.GetNotexist() {
|
||||
err := errors.New("replication cursor bookmark does not exist (one successful replication is required before pruning works)")
|
||||
pfsPlanErrAndLog(err, "")
|
||||
continue tfss_loop
|
||||
}
|
||||
|
||||
|
||||
// scan from older to newer, all snapshots older than cursor are interpreted as replicated
|
||||
sort.Slice(tfsvs, func(i, j int) bool {
|
||||
return tfsvs[i].CreateTXG < tfsvs[j].CreateTXG
|
||||
@ -494,11 +470,9 @@ func statePlan(a *args, u updater) state {
|
||||
}
|
||||
creation, err := tfsv.CreationAsTime()
|
||||
if err != nil {
|
||||
err := fmt.Errorf("%s%s has invalid creation date: %s", tfs, tfsv.RelName(), err)
|
||||
l.WithError(err).
|
||||
WithField("tfsv", tfsv.RelName()).
|
||||
Error("error with fileesystem version")
|
||||
return onErr(u, err)
|
||||
err := fmt.Errorf("%s: %s", tfsv.RelName(), err)
|
||||
pfsPlanErrAndLog(err, "fs version with invalid creation date")
|
||||
continue tfss_loop
|
||||
}
|
||||
// note that we cannot use CreateTXG because target and receiver could be on different pools
|
||||
atCursor := tfsv.Guid == rc.GetGuid()
|
||||
@ -510,9 +484,8 @@ func statePlan(a *args, u updater) state {
|
||||
})
|
||||
}
|
||||
if preCursor {
|
||||
err := fmt.Errorf("replication cursor not found in prune target filesystem versions")
|
||||
l.Error(err.Error())
|
||||
return onErr(u, err)
|
||||
pfsPlanErrAndLog(fmt.Errorf("replication cursor not found in prune target filesystem versions"), "")
|
||||
continue tfss_loop
|
||||
}
|
||||
|
||||
// Apply prune rules
|
||||
@ -520,34 +493,56 @@ func statePlan(a *args, u updater) state {
|
||||
ka.MadeProgress()
|
||||
}
|
||||
|
||||
return u(func(pruner *Pruner) {
|
||||
u(func(pruner *Pruner) {
|
||||
pruner.Progress.MadeProgress()
|
||||
pruner.execQueue = newExecQueue(len(pfss))
|
||||
for _, pfs := range pfss {
|
||||
pruner.execQueue.Put(pfs, nil, false)
|
||||
}
|
||||
pruner.state = Exec
|
||||
}).statefunc()
|
||||
}
|
||||
|
||||
func stateExec(a *args, u updater) state {
|
||||
})
|
||||
|
||||
for {
|
||||
var pfs *fs
|
||||
state := u(func(pruner *Pruner) {
|
||||
u(func(pruner *Pruner) {
|
||||
pfs = pruner.execQueue.Pop()
|
||||
})
|
||||
if pfs == nil {
|
||||
nextState := Done
|
||||
if pruner.execQueue.HasCompletedFSWithErrors() {
|
||||
nextState = ErrPerm
|
||||
break
|
||||
}
|
||||
doOneAttemptExec(a, u, pfs)
|
||||
}
|
||||
|
||||
var rep *Report
|
||||
{
|
||||
// must not hold lock for report
|
||||
var pruner *Pruner
|
||||
u(func(p *Pruner) {
|
||||
pruner = p
|
||||
})
|
||||
rep = pruner.Report()
|
||||
}
|
||||
u(func(p *Pruner) {
|
||||
if len(rep.Pending) > 0 {
|
||||
panic("queue should not have pending items at this point")
|
||||
}
|
||||
hadErr := false
|
||||
for _, fsr := range rep.Completed {
|
||||
hadErr = hadErr || fsr.SkipReason.NotSkipped() && fsr.LastError != ""
|
||||
}
|
||||
pruner.state = nextState
|
||||
return
|
||||
if hadErr {
|
||||
p.state = ExecErr
|
||||
} else {
|
||||
p.state = Done
|
||||
}
|
||||
})
|
||||
if state != Exec {
|
||||
return state.statefunc()
|
||||
|
||||
|
||||
}
|
||||
|
||||
// attempts to exec pfs, puts it back into the queue with the result
|
||||
func doOneAttemptExec(a *args, u updater, pfs *fs) {
|
||||
|
||||
destroyList := make([]*pdu.FilesystemVersion, len(pfs.destroyList))
|
||||
for i := range destroyList {
|
||||
destroyList[i] = pfs.destroyList[i].(snapshot).fsv
|
||||
@ -566,7 +561,7 @@ func stateExec(a *args, u updater) state {
|
||||
u(func(pruner *Pruner) {
|
||||
pruner.execQueue.Put(pfs, err, false)
|
||||
})
|
||||
return onErr(u, err)
|
||||
return
|
||||
}
|
||||
// check if all snapshots were destroyed
|
||||
destroyResults := make(map[string]*pdu.DestroySnapshotRes)
|
||||
@ -607,31 +602,6 @@ func stateExec(a *args, u updater) state {
|
||||
})
|
||||
if err != nil {
|
||||
GetLogger(a.ctx).WithError(err).Error("target could not destroy snapshots")
|
||||
return onErr(u, err)
|
||||
}
|
||||
|
||||
return u(func(pruner *Pruner) {
|
||||
pruner.Progress.MadeProgress()
|
||||
}).statefunc()
|
||||
}
|
||||
|
||||
func stateExecWait(a *args, u updater) state {
|
||||
return doWait(Exec, a, u)
|
||||
}
|
||||
|
||||
func statePlanWait(a *args, u updater) state {
|
||||
return doWait(Plan, a, u)
|
||||
}
|
||||
|
||||
func doWait(goback State, a *args, u updater) state {
|
||||
timer := time.NewTimer(a.retryWait)
|
||||
defer timer.Stop()
|
||||
select {
|
||||
case <-timer.C:
|
||||
return u(func(pruner *Pruner) {
|
||||
pruner.state = goback
|
||||
}).statefunc()
|
||||
case <-a.ctx.Done():
|
||||
return onErr(u, a.ctx.Err())
|
||||
return
|
||||
}
|
||||
}
|
||||
|
@ -58,10 +58,7 @@ func (q *execQueue) Pop() *fs {
|
||||
func(q *execQueue) Put(fs *fs, err error, done bool) {
|
||||
fs.mtx.Lock()
|
||||
fs.execErrLast = err
|
||||
if err != nil {
|
||||
fs.execErrCount++
|
||||
}
|
||||
if done || (err != nil && !shouldRetry(fs.execErrLast)) {
|
||||
if done || err != nil {
|
||||
fs.mtx.Unlock()
|
||||
q.mtx.Lock()
|
||||
q.completed = append(q.completed, fs)
|
||||
@ -78,9 +75,6 @@ func(q *execQueue) Put(fs *fs, err error, done bool) {
|
||||
defer q.pending[i].mtx.Unlock()
|
||||
q.pending[j].mtx.Lock()
|
||||
defer q.pending[j].mtx.Unlock()
|
||||
if q.pending[i].execErrCount != q.pending[j].execErrCount {
|
||||
return q.pending[i].execErrCount < q.pending[j].execErrCount
|
||||
}
|
||||
return strings.Compare(q.pending[i].path, q.pending[j].path) == -1
|
||||
})
|
||||
q.mtx.Unlock()
|
||||
|
@ -1,206 +0,0 @@
|
||||
package pruner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/pruning"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
type mockFS struct {
|
||||
path string
|
||||
snaps []string
|
||||
}
|
||||
|
||||
func (m *mockFS) Filesystem() *pdu.Filesystem {
|
||||
return &pdu.Filesystem{
|
||||
Path: m.path,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *mockFS) FilesystemVersions() []*pdu.FilesystemVersion {
|
||||
versions := make([]*pdu.FilesystemVersion, len(m.snaps))
|
||||
for i, v := range m.snaps {
|
||||
versions[i] = &pdu.FilesystemVersion{
|
||||
Type: pdu.FilesystemVersion_Snapshot,
|
||||
Name: v,
|
||||
Creation: pdu.FilesystemVersionCreation(time.Unix(0, 0)),
|
||||
Guid: uint64(i),
|
||||
}
|
||||
}
|
||||
return versions
|
||||
}
|
||||
|
||||
type mockTarget struct {
|
||||
fss []mockFS
|
||||
destroyed map[string][]string
|
||||
listVersionsErrs map[string][]error
|
||||
listFilesystemsErr []error
|
||||
destroyErrs map[string][]error
|
||||
}
|
||||
|
||||
func (t *mockTarget) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) {
|
||||
if len(t.listFilesystemsErr) > 0 {
|
||||
e := t.listFilesystemsErr[0]
|
||||
t.listFilesystemsErr = t.listFilesystemsErr[1:]
|
||||
return nil, e
|
||||
}
|
||||
fss := make([]*pdu.Filesystem, len(t.fss))
|
||||
for i := range fss {
|
||||
fss[i] = t.fss[i].Filesystem()
|
||||
}
|
||||
return &pdu.ListFilesystemRes{Filesystems: fss}, nil
|
||||
}
|
||||
|
||||
func (t *mockTarget) ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error) {
|
||||
fs := req.Filesystem
|
||||
if len(t.listVersionsErrs[fs]) != 0 {
|
||||
e := t.listVersionsErrs[fs][0]
|
||||
t.listVersionsErrs[fs] = t.listVersionsErrs[fs][1:]
|
||||
return nil, e
|
||||
}
|
||||
|
||||
for _, mfs := range t.fss {
|
||||
if mfs.path != fs {
|
||||
continue
|
||||
}
|
||||
return &pdu.ListFilesystemVersionsRes{Versions: mfs.FilesystemVersions()}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("filesystem %s does not exist", fs)
|
||||
}
|
||||
|
||||
func (t *mockTarget) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error) {
|
||||
fs, snaps := req.Filesystem, req.Snapshots
|
||||
if len(t.destroyErrs[fs]) != 0 {
|
||||
e := t.destroyErrs[fs][0]
|
||||
t.destroyErrs[fs] = t.destroyErrs[fs][1:]
|
||||
return nil, e
|
||||
}
|
||||
destroyed := t.destroyed[fs]
|
||||
res := make([]*pdu.DestroySnapshotRes, len(snaps))
|
||||
for i, s := range snaps {
|
||||
destroyed = append(destroyed, s.Name)
|
||||
res[i] = &pdu.DestroySnapshotRes{Error: "", Snapshot: s}
|
||||
}
|
||||
t.destroyed[fs] = destroyed
|
||||
return &pdu.DestroySnapshotsRes{Results: res}, nil
|
||||
}
|
||||
|
||||
type mockCursor struct {
|
||||
snapname string
|
||||
guid uint64
|
||||
}
|
||||
type mockHistory struct {
|
||||
errs map[string][]error
|
||||
cursors map[string]*mockCursor
|
||||
}
|
||||
|
||||
func (r *mockHistory) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
|
||||
fs := req.Filesystem
|
||||
if len(r.errs[fs]) > 0 {
|
||||
e := r.errs[fs][0]
|
||||
r.errs[fs] = r.errs[fs][1:]
|
||||
return nil, e
|
||||
}
|
||||
return &pdu.ReplicationCursorRes{Result: &pdu.ReplicationCursorRes_Guid{Guid: 0}}, nil
|
||||
}
|
||||
|
||||
type stubNetErr struct {
|
||||
msg string
|
||||
temporary, timeout bool
|
||||
}
|
||||
|
||||
var _ net.Error = stubNetErr{}
|
||||
|
||||
func (e stubNetErr) Error() string {
|
||||
return e.msg
|
||||
}
|
||||
|
||||
func (e stubNetErr) Temporary() bool { return e.temporary }
|
||||
|
||||
func (e stubNetErr) Timeout() bool { return e.timeout }
|
||||
|
||||
func TestPruner_Prune(t *testing.T) {
|
||||
|
||||
var _ net.Error = &net.OpError{} // we use it below
|
||||
target := &mockTarget{
|
||||
listFilesystemsErr: []error{
|
||||
stubNetErr{msg: "fakerror0", temporary: true},
|
||||
},
|
||||
listVersionsErrs: map[string][]error{
|
||||
"zroot/foo": {
|
||||
stubNetErr{msg: "fakeerror1", temporary: true},
|
||||
stubNetErr{msg: "fakeerror2", temporary: true,},
|
||||
},
|
||||
},
|
||||
destroyErrs: map[string][]error{
|
||||
"zroot/baz": {
|
||||
stubNetErr{msg: "fakeerror3", temporary: true}, // first error puts it back in the queue
|
||||
stubNetErr{msg:"permanent error"}, // so it will be last when pruner gives up due to permanent err
|
||||
},
|
||||
},
|
||||
destroyed: make(map[string][]string),
|
||||
fss: []mockFS{
|
||||
{
|
||||
path: "zroot/foo",
|
||||
snaps: []string{
|
||||
"keep_a",
|
||||
"keep_b",
|
||||
"drop_c",
|
||||
"keep_d",
|
||||
},
|
||||
},
|
||||
{
|
||||
path: "zroot/bar",
|
||||
snaps: []string{
|
||||
"keep_e",
|
||||
"keep_f",
|
||||
"drop_g",
|
||||
},
|
||||
},
|
||||
{
|
||||
path: "zroot/baz",
|
||||
snaps: []string{
|
||||
"keep_h",
|
||||
"drop_i",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
history := &mockHistory{
|
||||
errs: map[string][]error{
|
||||
"zroot/foo": {
|
||||
stubNetErr{msg: "fakeerror4", temporary: true},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
keepRules := []pruning.KeepRule{pruning.MustKeepRegex("^keep", false)}
|
||||
|
||||
p := Pruner{
|
||||
args: args{
|
||||
ctx: WithLogger(context.Background(), logger.NewTestLogger(t)),
|
||||
target: target,
|
||||
receiver: history,
|
||||
rules: keepRules,
|
||||
retryWait: 10*time.Millisecond,
|
||||
},
|
||||
state: Plan,
|
||||
}
|
||||
p.Prune()
|
||||
|
||||
exp := map[string][]string{
|
||||
"zroot/foo": {"drop_c"},
|
||||
"zroot/bar": {"drop_g"},
|
||||
}
|
||||
|
||||
assert.Equal(t, exp, target.destroyed)
|
||||
|
||||
//assert.Equal(t, map[string][]error{}, target.listVersionsErrs, "retried")
|
||||
|
||||
}
|
@ -7,19 +7,17 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
_StateName_0 = "PlanPlanWait"
|
||||
_StateName_0 = "PlanPlanErr"
|
||||
_StateName_1 = "Exec"
|
||||
_StateName_2 = "ExecWait"
|
||||
_StateName_3 = "ErrPerm"
|
||||
_StateName_4 = "Done"
|
||||
_StateName_2 = "ExecErr"
|
||||
_StateName_3 = "Done"
|
||||
)
|
||||
|
||||
var (
|
||||
_StateIndex_0 = [...]uint8{0, 4, 12}
|
||||
_StateIndex_0 = [...]uint8{0, 4, 11}
|
||||
_StateIndex_1 = [...]uint8{0, 4}
|
||||
_StateIndex_2 = [...]uint8{0, 8}
|
||||
_StateIndex_3 = [...]uint8{0, 7}
|
||||
_StateIndex_4 = [...]uint8{0, 4}
|
||||
_StateIndex_2 = [...]uint8{0, 7}
|
||||
_StateIndex_3 = [...]uint8{0, 4}
|
||||
)
|
||||
|
||||
func (i State) String() string {
|
||||
@ -33,22 +31,19 @@ func (i State) String() string {
|
||||
return _StateName_2
|
||||
case i == 16:
|
||||
return _StateName_3
|
||||
case i == 32:
|
||||
return _StateName_4
|
||||
default:
|
||||
return fmt.Sprintf("State(%d)", i)
|
||||
}
|
||||
}
|
||||
|
||||
var _StateValues = []State{1, 2, 4, 8, 16, 32}
|
||||
var _StateValues = []State{1, 2, 4, 8, 16}
|
||||
|
||||
var _StateNameToValueMap = map[string]State{
|
||||
_StateName_0[0:4]: 1,
|
||||
_StateName_0[4:12]: 2,
|
||||
_StateName_0[4:11]: 2,
|
||||
_StateName_1[0:4]: 4,
|
||||
_StateName_2[0:8]: 8,
|
||||
_StateName_3[0:7]: 16,
|
||||
_StateName_4[0:4]: 32,
|
||||
_StateName_2[0:7]: 8,
|
||||
_StateName_3[0:4]: 16,
|
||||
}
|
||||
|
||||
// StateString retrieves an enum value from the enum constants string name.
|
||||
|
@ -232,7 +232,8 @@ Job Type ``pull``
|
||||
- ZFS dataset path are received to
|
||||
``$root_fs/$client_identity``
|
||||
* - ``interval``
|
||||
- Interval at which to pull from the source job
|
||||
- | Interval at which to pull from the source job (e.g. ``10m``).
|
||||
| ``manual`` disables periodic pulling, replication then only happens on :ref:`wakeup <cli-signal-wakeup>`.
|
||||
* - ``pruning``
|
||||
- |pruning-spec|
|
||||
|
||||
|
@ -13,6 +13,8 @@ CLI Overview
|
||||
The zrepl binary is self-documenting:
|
||||
run ``zrepl help`` for an overview of the available subcommands or ``zrepl SUBCOMMAND --help`` for information on available flags, etc.
|
||||
|
||||
.. _cli-signal-wakeup:
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 70
|
||||
:header-rows: 1
|
||||
|
@ -7,8 +7,7 @@ import (
|
||||
"path"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/zrepl/zrepl/replication"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
)
|
||||
|
||||
@ -34,7 +33,7 @@ func (s *Sender) filterCheckFS(fs string) (*zfs.DatasetPath, error) {
|
||||
return nil, err
|
||||
}
|
||||
if !pass {
|
||||
return nil, replication.NewFilteredError(fs)
|
||||
return nil, fmt.Errorf("endpoint does not allow access to filesystem %s", fs)
|
||||
}
|
||||
return dp, nil
|
||||
}
|
||||
@ -49,9 +48,10 @@ func (s *Sender) ListFilesystems(ctx context.Context, r *pdu.ListFilesystemReq)
|
||||
rfss[i] = &pdu.Filesystem{
|
||||
Path: fss[i].ToString(),
|
||||
// FIXME: not supporting ResumeToken yet
|
||||
IsPlaceholder: false, // sender FSs are never placeholders
|
||||
}
|
||||
}
|
||||
res := &pdu.ListFilesystemRes{Filesystems: rfss, Empty: len(rfss) == 0}
|
||||
res := &pdu.ListFilesystemRes{Filesystems: rfss}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
@ -108,6 +108,21 @@ func (p *Sender) DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshots
|
||||
return doDestroySnapshots(ctx, dp, req.Snapshots)
|
||||
}
|
||||
|
||||
func (p *Sender) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
res := pdu.PingRes{
|
||||
Echo: req.GetMessage(),
|
||||
}
|
||||
return &res, nil
|
||||
}
|
||||
|
||||
func (p *Sender) PingDataconn(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
return p.Ping(ctx, req)
|
||||
}
|
||||
|
||||
func (p *Sender) WaitForConnectivity(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Sender) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
|
||||
dp, err := p.filterCheckFS(req.Filesystem)
|
||||
if err != nil {
|
||||
@ -229,7 +244,7 @@ func (s *Receiver) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemR
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// present without prefix, and only those that are not placeholders
|
||||
// present filesystem without the root_fs prefix
|
||||
fss := make([]*pdu.Filesystem, 0, len(filtered))
|
||||
for _, a := range filtered {
|
||||
ph, err := zfs.ZFSIsPlaceholderFilesystem(a)
|
||||
@ -240,21 +255,16 @@ func (s *Receiver) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemR
|
||||
Error("inconsistent placeholder property")
|
||||
return nil, errors.New("server error: inconsistent placeholder property") // don't leak path
|
||||
}
|
||||
if ph {
|
||||
getLogger(ctx).
|
||||
WithField("fs", a.ToString()).
|
||||
Debug("ignoring placeholder filesystem")
|
||||
continue
|
||||
}
|
||||
getLogger(ctx).
|
||||
WithField("fs", a.ToString()).
|
||||
Debug("non-placeholder filesystem")
|
||||
WithField("is_placeholder", ph).
|
||||
Debug("filesystem")
|
||||
a.TrimPrefix(root)
|
||||
fss = append(fss, &pdu.Filesystem{Path: a.ToString()})
|
||||
fss = append(fss, &pdu.Filesystem{Path: a.ToString(), IsPlaceholder: ph})
|
||||
}
|
||||
if len(fss) == 0 {
|
||||
getLogger(ctx).Debug("no non-placeholder filesystems")
|
||||
return &pdu.ListFilesystemRes{Empty: true}, nil
|
||||
getLogger(ctx).Debug("no filesystems found")
|
||||
return &pdu.ListFilesystemRes{}, nil
|
||||
}
|
||||
return &pdu.ListFilesystemRes{Filesystems: fss}, nil
|
||||
}
|
||||
@ -279,6 +289,21 @@ func (s *Receiver) ListFilesystemVersions(ctx context.Context, req *pdu.ListFile
|
||||
return &pdu.ListFilesystemVersionsRes{Versions: rfsvs}, nil
|
||||
}
|
||||
|
||||
func (s *Receiver) Ping(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
res := pdu.PingRes{
|
||||
Echo: req.GetMessage(),
|
||||
}
|
||||
return &res, nil
|
||||
}
|
||||
|
||||
func (s *Receiver) PingDataconn(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
return s.Ping(ctx, req)
|
||||
}
|
||||
|
||||
func (s *Receiver) WaitForConnectivity(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Receiver) ReplicationCursor(context.Context, *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
|
||||
return nil, fmt.Errorf("ReplicationCursor not implemented for Receiver")
|
||||
}
|
||||
@ -324,28 +349,30 @@ func (s *Receiver) Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs
|
||||
getLogger(ctx).WithField("visitErr", visitErr).Debug("complete tree-walk")
|
||||
|
||||
if visitErr != nil {
|
||||
return nil, err
|
||||
return nil, visitErr
|
||||
}
|
||||
|
||||
needForceRecv := false
|
||||
var clearPlaceholderProperty bool
|
||||
var recvOpts zfs.RecvOptions
|
||||
props, err := zfs.ZFSGet(lp, []string{zfs.ZREPL_PLACEHOLDER_PROPERTY_NAME})
|
||||
if err == nil {
|
||||
if isPlaceholder, _ := zfs.IsPlaceholder(lp, props.Get(zfs.ZREPL_PLACEHOLDER_PROPERTY_NAME)); isPlaceholder {
|
||||
needForceRecv = true
|
||||
recvOpts.RollbackAndForceRecv = true
|
||||
clearPlaceholderProperty = true
|
||||
}
|
||||
}
|
||||
if clearPlaceholderProperty {
|
||||
if err := zfs.ZFSSetNoPlaceholder(lp); err != nil {
|
||||
return nil, fmt.Errorf("cannot clear placeholder property for forced receive: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
args := make([]string, 0, 1)
|
||||
if needForceRecv {
|
||||
args = append(args, "-F")
|
||||
}
|
||||
getLogger(ctx).WithField("opts", fmt.Sprintf("%#v", recvOpts)).Debug("start receive command")
|
||||
|
||||
getLogger(ctx).Debug("start receive command")
|
||||
|
||||
if err := zfs.ZFSRecv(ctx, lp.ToString(), receive, args...); err != nil {
|
||||
if err := zfs.ZFSRecv(ctx, lp.ToString(), receive, recvOpts); err != nil {
|
||||
getLogger(ctx).
|
||||
WithError(err).
|
||||
WithField("args", args).
|
||||
WithField("opts", recvOpts).
|
||||
Error("zfs receive failed")
|
||||
return nil, err
|
||||
}
|
||||
|
50
replication/driver/errorclass_enumer.go
Normal file
50
replication/driver/errorclass_enumer.go
Normal file
@ -0,0 +1,50 @@
|
||||
// Code generated by "enumer -type=errorClass"; DO NOT EDIT.
|
||||
|
||||
package driver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const _errorClassName = "errorClassUnknownerrorClassPermanenterrorClassTemporaryConnectivityRelated"
|
||||
|
||||
var _errorClassIndex = [...]uint8{0, 17, 36, 74}
|
||||
|
||||
func (i errorClass) String() string {
|
||||
if i < 0 || i >= errorClass(len(_errorClassIndex)-1) {
|
||||
return fmt.Sprintf("errorClass(%d)", i)
|
||||
}
|
||||
return _errorClassName[_errorClassIndex[i]:_errorClassIndex[i+1]]
|
||||
}
|
||||
|
||||
var _errorClassValues = []errorClass{0, 1, 2}
|
||||
|
||||
var _errorClassNameToValueMap = map[string]errorClass{
|
||||
_errorClassName[0:17]: 0,
|
||||
_errorClassName[17:36]: 1,
|
||||
_errorClassName[36:74]: 2,
|
||||
}
|
||||
|
||||
// errorClassString retrieves an enum value from the enum constants string name.
|
||||
// Throws an error if the param is not part of the enum.
|
||||
func errorClassString(s string) (errorClass, error) {
|
||||
if val, ok := _errorClassNameToValueMap[s]; ok {
|
||||
return val, nil
|
||||
}
|
||||
return 0, fmt.Errorf("%s does not belong to errorClass values", s)
|
||||
}
|
||||
|
||||
// errorClassValues returns all values of the enum
|
||||
func errorClassValues() []errorClass {
|
||||
return _errorClassValues
|
||||
}
|
||||
|
||||
// IsAerrorClass returns "true" if the value is listed in the enum definition. "false" otherwise
|
||||
func (i errorClass) IsAerrorClass() bool {
|
||||
for _, v := range _errorClassValues {
|
||||
if i == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
638
replication/driver/replication_driver.go
Normal file
638
replication/driver/replication_driver.go
Normal file
@ -0,0 +1,638 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/zrepl/zrepl/replication/report"
|
||||
"github.com/zrepl/zrepl/util/chainlock"
|
||||
"github.com/zrepl/zrepl/util/envconst"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
type interval struct {
|
||||
begin time.Time
|
||||
end time.Time
|
||||
}
|
||||
|
||||
func (w *interval) SetZero() {
|
||||
w.begin = time.Time{}
|
||||
w.end = time.Time{}
|
||||
}
|
||||
|
||||
// Duration of 0 means indefinite length
|
||||
func (w *interval) Set(begin time.Time, duration time.Duration) {
|
||||
if begin.IsZero() {
|
||||
panic("zero begin time now allowed")
|
||||
}
|
||||
w.begin = begin
|
||||
w.end = begin.Add(duration)
|
||||
}
|
||||
|
||||
// Returns the End of the interval if it has a defined length.
|
||||
// For indefinite lengths, returns the zero value.
|
||||
func (w *interval) End() time.Time {
|
||||
return w.end
|
||||
}
|
||||
|
||||
// Return a context with a deadline at the interval's end.
|
||||
// If the interval has indefinite length (duration 0 on Set), return ctx as is.
|
||||
// The returned context.CancelFunc can be called either way.
|
||||
func (w *interval) ContextWithDeadlineAtEnd(ctx context.Context) (context.Context, context.CancelFunc) {
|
||||
if w.begin.IsZero() {
|
||||
panic("must call Set before ContextWIthDeadlineAtEnd")
|
||||
}
|
||||
if w.end.IsZero() {
|
||||
// indefinite length, just return context as is
|
||||
return ctx, func() {}
|
||||
} else {
|
||||
return context.WithDeadline(ctx, w.end)
|
||||
}
|
||||
}
|
||||
|
||||
type run struct {
|
||||
l *chainlock.L
|
||||
|
||||
startedAt, finishedAt time.Time
|
||||
|
||||
waitReconnect interval
|
||||
waitReconnectError *timedError
|
||||
|
||||
// the attempts attempted so far:
|
||||
// All but the last in this slice must have finished with some errors.
|
||||
// The last attempt may not be finished and may not have errors.
|
||||
attempts []*attempt
|
||||
}
|
||||
|
||||
type Planner interface {
|
||||
Plan(context.Context) ([]FS, error)
|
||||
WaitForConnectivity(context.Context) error
|
||||
}
|
||||
|
||||
// an attempt represents a single planning & execution of fs replications
|
||||
type attempt struct {
|
||||
planner Planner
|
||||
|
||||
l *chainlock.L
|
||||
|
||||
startedAt, finishedAt time.Time
|
||||
|
||||
// after Planner.Plan was called, planErr and fss are mutually exclusive with regards to nil-ness
|
||||
// if both are nil, it must be assumed that Planner.Plan is active
|
||||
planErr *timedError
|
||||
fss []*fs
|
||||
}
|
||||
|
||||
type timedError struct {
|
||||
Err error
|
||||
Time time.Time
|
||||
}
|
||||
|
||||
func newTimedError(err error, t time.Time) *timedError {
|
||||
if err == nil {
|
||||
panic("error must be non-nil")
|
||||
}
|
||||
if t.IsZero() {
|
||||
panic("t must be non-zero")
|
||||
}
|
||||
return &timedError{err, t}
|
||||
}
|
||||
|
||||
func (e *timedError) IntoReportError() *report.TimedError {
|
||||
if e == nil {
|
||||
return nil
|
||||
}
|
||||
return report.NewTimedError(e.Err.Error(), e.Time)
|
||||
}
|
||||
|
||||
type FS interface {
|
||||
// Returns true if this FS and fs refer to the same filesystem returned
|
||||
// by Planner.Plan in a previous attempt.
|
||||
EqualToPreviousAttempt(fs FS) bool
|
||||
// The returned steps are assumed to be dependent on exactly
|
||||
// their direct predecessors in the returned list.
|
||||
PlanFS(context.Context) ([]Step, error)
|
||||
ReportInfo() *report.FilesystemInfo
|
||||
}
|
||||
|
||||
type Step interface {
|
||||
// Returns true iff the target snapshot is the same for this Step and other.
|
||||
// We do not use TargetDate to avoid problems with wrong system time on
|
||||
// snapshot creation.
|
||||
//
|
||||
// Implementations can assume that `other` is a step of the same filesystem,
|
||||
// although maybe from a previous attempt.
|
||||
// (`same` as defined by FS.EqualToPreviousAttempt)
|
||||
//
|
||||
// Note that TargetEquals should return true in a situation with one
|
||||
// originally sent snapshot and a subsequent attempt's step that uses
|
||||
// resumable send & recv.
|
||||
TargetEquals(other Step) bool
|
||||
TargetDate() time.Time
|
||||
Step(context.Context) error
|
||||
ReportInfo() *report.StepInfo
|
||||
}
|
||||
|
||||
type fs struct {
|
||||
fs FS
|
||||
|
||||
l *chainlock.L
|
||||
|
||||
planning struct {
|
||||
done bool
|
||||
err *timedError
|
||||
}
|
||||
|
||||
// valid iff planning.done && planning.err == nil
|
||||
planned struct {
|
||||
// valid iff planning.done && planning.err == nil
|
||||
stepErr *timedError
|
||||
// all steps, in the order in which they must be completed
|
||||
steps []*step
|
||||
// index into steps, pointing at the step that is currently executing
|
||||
// if step >= len(steps), no more work needs to be done
|
||||
step int
|
||||
}
|
||||
}
|
||||
|
||||
type step struct {
|
||||
l *chainlock.L
|
||||
step Step
|
||||
}
|
||||
|
||||
type ReportFunc func() *report.Report
|
||||
type WaitFunc func(block bool) (done bool)
|
||||
|
||||
var maxAttempts = envconst.Int64("ZREPL_REPLICATION_MAX_ATTEMPTS", 3)
|
||||
var reconnectHardFailTimeout = envconst.Duration("ZREPL_REPLICATION_RECONNECT_HARD_FAIL_TIMEOUT", 10*time.Minute)
|
||||
|
||||
func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
|
||||
log := getLog(ctx)
|
||||
l := chainlock.New()
|
||||
run := &run{
|
||||
l: l,
|
||||
startedAt: time.Now(),
|
||||
}
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(done)
|
||||
|
||||
defer run.l.Lock().Unlock()
|
||||
log.Debug("begin run")
|
||||
defer log.Debug("run ended")
|
||||
var prev *attempt
|
||||
mainLog := log
|
||||
for ano := 0; ano < int(maxAttempts) || maxAttempts == 0; ano++ {
|
||||
log := mainLog.WithField("attempt_number", ano)
|
||||
log.Debug("start attempt")
|
||||
|
||||
run.waitReconnect.SetZero()
|
||||
run.waitReconnectError = nil
|
||||
|
||||
// do current attempt
|
||||
cur := &attempt{
|
||||
l: l,
|
||||
startedAt: time.Now(),
|
||||
planner: planner,
|
||||
}
|
||||
run.attempts = append(run.attempts, cur)
|
||||
run.l.DropWhile(func() {
|
||||
cur.do(ctx, prev)
|
||||
})
|
||||
prev = cur
|
||||
if ctx.Err() != nil {
|
||||
log.WithError(ctx.Err()).Info("context error")
|
||||
return
|
||||
}
|
||||
|
||||
// error classification, bail out if done / permanent error
|
||||
rep := cur.report()
|
||||
log.WithField("attempt_state", rep.State).Debug("attempt state")
|
||||
errRep := cur.errorReport()
|
||||
|
||||
if rep.State == report.AttemptDone {
|
||||
log.Debug("attempt completed successfully")
|
||||
break
|
||||
}
|
||||
|
||||
mostRecentErr, mostRecentErrClass := errRep.MostRecent()
|
||||
log.WithField("most_recent_err", mostRecentErr).WithField("most_recent_err_class", mostRecentErrClass).Debug("most recent error used for re-connect decision")
|
||||
if mostRecentErr == nil {
|
||||
// inconsistent reporting, let's bail out
|
||||
log.Warn("attempt does not report done but error report does not report errors, aborting run")
|
||||
break
|
||||
}
|
||||
log.WithError(mostRecentErr.Err).Error("most recent error in this attempt")
|
||||
shouldReconnect := mostRecentErrClass == errorClassTemporaryConnectivityRelated
|
||||
log.WithField("reconnect_decision", shouldReconnect).Debug("reconnect decision made")
|
||||
if shouldReconnect {
|
||||
run.waitReconnect.Set(time.Now(), reconnectHardFailTimeout)
|
||||
log.WithField("deadline", run.waitReconnect.End()).Error("temporary connectivity-related error identified, start waiting for reconnect")
|
||||
var connectErr error
|
||||
var connectErrTime time.Time
|
||||
run.l.DropWhile(func() {
|
||||
ctx, cancel := run.waitReconnect.ContextWithDeadlineAtEnd(ctx)
|
||||
defer cancel()
|
||||
connectErr = planner.WaitForConnectivity(ctx)
|
||||
connectErrTime = time.Now()
|
||||
})
|
||||
if connectErr == nil {
|
||||
log.Error("reconnect successful") // same level as 'begin with reconnect' message above
|
||||
continue
|
||||
} else {
|
||||
run.waitReconnectError = newTimedError(connectErr, connectErrTime)
|
||||
log.WithError(connectErr).Error("reconnecting failed, aborting run")
|
||||
break
|
||||
}
|
||||
} else {
|
||||
log.Error("most recent error cannot be solved by reconnecting, aborting run")
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}()
|
||||
|
||||
wait := func(block bool) bool {
|
||||
if block {
|
||||
<-done
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
report := func() *report.Report {
|
||||
defer run.l.Lock().Unlock()
|
||||
return run.report()
|
||||
}
|
||||
return report, wait
|
||||
}
|
||||
|
||||
func (a *attempt) do(ctx context.Context, prev *attempt) {
|
||||
pfss, err := a.planner.Plan(ctx)
|
||||
errTime := time.Now()
|
||||
defer a.l.Lock().Unlock()
|
||||
if err != nil {
|
||||
a.planErr = newTimedError(err, errTime)
|
||||
a.fss = nil
|
||||
a.finishedAt = time.Now()
|
||||
return
|
||||
}
|
||||
|
||||
for _, pfs := range pfss {
|
||||
fs := &fs{
|
||||
fs: pfs,
|
||||
l: a.l,
|
||||
}
|
||||
a.fss = append(a.fss, fs)
|
||||
}
|
||||
|
||||
prevs := make(map[*fs]*fs)
|
||||
{
|
||||
prevFSs := make(map[*fs][]*fs, len(pfss))
|
||||
if prev != nil {
|
||||
debug("previous attempt has %d fss", len(a.fss))
|
||||
for _, fs := range a.fss {
|
||||
for _, prevFS := range prev.fss {
|
||||
if fs.fs.EqualToPreviousAttempt(prevFS.fs) {
|
||||
l := prevFSs[fs]
|
||||
l = append(l, prevFS)
|
||||
prevFSs[fs] = l
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
type inconsistency struct {
|
||||
cur *fs
|
||||
prevs []*fs
|
||||
}
|
||||
var inconsistencies []inconsistency
|
||||
for cur, fss := range prevFSs {
|
||||
if len(fss) > 1 {
|
||||
inconsistencies = append(inconsistencies, inconsistency{cur, fss})
|
||||
}
|
||||
}
|
||||
sort.SliceStable(inconsistencies, func(i, j int) bool {
|
||||
return inconsistencies[i].cur.fs.ReportInfo().Name < inconsistencies[j].cur.fs.ReportInfo().Name
|
||||
})
|
||||
if len(inconsistencies) > 0 {
|
||||
var msg strings.Builder
|
||||
msg.WriteString("cannot determine filesystem correspondences between different attempts:\n")
|
||||
var inconsistencyLines []string
|
||||
for _, i := range inconsistencies {
|
||||
var prevNames []string
|
||||
for _, prev := range i.prevs {
|
||||
prevNames = append(prevNames, prev.fs.ReportInfo().Name)
|
||||
}
|
||||
l := fmt.Sprintf(" %s => %v", i.cur.fs.ReportInfo().Name, prevNames)
|
||||
inconsistencyLines = append(inconsistencyLines, l)
|
||||
}
|
||||
fmt.Fprintf(&msg, strings.Join(inconsistencyLines, "\n"))
|
||||
now := time.Now()
|
||||
a.planErr = newTimedError(errors.New(msg.String()), now)
|
||||
a.fss = nil
|
||||
a.finishedAt = now
|
||||
return
|
||||
}
|
||||
for cur, fss := range prevFSs {
|
||||
if len(fss) > 0 {
|
||||
prevs[cur] = fss[0]
|
||||
}
|
||||
}
|
||||
}
|
||||
// invariant: prevs contains an entry for each unambigious correspondence
|
||||
|
||||
stepQueue := newStepQueue()
|
||||
defer stepQueue.Start(1)() // TODO parallel replication
|
||||
var fssesDone sync.WaitGroup
|
||||
for _, f := range a.fss {
|
||||
fssesDone.Add(1)
|
||||
go func(f *fs) {
|
||||
defer fssesDone.Done()
|
||||
f.do(ctx, stepQueue, prevs[f])
|
||||
}(f)
|
||||
}
|
||||
a.l.DropWhile(func() {
|
||||
fssesDone.Wait()
|
||||
})
|
||||
a.finishedAt = time.Now()
|
||||
}
|
||||
|
||||
func (fs *fs) do(ctx context.Context, pq *stepQueue, prev *fs) {
|
||||
psteps, err := fs.fs.PlanFS(ctx)
|
||||
errTime := time.Now()
|
||||
defer fs.l.Lock().Unlock()
|
||||
debug := debugPrefix("fs=%s", fs.fs.ReportInfo().Name)
|
||||
fs.planning.done = true
|
||||
if err != nil {
|
||||
fs.planning.err = newTimedError(err, errTime)
|
||||
return
|
||||
}
|
||||
for _, pstep := range psteps {
|
||||
step := &step{
|
||||
l: fs.l,
|
||||
step: pstep,
|
||||
}
|
||||
fs.planned.steps = append(fs.planned.steps, step)
|
||||
}
|
||||
debug("iniital len(fs.planned.steps) = %d", len(fs.planned.steps))
|
||||
|
||||
// for not-first attempts, only allow fs.planned.steps
|
||||
// up to including the originally planned target snapshot
|
||||
if prev != nil && prev.planning.done && prev.planning.err == nil {
|
||||
prevUncompleted := prev.planned.steps[prev.planned.step:]
|
||||
if len(prevUncompleted) == 0 {
|
||||
debug("prevUncompleted is empty")
|
||||
return
|
||||
}
|
||||
if len(fs.planned.steps) == 0 {
|
||||
debug("fs.planned.steps is empty")
|
||||
return
|
||||
}
|
||||
prevFailed := prevUncompleted[0]
|
||||
curFirst := fs.planned.steps[0]
|
||||
// we assume that PlanFS retries prevFailed (using curFirst)
|
||||
if !prevFailed.step.TargetEquals(curFirst.step) {
|
||||
debug("Targets don't match")
|
||||
// Two options:
|
||||
// A: planning algorithm is broken
|
||||
// B: manual user intervention inbetween
|
||||
// Neither way will we make progress, so let's error out
|
||||
stepFmt := func(step *step) string {
|
||||
r := step.report()
|
||||
s := r.Info
|
||||
if r.IsIncremental() {
|
||||
return fmt.Sprintf("%s=>%s", s.From, s.To)
|
||||
} else {
|
||||
return fmt.Sprintf("full=>%s", s.To)
|
||||
}
|
||||
}
|
||||
msg := fmt.Sprintf("last attempt's uncompleted step %s does not correspond to this attempt's first planned step %s",
|
||||
stepFmt(prevFailed), stepFmt(curFirst))
|
||||
fs.planned.stepErr = newTimedError(errors.New(msg), time.Now())
|
||||
return
|
||||
}
|
||||
// only allow until step targets diverge
|
||||
min := len(prevUncompleted)
|
||||
if min > len(fs.planned.steps) {
|
||||
min = len(fs.planned.steps)
|
||||
}
|
||||
diverge := 0
|
||||
for ; diverge < min; diverge++ {
|
||||
debug("diverge compare iteration %d", diverge)
|
||||
if !fs.planned.steps[diverge].step.TargetEquals(prevUncompleted[diverge].step) {
|
||||
break
|
||||
}
|
||||
}
|
||||
debug("diverge is %d", diverge)
|
||||
fs.planned.steps = fs.planned.steps[0:diverge]
|
||||
}
|
||||
debug("post-prev-merge len(fs.planned.steps) = %d", len(fs.planned.steps))
|
||||
|
||||
for i, s := range fs.planned.steps {
|
||||
var (
|
||||
err error
|
||||
errTime time.Time
|
||||
)
|
||||
// lock must not be held while executing step in order for reporting to work
|
||||
fs.l.DropWhile(func() {
|
||||
targetDate := s.step.TargetDate()
|
||||
defer pq.WaitReady(fs, targetDate)()
|
||||
err = s.step.Step(ctx) // no shadow
|
||||
errTime = time.Now() // no shadow
|
||||
})
|
||||
if err != nil {
|
||||
fs.planned.stepErr = newTimedError(err, errTime)
|
||||
break
|
||||
}
|
||||
fs.planned.step = i + 1 // fs.planned.step must be == len(fs.planned.steps) if all went OK
|
||||
}
|
||||
}
|
||||
|
||||
// caller must hold lock l
|
||||
func (r *run) report() *report.Report {
|
||||
report := &report.Report{
|
||||
Attempts: make([]*report.AttemptReport, len(r.attempts)),
|
||||
StartAt: r.startedAt,
|
||||
FinishAt: r.finishedAt,
|
||||
WaitReconnectSince: r.waitReconnect.begin,
|
||||
WaitReconnectUntil: r.waitReconnect.end,
|
||||
WaitReconnectError: r.waitReconnectError.IntoReportError(),
|
||||
}
|
||||
for i := range report.Attempts {
|
||||
report.Attempts[i] = r.attempts[i].report()
|
||||
}
|
||||
return report
|
||||
}
|
||||
|
||||
// caller must hold lock l
|
||||
func (a *attempt) report() *report.AttemptReport {
|
||||
|
||||
r := &report.AttemptReport{
|
||||
// State is set below
|
||||
Filesystems: make([]*report.FilesystemReport, len(a.fss)),
|
||||
StartAt: a.startedAt,
|
||||
FinishAt: a.finishedAt,
|
||||
PlanError: a.planErr.IntoReportError(),
|
||||
}
|
||||
|
||||
for i := range r.Filesystems {
|
||||
r.Filesystems[i] = a.fss[i].report()
|
||||
}
|
||||
|
||||
state := report.AttemptPlanning
|
||||
if a.planErr != nil {
|
||||
state = report.AttemptPlanningError
|
||||
} else if a.fss != nil {
|
||||
if a.finishedAt.IsZero() {
|
||||
state = report.AttemptFanOutFSs
|
||||
} else {
|
||||
fsWithError := false
|
||||
for _, s := range r.Filesystems {
|
||||
fsWithError = fsWithError || s.Error() != nil
|
||||
}
|
||||
state = report.AttemptDone
|
||||
if fsWithError {
|
||||
state = report.AttemptFanOutError
|
||||
}
|
||||
}
|
||||
}
|
||||
r.State = state
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
// caller must hold lock l
|
||||
func (f *fs) report() *report.FilesystemReport {
|
||||
state := report.FilesystemPlanningErrored
|
||||
if f.planning.err == nil {
|
||||
if f.planning.done {
|
||||
if f.planned.stepErr != nil {
|
||||
state = report.FilesystemSteppingErrored
|
||||
} else if f.planned.step < len(f.planned.steps) {
|
||||
state = report.FilesystemStepping
|
||||
} else {
|
||||
state = report.FilesystemDone
|
||||
}
|
||||
} else {
|
||||
state = report.FilesystemPlanning
|
||||
}
|
||||
}
|
||||
r := &report.FilesystemReport{
|
||||
Info: f.fs.ReportInfo(),
|
||||
State: state,
|
||||
PlanError: f.planning.err.IntoReportError(),
|
||||
StepError: f.planned.stepErr.IntoReportError(),
|
||||
Steps: make([]*report.StepReport, len(f.planned.steps)),
|
||||
CurrentStep: f.planned.step,
|
||||
}
|
||||
for i := range r.Steps {
|
||||
r.Steps[i] = f.planned.steps[i].report()
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// caller must hold lock l
|
||||
func (s *step) report() *report.StepReport {
|
||||
r := &report.StepReport{
|
||||
Info: s.step.ReportInfo(),
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
type stepErrorReport struct {
|
||||
err *timedError
|
||||
step int
|
||||
}
|
||||
|
||||
//go:generate enumer -type=errorClass
|
||||
type errorClass int
|
||||
|
||||
const (
|
||||
errorClassUnknown errorClass = iota
|
||||
errorClassPermanent
|
||||
errorClassTemporaryConnectivityRelated
|
||||
)
|
||||
|
||||
type errorReport struct {
|
||||
flattened []*timedError
|
||||
// sorted DESCending by err time
|
||||
byClass map[errorClass][]*timedError
|
||||
}
|
||||
|
||||
// caller must hold lock l
|
||||
func (a *attempt) errorReport() *errorReport {
|
||||
r := &errorReport{}
|
||||
if a.planErr != nil {
|
||||
r.flattened = append(r.flattened, a.planErr)
|
||||
}
|
||||
for _, fs := range a.fss {
|
||||
if fs.planning.done && fs.planning.err != nil {
|
||||
r.flattened = append(r.flattened, fs.planning.err)
|
||||
} else if fs.planning.done && fs.planned.stepErr != nil {
|
||||
r.flattened = append(r.flattened, fs.planned.stepErr)
|
||||
}
|
||||
}
|
||||
|
||||
// build byClass
|
||||
{
|
||||
r.byClass = make(map[errorClass][]*timedError)
|
||||
putClass := func(err *timedError, class errorClass) {
|
||||
errs := r.byClass[class]
|
||||
errs = append(errs, err)
|
||||
r.byClass[class] = errs
|
||||
}
|
||||
for _, err := range r.flattened {
|
||||
if neterr, ok := err.Err.(net.Error); ok && neterr.Temporary() {
|
||||
putClass(err, errorClassTemporaryConnectivityRelated)
|
||||
continue
|
||||
}
|
||||
if st, ok := status.FromError(err.Err); ok && st.Code() == codes.Unavailable {
|
||||
// technically, codes.Unavailable could be returned by the gRPC endpoint, indicating overload, etc.
|
||||
// for now, let's assume it only happens for connectivity issues, as specified in
|
||||
// https://grpc.io/grpc/core/md_doc_statuscodes.html
|
||||
putClass(err, errorClassTemporaryConnectivityRelated)
|
||||
continue
|
||||
}
|
||||
putClass(err, errorClassPermanent)
|
||||
}
|
||||
for _, errs := range r.byClass {
|
||||
sort.Slice(errs, func(i, j int) bool {
|
||||
return errs[i].Time.After(errs[j].Time) // sort descendingly
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *errorReport) AnyError() *timedError {
|
||||
for _, err := range r.flattened {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *errorReport) MostRecent() (err *timedError, errClass errorClass) {
|
||||
for class, errs := range r.byClass {
|
||||
// errs are sorted descendingly during construction
|
||||
if len(errs) > 0 && (err == nil || errs[0].Time.After(err.Time)) {
|
||||
err = errs[0]
|
||||
errClass = class
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
29
replication/driver/replication_driver_debug.go
Normal file
29
replication/driver/replication_driver_debug.go
Normal file
@ -0,0 +1,29 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
var debugEnabled bool = false
|
||||
|
||||
func init() {
|
||||
if os.Getenv("ZREPL_REPLICATION_DRIVER_DEBUG") != "" {
|
||||
debugEnabled = true
|
||||
}
|
||||
}
|
||||
|
||||
func debug(format string, args ...interface{}) {
|
||||
if debugEnabled {
|
||||
fmt.Fprintf(os.Stderr, "repl: driver: %s\n", fmt.Sprintf(format, args...))
|
||||
}
|
||||
}
|
||||
|
||||
type debugFunc func(format string, args ...interface{})
|
||||
|
||||
func debugPrefix(prefixFormat string, prefixFormatArgs ...interface{}) debugFunc {
|
||||
prefix := fmt.Sprintf(prefixFormat, prefixFormatArgs...)
|
||||
return func(format string, args ...interface{}) {
|
||||
debug("%s: %s", prefix, fmt.Sprintf(format, args))
|
||||
}
|
||||
}
|
25
replication/driver/replication_driver_logging.go
Normal file
25
replication/driver/replication_driver_logging.go
Normal file
@ -0,0 +1,25 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
)
|
||||
|
||||
type Logger = logger.Logger
|
||||
|
||||
type contexKey int
|
||||
|
||||
const contexKeyLogger contexKey = iota + 1
|
||||
|
||||
func getLog(ctx context.Context) Logger {
|
||||
l, ok := ctx.Value(contexKeyLogger).(Logger)
|
||||
if !ok {
|
||||
l = logger.NewNullLogger()
|
||||
}
|
||||
return l
|
||||
}
|
||||
|
||||
func WithLogger(ctx context.Context, log Logger) context.Context {
|
||||
return context.WithValue(ctx, contexKeyLogger, log)
|
||||
}
|
215
replication/driver/replication_driver_test.go
Normal file
215
replication/driver/replication_driver_test.go
Normal file
@ -0,0 +1,215 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/zrepl/zrepl/replication/report"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
jsondiff "github.com/yudai/gojsondiff"
|
||||
jsondiffformatter "github.com/yudai/gojsondiff/formatter"
|
||||
)
|
||||
|
||||
type mockPlanner struct {
|
||||
stepCounter uint32
|
||||
fss []FS // *mockFS
|
||||
}
|
||||
|
||||
func (p *mockPlanner) Plan(ctx context.Context) ([]FS, error) {
|
||||
time.Sleep(1 * time.Second)
|
||||
p.fss = []FS{
|
||||
&mockFS{
|
||||
&p.stepCounter,
|
||||
"zroot/one",
|
||||
nil,
|
||||
},
|
||||
&mockFS{
|
||||
&p.stepCounter,
|
||||
"zroot/two",
|
||||
nil,
|
||||
},
|
||||
}
|
||||
return p.fss, nil
|
||||
}
|
||||
|
||||
func (p *mockPlanner) WaitForConnectivity(context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type mockFS struct {
|
||||
globalStepCounter *uint32
|
||||
name string
|
||||
steps []Step
|
||||
}
|
||||
|
||||
func (f *mockFS) EqualToPreviousAttempt(other FS) bool {
|
||||
return f.name == other.(*mockFS).name
|
||||
}
|
||||
|
||||
func (f *mockFS) PlanFS(ctx context.Context) ([]Step, error) {
|
||||
if f.steps != nil {
|
||||
panic("PlanFS used twice")
|
||||
}
|
||||
switch f.name {
|
||||
case "zroot/one":
|
||||
f.steps = []Step{
|
||||
&mockStep{
|
||||
fs: f,
|
||||
ident: "a",
|
||||
duration: 1 * time.Second,
|
||||
targetDate: time.Unix(2, 0),
|
||||
},
|
||||
&mockStep{
|
||||
fs: f,
|
||||
ident: "b",
|
||||
duration: 1 * time.Second,
|
||||
targetDate: time.Unix(10, 0),
|
||||
},
|
||||
&mockStep{
|
||||
fs: f,
|
||||
ident: "c",
|
||||
duration: 1 * time.Second,
|
||||
targetDate: time.Unix(20, 0),
|
||||
},
|
||||
}
|
||||
case "zroot/two":
|
||||
f.steps = []Step{
|
||||
&mockStep{
|
||||
fs: f,
|
||||
ident: "u",
|
||||
duration: 500 * time.Millisecond,
|
||||
targetDate: time.Unix(15, 0),
|
||||
},
|
||||
&mockStep{
|
||||
fs: f,
|
||||
duration: 500 * time.Millisecond,
|
||||
ident: "v",
|
||||
targetDate: time.Unix(30, 0),
|
||||
},
|
||||
}
|
||||
default:
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
return f.steps, nil
|
||||
}
|
||||
|
||||
func (f *mockFS) ReportInfo() *report.FilesystemInfo {
|
||||
return &report.FilesystemInfo{Name: f.name}
|
||||
}
|
||||
|
||||
type mockStep struct {
|
||||
fs *mockFS
|
||||
ident string
|
||||
duration time.Duration
|
||||
targetDate time.Time
|
||||
|
||||
// filled by method Step
|
||||
globalCtr uint32
|
||||
}
|
||||
|
||||
func (f *mockStep) String() string {
|
||||
return fmt.Sprintf("%s{%s} targetDate=%s globalCtr=%v", f.fs.name, f.ident, f.targetDate, f.globalCtr)
|
||||
}
|
||||
|
||||
func (f *mockStep) Step(ctx context.Context) error {
|
||||
f.globalCtr = atomic.AddUint32(f.fs.globalStepCounter, 1)
|
||||
time.Sleep(f.duration)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *mockStep) TargetEquals(s Step) bool {
|
||||
return f.ident == s.(*mockStep).ident
|
||||
}
|
||||
|
||||
func (f *mockStep) TargetDate() time.Time {
|
||||
return f.targetDate
|
||||
}
|
||||
|
||||
func (f *mockStep) ReportInfo() *report.StepInfo {
|
||||
return &report.StepInfo{From: f.ident, To: f.ident, BytesExpected: 100, BytesReplicated: 25}
|
||||
}
|
||||
|
||||
// TODO: add meaningful validation (i.e. actual checks)
|
||||
// Since the stepqueue is not deterministic due to scheduler jitter,
|
||||
// we cannot test for any definitive sequence of steps here.
|
||||
// Such checks would further only be sensible for a non-concurrent step-queue,
|
||||
// but we're going to have concurrent replication in the future.
|
||||
//
|
||||
// For the time being, let's just exercise the code a bit.
|
||||
func TestReplication(t *testing.T) {
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
mp := &mockPlanner{}
|
||||
getReport, wait := Do(ctx, mp)
|
||||
begin := time.Now()
|
||||
fireAt := []time.Duration{
|
||||
// the following values are relative to the start
|
||||
500 * time.Millisecond, // planning
|
||||
1500 * time.Millisecond, // nothing is done, a is running
|
||||
2500 * time.Millisecond, // a done, b running
|
||||
3250 * time.Millisecond, // a,b done, u running
|
||||
3750 * time.Millisecond, // a,b,u done, c running
|
||||
4750 * time.Millisecond, // a,b,u,c done, v running
|
||||
5250 * time.Millisecond, // a,b,u,c,v done
|
||||
}
|
||||
reports := make([]*report.Report, len(fireAt))
|
||||
for i := range fireAt {
|
||||
sleepUntil := begin.Add(fireAt[i])
|
||||
time.Sleep(sleepUntil.Sub(time.Now()))
|
||||
reports[i] = getReport()
|
||||
// uncomment for viewing non-diffed results
|
||||
// t.Logf("report @ %6.4f:\n%s", fireAt[i].Seconds(), pretty.Sprint(reports[i]))
|
||||
}
|
||||
waitBegin := time.Now()
|
||||
wait(true)
|
||||
waitDuration := time.Now().Sub(waitBegin)
|
||||
assert.True(t, waitDuration < 10*time.Millisecond, "%v", waitDuration) // and that's gratious
|
||||
|
||||
prev, err := json.Marshal(reports[0])
|
||||
require.NoError(t, err)
|
||||
for _, r := range reports[1:] {
|
||||
this, err := json.Marshal(r)
|
||||
require.NoError(t, err)
|
||||
differ := jsondiff.New()
|
||||
diff, err := differ.Compare(prev, this)
|
||||
require.NoError(t, err)
|
||||
df := jsondiffformatter.NewDeltaFormatter()
|
||||
_, err = df.Format(diff)
|
||||
require.NoError(t, err)
|
||||
// uncomment the following line to get json diffs between each captured step
|
||||
// t.Logf("%s", res)
|
||||
prev, err = json.Marshal(r)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
steps := make([]*mockStep, 0)
|
||||
for _, fs := range mp.fss {
|
||||
for _, step := range fs.(*mockFS).steps {
|
||||
steps = append(steps, step.(*mockStep))
|
||||
}
|
||||
}
|
||||
|
||||
// sort steps in pq order (although, remember, pq is not deterministic)
|
||||
sort.Slice(steps, func(i, j int) bool {
|
||||
return steps[i].targetDate.Before(steps[j].targetDate)
|
||||
})
|
||||
|
||||
// manual inspection of the globalCtr value should show that, despite
|
||||
// scheduler-dependent behavior of pq, steps should generally be taken
|
||||
// from oldest to newest target date (globally, not per FS).
|
||||
t.Logf("steps sorted by target date:")
|
||||
for _, step := range steps {
|
||||
t.Logf("\t%s", step)
|
||||
}
|
||||
|
||||
}
|
163
replication/driver/replication_stepqueue.go
Normal file
163
replication/driver/replication_stepqueue.go
Normal file
@ -0,0 +1,163 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"time"
|
||||
|
||||
"github.com/zrepl/zrepl/util/chainlock"
|
||||
)
|
||||
|
||||
type stepQueueRec struct {
|
||||
ident interface{}
|
||||
targetDate time.Time
|
||||
wakeup chan StepCompletedFunc
|
||||
}
|
||||
|
||||
type stepQueue struct {
|
||||
stop chan struct{}
|
||||
reqs chan stepQueueRec
|
||||
}
|
||||
|
||||
type stepQueueHeapItem struct {
|
||||
idx int
|
||||
req stepQueueRec
|
||||
}
|
||||
type stepQueueHeap []*stepQueueHeapItem
|
||||
|
||||
func (h stepQueueHeap) Less(i, j int) bool {
|
||||
return h[i].req.targetDate.Before(h[j].req.targetDate)
|
||||
}
|
||||
|
||||
func (h stepQueueHeap) Swap(i, j int) {
|
||||
h[i], h[j] = h[j], h[i]
|
||||
h[i].idx = i
|
||||
h[j].idx = j
|
||||
}
|
||||
|
||||
func (h stepQueueHeap) Len() int {
|
||||
return len(h)
|
||||
}
|
||||
|
||||
func (h *stepQueueHeap) Push(elem interface{}) {
|
||||
hitem := elem.(*stepQueueHeapItem)
|
||||
hitem.idx = h.Len()
|
||||
*h = append(*h, hitem)
|
||||
}
|
||||
|
||||
func (h *stepQueueHeap) Pop() interface{} {
|
||||
elem := (*h)[h.Len()-1]
|
||||
elem.idx = -1
|
||||
*h = (*h)[:h.Len()-1]
|
||||
return elem
|
||||
}
|
||||
|
||||
// returned stepQueue must be closed with method Close
|
||||
func newStepQueue() *stepQueue {
|
||||
q := &stepQueue{
|
||||
stop: make(chan struct{}),
|
||||
reqs: make(chan stepQueueRec),
|
||||
}
|
||||
return q
|
||||
}
|
||||
|
||||
// the returned done function must be called to free resources
|
||||
// allocated by the call to Start
|
||||
//
|
||||
// No WaitReady calls must be active at the time done is called
|
||||
// The behavior of calling WaitReady after done was called is undefined
|
||||
func (q *stepQueue) Start(concurrency int) (done func()) {
|
||||
if concurrency < 1 {
|
||||
panic("concurrency must be >= 1")
|
||||
}
|
||||
// l protects pending and queueItems
|
||||
l := chainlock.New()
|
||||
pendingCond := l.NewCond()
|
||||
// priority queue
|
||||
pending := &stepQueueHeap{}
|
||||
// ident => queueItem
|
||||
queueItems := make(map[interface{}]*stepQueueHeapItem)
|
||||
// stopped is used for cancellation of "wake" goroutine
|
||||
stopped := false
|
||||
active := 0
|
||||
go func() { // "stopper" goroutine
|
||||
<-q.stop
|
||||
defer l.Lock().Unlock()
|
||||
stopped = true
|
||||
pendingCond.Broadcast()
|
||||
}()
|
||||
go func() { // "reqs" goroutine
|
||||
for {
|
||||
select {
|
||||
case <-q.stop:
|
||||
select {
|
||||
case <-q.reqs:
|
||||
panic("WaitReady call active while calling Close")
|
||||
default:
|
||||
return
|
||||
}
|
||||
case req := <-q.reqs:
|
||||
func() {
|
||||
defer l.Lock().Unlock()
|
||||
if _, ok := queueItems[req.ident]; ok {
|
||||
panic("WaitReady must not be called twice for the same ident")
|
||||
}
|
||||
qitem := &stepQueueHeapItem{
|
||||
req: req,
|
||||
}
|
||||
queueItems[req.ident] = qitem
|
||||
heap.Push(pending, qitem)
|
||||
pendingCond.Broadcast()
|
||||
}()
|
||||
}
|
||||
}
|
||||
}()
|
||||
go func() { // "wake" goroutine
|
||||
defer l.Lock().Unlock()
|
||||
for {
|
||||
|
||||
for !stopped && (active >= concurrency || pending.Len() == 0) {
|
||||
pendingCond.Wait()
|
||||
}
|
||||
if stopped {
|
||||
return
|
||||
}
|
||||
if pending.Len() <= 0 {
|
||||
return
|
||||
}
|
||||
active++
|
||||
next := heap.Pop(pending).(*stepQueueHeapItem).req
|
||||
delete(queueItems, next.ident)
|
||||
|
||||
next.wakeup <- func() {
|
||||
defer l.Lock().Unlock()
|
||||
active--
|
||||
pendingCond.Broadcast()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
done = func() {
|
||||
close(q.stop)
|
||||
}
|
||||
return done
|
||||
}
|
||||
|
||||
type StepCompletedFunc func()
|
||||
|
||||
func (q *stepQueue) sendAndWaitForWakeup(ident interface{}, targetDate time.Time) StepCompletedFunc {
|
||||
req := stepQueueRec{
|
||||
ident,
|
||||
targetDate,
|
||||
make(chan StepCompletedFunc),
|
||||
}
|
||||
q.reqs <- req
|
||||
return <-req.wakeup
|
||||
}
|
||||
|
||||
// Wait for the ident with targetDate to be selected to run.
|
||||
func (q *stepQueue) WaitReady(ident interface{}, targetDate time.Time) StepCompletedFunc {
|
||||
if targetDate.IsZero() {
|
||||
panic("targetDate of zero is reserved for marking Done")
|
||||
}
|
||||
return q.sendAndWaitForWakeup(ident, targetDate)
|
||||
}
|
166
replication/driver/replication_stepqueue_test.go
Normal file
166
replication/driver/replication_stepqueue_test.go
Normal file
@ -0,0 +1,166 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/montanaflynn/stats"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestPqNotconcurrent(t *testing.T) {
|
||||
|
||||
var ctr uint32
|
||||
q := newStepQueue()
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(3)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer q.WaitReady("1", time.Unix(1, 0))()
|
||||
ret := atomic.AddUint32(&ctr, 1)
|
||||
assert.Equal(t, uint32(1), ret)
|
||||
}()
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer q.WaitReady("2", time.Unix(2, 0))()
|
||||
ret := atomic.AddUint32(&ctr, 1)
|
||||
assert.Equal(t, uint32(2), ret)
|
||||
}()
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer q.WaitReady("3", time.Unix(3, 0))()
|
||||
ret := atomic.AddUint32(&ctr, 1)
|
||||
assert.Equal(t, uint32(3), ret)
|
||||
}()
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
defer q.Start(1)()
|
||||
wg.Wait()
|
||||
|
||||
}
|
||||
|
||||
type record struct {
|
||||
fs int
|
||||
step int
|
||||
globalCtr uint32
|
||||
wakeAt time.Duration // relative to begin
|
||||
}
|
||||
|
||||
func (r record) String() string {
|
||||
return fmt.Sprintf("fs %08d step %08d globalCtr %08d wakeAt %2.8f", r.fs, r.step, r.globalCtr, r.wakeAt.Seconds())
|
||||
}
|
||||
|
||||
// This tests uses stepPq concurrently, simulating the following scenario:
|
||||
// Given a number of filesystems F, each filesystem has N steps to take.
|
||||
// The number of concurrent steps is limited to C.
|
||||
// The target date for each step is the step number N.
|
||||
// Hence, there are always F filesystems runnable (calling WaitReady)
|
||||
// The priority queue prioritizes steps with lower target data (= lower step number).
|
||||
// Hence, all steps with lower numbers should be woken up before steps with higher numbers.
|
||||
// However, scheduling is not 100% deterministic (runtime, OS scheduler, etc).
|
||||
// Hence, perform some statistics on the wakeup times and assert that the mean wakeup
|
||||
// times for each step are close together.
|
||||
func TestPqConcurrent(t *testing.T) {
|
||||
|
||||
q := newStepQueue()
|
||||
var wg sync.WaitGroup
|
||||
filesystems := 100
|
||||
stepsPerFS := 20
|
||||
sleepTimePerStep := 50 * time.Millisecond
|
||||
wg.Add(filesystems)
|
||||
var globalCtr uint32
|
||||
|
||||
begin := time.Now()
|
||||
records := make(chan []record, filesystems)
|
||||
for fs := 0; fs < filesystems; fs++ {
|
||||
go func(fs int) {
|
||||
defer wg.Done()
|
||||
recs := make([]record, 0)
|
||||
for step := 0; step < stepsPerFS; step++ {
|
||||
pos := atomic.AddUint32(&globalCtr, 1)
|
||||
t := time.Unix(int64(step), 0)
|
||||
done := q.WaitReady(fs, t)
|
||||
wakeAt := time.Now().Sub(begin)
|
||||
time.Sleep(sleepTimePerStep)
|
||||
done()
|
||||
recs = append(recs, record{fs, step, pos, wakeAt})
|
||||
}
|
||||
records <- recs
|
||||
}(fs)
|
||||
}
|
||||
concurrency := 5
|
||||
defer q.Start(concurrency)()
|
||||
wg.Wait()
|
||||
close(records)
|
||||
t.Logf("loop done")
|
||||
|
||||
flattenedRecs := make([]record, 0)
|
||||
for recs := range records {
|
||||
flattenedRecs = append(flattenedRecs, recs...)
|
||||
}
|
||||
|
||||
sort.Slice(flattenedRecs, func(i, j int) bool {
|
||||
return flattenedRecs[i].globalCtr < flattenedRecs[j].globalCtr
|
||||
})
|
||||
|
||||
wakeTimesByStep := map[int][]float64{}
|
||||
for _, rec := range flattenedRecs {
|
||||
wakeTimes, ok := wakeTimesByStep[rec.step]
|
||||
if !ok {
|
||||
wakeTimes = []float64{}
|
||||
}
|
||||
wakeTimes = append(wakeTimes, rec.wakeAt.Seconds())
|
||||
wakeTimesByStep[rec.step] = wakeTimes
|
||||
}
|
||||
|
||||
meansByStepId := make([]float64, stepsPerFS)
|
||||
interQuartileRangesByStepIdx := make([]float64, stepsPerFS)
|
||||
for step := 0; step < stepsPerFS; step++ {
|
||||
t.Logf("step %d", step)
|
||||
mean, _ := stats.Mean(wakeTimesByStep[step])
|
||||
meansByStepId[step] = mean
|
||||
t.Logf("\tmean: %v", mean)
|
||||
median, _ := stats.Median(wakeTimesByStep[step])
|
||||
t.Logf("\tmedian: %v", median)
|
||||
midhinge, _ := stats.Midhinge(wakeTimesByStep[step])
|
||||
t.Logf("\tmidhinge: %v", midhinge)
|
||||
min, _ := stats.Min(wakeTimesByStep[step])
|
||||
t.Logf("\tmin: %v", min)
|
||||
max, _ := stats.Max(wakeTimesByStep[step])
|
||||
t.Logf("\tmax: %v", max)
|
||||
quartiles, _ := stats.Quartile(wakeTimesByStep[step])
|
||||
t.Logf("\t%#v", quartiles)
|
||||
interQuartileRange, _ := stats.InterQuartileRange(wakeTimesByStep[step])
|
||||
t.Logf("\tinter-quartile range: %v", interQuartileRange)
|
||||
interQuartileRangesByStepIdx[step] = interQuartileRange
|
||||
}
|
||||
|
||||
iqrMean, _ := stats.Mean(interQuartileRangesByStepIdx)
|
||||
t.Logf("inter-quartile-range mean: %v", iqrMean)
|
||||
iqrDev, _ := stats.StandardDeviation(interQuartileRangesByStepIdx)
|
||||
t.Logf("inter-quartile-range deviation: %v", iqrDev)
|
||||
|
||||
// each step should have the same "distribution" (=~ "spread")
|
||||
assert.True(t, iqrDev < 0.01)
|
||||
|
||||
minTimeForAllStepsWithIdxI := sleepTimePerStep.Seconds() * float64(filesystems) / float64(concurrency)
|
||||
t.Logf("minTimeForAllStepsWithIdxI = %11.8f", minTimeForAllStepsWithIdxI)
|
||||
for i, mean := range meansByStepId {
|
||||
// we can't just do (i + 0.5) * minTimeforAllStepsWithIdxI
|
||||
// because this doesn't account for drift
|
||||
idealMean := 0.5 * minTimeForAllStepsWithIdxI
|
||||
if i > 0 {
|
||||
previousMean := meansByStepId[i-1]
|
||||
idealMean = previousMean + minTimeForAllStepsWithIdxI
|
||||
}
|
||||
deltaFromIdeal := idealMean - mean
|
||||
t.Logf("step %02d delta from ideal mean wake time: %11.8f - %11.8f = %11.8f", i, idealMean, mean, deltaFromIdeal)
|
||||
assert.True(t, math.Abs(deltaFromIdeal) < 0.05)
|
||||
}
|
||||
|
||||
}
|
@ -1,557 +0,0 @@
|
||||
// Package fsrep implements replication of a single file system with existing versions
|
||||
// from a sender to a receiver.
|
||||
package fsrep
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/util/bytecounter"
|
||||
"github.com/zrepl/zrepl/util/watchdog"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
)
|
||||
|
||||
type contextKey int
|
||||
|
||||
const (
|
||||
contextKeyLogger contextKey = iota
|
||||
)
|
||||
|
||||
type Logger = logger.Logger
|
||||
|
||||
func WithLogger(ctx context.Context, log Logger) context.Context {
|
||||
return context.WithValue(ctx, contextKeyLogger, log)
|
||||
}
|
||||
|
||||
func getLogger(ctx context.Context) Logger {
|
||||
l, ok := ctx.Value(contextKeyLogger).(Logger)
|
||||
if !ok {
|
||||
l = logger.NewNullLogger()
|
||||
}
|
||||
return l
|
||||
}
|
||||
|
||||
// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
|
||||
type Sender interface {
|
||||
// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
|
||||
// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
|
||||
// If the send request is for dry run the io.ReadCloser will be nil
|
||||
Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
|
||||
ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
|
||||
}
|
||||
|
||||
// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
|
||||
type Receiver interface {
|
||||
// Receive sends r and sendStream (the latter containing a ZFS send stream)
|
||||
// to the parent github.com/zrepl/zrepl/replication.Endpoint.
|
||||
Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
|
||||
}
|
||||
|
||||
type StepReport struct {
|
||||
From, To string
|
||||
Status StepState
|
||||
Problem string
|
||||
Bytes int64
|
||||
ExpectedBytes int64 // 0 means no size estimate possible
|
||||
}
|
||||
|
||||
type Report struct {
|
||||
Filesystem string
|
||||
Status string
|
||||
Problem string
|
||||
Completed, Pending []*StepReport
|
||||
}
|
||||
|
||||
//go:generate enumer -type=State
|
||||
type State uint
|
||||
|
||||
const (
|
||||
Ready State = 1 << iota
|
||||
Completed
|
||||
)
|
||||
|
||||
type Error interface {
|
||||
error
|
||||
Temporary() bool
|
||||
ContextErr() bool
|
||||
LocalToFS() bool
|
||||
}
|
||||
|
||||
type Replication struct {
|
||||
promBytesReplicated prometheus.Counter
|
||||
|
||||
fs string
|
||||
|
||||
// lock protects all fields below it in this struct, but not the data behind pointers
|
||||
lock sync.Mutex
|
||||
state State
|
||||
err Error
|
||||
completed, pending []*ReplicationStep
|
||||
}
|
||||
|
||||
func (f *Replication) State() State {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
return f.state
|
||||
}
|
||||
|
||||
func (f *Replication) FS() string { return f.fs }
|
||||
|
||||
// returns zero value time.Time{} if no more pending steps
|
||||
func (f *Replication) NextStepDate() time.Time {
|
||||
if len(f.pending) == 0 {
|
||||
return time.Time{}
|
||||
}
|
||||
return f.pending[0].to.SnapshotTime()
|
||||
}
|
||||
|
||||
func (f *Replication) Err() Error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
return f.err
|
||||
}
|
||||
|
||||
func (f *Replication) CanRetry() bool {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
if f.state == Completed {
|
||||
return false
|
||||
}
|
||||
if f.state != Ready {
|
||||
panic(fmt.Sprintf("implementation error: %v", f.state))
|
||||
}
|
||||
if f.err == nil {
|
||||
return true
|
||||
}
|
||||
return f.err.Temporary()
|
||||
}
|
||||
|
||||
func (f *Replication) UpdateSizeEsitmate(ctx context.Context, sender Sender) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
for _, e := range f.pending {
|
||||
if err := e.updateSizeEstimate(ctx, sender); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type ReplicationBuilder struct {
|
||||
r *Replication
|
||||
}
|
||||
|
||||
func BuildReplication(fs string, promBytesReplicated prometheus.Counter) *ReplicationBuilder {
|
||||
return &ReplicationBuilder{&Replication{fs: fs, promBytesReplicated: promBytesReplicated}}
|
||||
}
|
||||
|
||||
func (b *ReplicationBuilder) AddStep(from, to FilesystemVersion) *ReplicationBuilder {
|
||||
step := &ReplicationStep{
|
||||
state: StepReplicationReady,
|
||||
parent: b.r,
|
||||
from: from,
|
||||
to: to,
|
||||
}
|
||||
b.r.pending = append(b.r.pending, step)
|
||||
return b
|
||||
}
|
||||
|
||||
func (b *ReplicationBuilder) Done() (r *Replication) {
|
||||
if len(b.r.pending) > 0 {
|
||||
b.r.state = Ready
|
||||
} else {
|
||||
b.r.state = Completed
|
||||
}
|
||||
r = b.r
|
||||
b.r = nil
|
||||
return r
|
||||
}
|
||||
|
||||
type ReplicationConflictError struct {
|
||||
Err error
|
||||
}
|
||||
|
||||
func (e *ReplicationConflictError) Timeout() bool { return false }
|
||||
|
||||
func (e *ReplicationConflictError) Temporary() bool { return false }
|
||||
|
||||
func (e *ReplicationConflictError) Error() string { return fmt.Sprintf("permanent error: %s", e.Err.Error()) }
|
||||
|
||||
func (e *ReplicationConflictError) LocalToFS() bool { return true }
|
||||
|
||||
func (e *ReplicationConflictError) ContextErr() bool { return false }
|
||||
|
||||
func NewReplicationConflictError(fs string, err error) *Replication {
|
||||
return &Replication{
|
||||
state: Completed,
|
||||
fs: fs,
|
||||
err: &ReplicationConflictError{Err: err},
|
||||
}
|
||||
}
|
||||
|
||||
//go:generate enumer -type=StepState
|
||||
type StepState uint
|
||||
|
||||
const (
|
||||
StepReplicationReady StepState = 1 << iota
|
||||
StepMarkReplicatedReady
|
||||
StepCompleted
|
||||
)
|
||||
|
||||
func (s StepState) IsTerminal() bool { return s == StepCompleted }
|
||||
|
||||
type FilesystemVersion interface {
|
||||
SnapshotTime() time.Time
|
||||
GetName() string // name without @ or #
|
||||
RelName() string // name with @ or #
|
||||
}
|
||||
|
||||
type ReplicationStep struct {
|
||||
// only protects state, err
|
||||
// from, to and parent are assumed to be immutable
|
||||
lock sync.Mutex
|
||||
|
||||
state StepState
|
||||
from, to FilesystemVersion
|
||||
parent *Replication
|
||||
|
||||
// both retry and permanent error
|
||||
err error
|
||||
|
||||
byteCounter bytecounter.StreamCopier
|
||||
expectedSize int64 // 0 means no size estimate present / possible
|
||||
}
|
||||
|
||||
func (f *Replication) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) Error {
|
||||
|
||||
var u updater = func(fu func(*Replication)) State {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
if fu != nil {
|
||||
fu(f)
|
||||
}
|
||||
return f.state
|
||||
}
|
||||
|
||||
|
||||
var current *ReplicationStep
|
||||
pre := u(nil)
|
||||
getLogger(ctx).WithField("fsrep_state", pre).Debug("begin fsrep.Retry")
|
||||
defer func() {
|
||||
post := u(nil)
|
||||
getLogger(ctx).WithField("fsrep_transition", post).Debug("end fsrep.Retry")
|
||||
}()
|
||||
|
||||
st := u(func(f *Replication) {
|
||||
if len(f.pending) == 0 {
|
||||
f.state = Completed
|
||||
return
|
||||
}
|
||||
current = f.pending[0]
|
||||
})
|
||||
if st == Completed {
|
||||
return nil
|
||||
}
|
||||
if st != Ready {
|
||||
panic(fmt.Sprintf("implementation error: %v", st))
|
||||
}
|
||||
|
||||
stepCtx := WithLogger(ctx, getLogger(ctx).WithField("step", current))
|
||||
getLogger(stepCtx).Debug("take step")
|
||||
err := current.Retry(stepCtx, ka, sender, receiver)
|
||||
if err != nil {
|
||||
getLogger(stepCtx).WithError(err).Error("step could not be completed")
|
||||
}
|
||||
|
||||
u(func(fsr *Replication) {
|
||||
if err != nil {
|
||||
f.err = &StepError{stepStr: current.String(), err: err}
|
||||
return
|
||||
}
|
||||
if err == nil && current.state != StepCompleted {
|
||||
panic(fmt.Sprintf("implementation error: %v", current.state))
|
||||
}
|
||||
f.err = nil
|
||||
f.completed = append(f.completed, current)
|
||||
f.pending = f.pending[1:]
|
||||
if len(f.pending) > 0 {
|
||||
f.state = Ready
|
||||
} else {
|
||||
f.state = Completed
|
||||
}
|
||||
})
|
||||
var retErr Error = nil
|
||||
u(func(fsr *Replication) {
|
||||
retErr = fsr.err
|
||||
})
|
||||
return retErr
|
||||
}
|
||||
|
||||
type updater func(func(fsr *Replication)) State
|
||||
|
||||
type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
|
||||
|
||||
type StepError struct {
|
||||
stepStr string
|
||||
err error
|
||||
}
|
||||
|
||||
var _ Error = &StepError{}
|
||||
|
||||
func (e StepError) Error() string {
|
||||
if e.LocalToFS() {
|
||||
return fmt.Sprintf("step %s failed: %s", e.stepStr, e.err)
|
||||
}
|
||||
return e.err.Error()
|
||||
}
|
||||
|
||||
func (e StepError) Timeout() bool {
|
||||
if neterr, ok := e.err.(net.Error); ok {
|
||||
return neterr.Timeout()
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (e StepError) Temporary() bool {
|
||||
if neterr, ok := e.err.(net.Error); ok {
|
||||
return neterr.Temporary()
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (e StepError) LocalToFS() bool {
|
||||
if _, ok := e.err.(net.Error); ok {
|
||||
return false
|
||||
}
|
||||
return true // conservative approximation: we'd like to check for specific errors returned over RPC here...
|
||||
}
|
||||
|
||||
func (e StepError) ContextErr() bool {
|
||||
switch e.err {
|
||||
case context.Canceled:
|
||||
return true
|
||||
case context.DeadlineExceeded:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (fsr *Replication) Report() *Report {
|
||||
fsr.lock.Lock()
|
||||
defer fsr.lock.Unlock()
|
||||
|
||||
rep := Report{
|
||||
Filesystem: fsr.fs,
|
||||
Status: fsr.state.String(),
|
||||
}
|
||||
|
||||
if fsr.err != nil && fsr.err.LocalToFS() {
|
||||
rep.Problem = fsr.err.Error()
|
||||
}
|
||||
|
||||
rep.Completed = make([]*StepReport, len(fsr.completed))
|
||||
for i := range fsr.completed {
|
||||
rep.Completed[i] = fsr.completed[i].Report()
|
||||
}
|
||||
rep.Pending = make([]*StepReport, len(fsr.pending))
|
||||
for i := range fsr.pending {
|
||||
rep.Pending[i] = fsr.pending[i].Report()
|
||||
}
|
||||
|
||||
return &rep
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
|
||||
switch s.state {
|
||||
case StepReplicationReady:
|
||||
return s.doReplication(ctx, ka, sender, receiver)
|
||||
case StepMarkReplicatedReady:
|
||||
return s.doMarkReplicated(ctx, ka, sender)
|
||||
case StepCompleted:
|
||||
return nil
|
||||
}
|
||||
panic(fmt.Sprintf("implementation error: %v", s.state))
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) Error() error {
|
||||
if s.state & (StepReplicationReady|StepMarkReplicatedReady) != 0 {
|
||||
return s.err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) doReplication(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
|
||||
|
||||
if s.state != StepReplicationReady {
|
||||
panic(fmt.Sprintf("implementation error: %v", s.state))
|
||||
}
|
||||
|
||||
fs := s.parent.fs
|
||||
|
||||
log := getLogger(ctx)
|
||||
sr := s.buildSendRequest(false)
|
||||
|
||||
log.Debug("initiate send request")
|
||||
sres, sstreamCopier, err := sender.Send(ctx, sr)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("send request failed")
|
||||
return err
|
||||
}
|
||||
if sstreamCopier == nil {
|
||||
err := errors.New("send request did not return a stream, broken endpoint implementation")
|
||||
return err
|
||||
}
|
||||
defer sstreamCopier.Close()
|
||||
|
||||
// Install a byte counter to track progress + for status report
|
||||
s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
|
||||
byteCounterStopProgress := make(chan struct{})
|
||||
defer close(byteCounterStopProgress)
|
||||
go func() {
|
||||
var lastCount int64
|
||||
t := time.NewTicker(1 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-byteCounterStopProgress:
|
||||
return
|
||||
case <-t.C:
|
||||
newCount := s.byteCounter.Count()
|
||||
if lastCount != newCount {
|
||||
ka.MadeProgress()
|
||||
} else {
|
||||
lastCount = newCount
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
defer func() {
|
||||
s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
|
||||
}()
|
||||
|
||||
rr := &pdu.ReceiveReq{
|
||||
Filesystem: fs,
|
||||
ClearResumeToken: !sres.UsedResumeToken,
|
||||
}
|
||||
log.Debug("initiate receive request")
|
||||
_, err = receiver.Receive(ctx, rr, s.byteCounter)
|
||||
if err != nil {
|
||||
log.
|
||||
WithError(err).
|
||||
WithField("errType", fmt.Sprintf("%T", err)).
|
||||
Error("receive request failed (might also be error on sender)")
|
||||
// This failure could be due to
|
||||
// - an unexpected exit of ZFS on the sending side
|
||||
// - an unexpected exit of ZFS on the receiving side
|
||||
// - a connectivity issue
|
||||
return err
|
||||
}
|
||||
log.Debug("receive finished")
|
||||
ka.MadeProgress()
|
||||
|
||||
s.state = StepMarkReplicatedReady
|
||||
return s.doMarkReplicated(ctx, ka, sender)
|
||||
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) doMarkReplicated(ctx context.Context, ka *watchdog.KeepAlive, sender Sender) error {
|
||||
|
||||
if s.state != StepMarkReplicatedReady {
|
||||
panic(fmt.Sprintf("implementation error: %v", s.state))
|
||||
}
|
||||
|
||||
log := getLogger(ctx)
|
||||
|
||||
log.Debug("advance replication cursor")
|
||||
req := &pdu.ReplicationCursorReq{
|
||||
Filesystem: s.parent.fs,
|
||||
Op: &pdu.ReplicationCursorReq_Set{
|
||||
Set: &pdu.ReplicationCursorReq_SetOp{
|
||||
Snapshot: s.to.GetName(),
|
||||
},
|
||||
},
|
||||
}
|
||||
_, err := sender.ReplicationCursor(ctx, req)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error advancing replication cursor")
|
||||
return err
|
||||
}
|
||||
ka.MadeProgress()
|
||||
|
||||
s.state = StepCompleted
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) updateSizeEstimate(ctx context.Context, sender Sender) error {
|
||||
|
||||
log := getLogger(ctx)
|
||||
|
||||
sr := s.buildSendRequest(true)
|
||||
|
||||
log.Debug("initiate dry run send request")
|
||||
sres, _, err := sender.Send(ctx, sr)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("dry run send request failed")
|
||||
return err
|
||||
}
|
||||
s.expectedSize = sres.ExpectedSize
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
|
||||
fs := s.parent.fs
|
||||
if s.from == nil {
|
||||
sr = &pdu.SendReq{
|
||||
Filesystem: fs,
|
||||
To: s.to.RelName(),
|
||||
DryRun: dryRun,
|
||||
}
|
||||
} else {
|
||||
sr = &pdu.SendReq{
|
||||
Filesystem: fs,
|
||||
From: s.from.RelName(),
|
||||
To: s.to.RelName(),
|
||||
DryRun: dryRun,
|
||||
}
|
||||
}
|
||||
return sr
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) String() string {
|
||||
if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
|
||||
return fmt.Sprintf("%s%s (full)", s.parent.fs, s.to.RelName())
|
||||
} else {
|
||||
return fmt.Sprintf("%s(%s => %s)", s.parent.fs, s.from.RelName(), s.to.RelName())
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReplicationStep) Report() *StepReport {
|
||||
var from string // FIXME follow same convention as ZFS: to should be nil on full send
|
||||
if s.from != nil {
|
||||
from = s.from.RelName()
|
||||
}
|
||||
bytes := int64(0)
|
||||
if s.byteCounter != nil {
|
||||
bytes = s.byteCounter.Count()
|
||||
}
|
||||
problem := ""
|
||||
if s.err != nil {
|
||||
problem = s.err.Error()
|
||||
}
|
||||
rep := StepReport{
|
||||
From: from,
|
||||
To: s.to.RelName(),
|
||||
Status: s.state,
|
||||
Problem: problem,
|
||||
Bytes: bytes,
|
||||
ExpectedBytes: s.expectedSize,
|
||||
}
|
||||
return &rep
|
||||
}
|
@ -1,50 +0,0 @@
|
||||
// Code generated by "enumer -type=State"; DO NOT EDIT.
|
||||
|
||||
package fsrep
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const _StateName = "ReadyCompleted"
|
||||
|
||||
var _StateIndex = [...]uint8{0, 5, 14}
|
||||
|
||||
func (i State) String() string {
|
||||
i -= 1
|
||||
if i >= State(len(_StateIndex)-1) {
|
||||
return fmt.Sprintf("State(%d)", i+1)
|
||||
}
|
||||
return _StateName[_StateIndex[i]:_StateIndex[i+1]]
|
||||
}
|
||||
|
||||
var _StateValues = []State{1, 2}
|
||||
|
||||
var _StateNameToValueMap = map[string]State{
|
||||
_StateName[0:5]: 1,
|
||||
_StateName[5:14]: 2,
|
||||
}
|
||||
|
||||
// StateString retrieves an enum value from the enum constants string name.
|
||||
// Throws an error if the param is not part of the enum.
|
||||
func StateString(s string) (State, error) {
|
||||
if val, ok := _StateNameToValueMap[s]; ok {
|
||||
return val, nil
|
||||
}
|
||||
return 0, fmt.Errorf("%s does not belong to State values", s)
|
||||
}
|
||||
|
||||
// StateValues returns all values of the enum
|
||||
func StateValues() []State {
|
||||
return _StateValues
|
||||
}
|
||||
|
||||
// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
|
||||
func (i State) IsAState() bool {
|
||||
for _, v := range _StateValues {
|
||||
if i == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
// Code generated by "enumer -type=StepState"; DO NOT EDIT.
|
||||
|
||||
package fsrep
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const (
|
||||
_StepStateName_0 = "StepReplicationReadyStepMarkReplicatedReady"
|
||||
_StepStateName_1 = "StepCompleted"
|
||||
)
|
||||
|
||||
var (
|
||||
_StepStateIndex_0 = [...]uint8{0, 20, 43}
|
||||
_StepStateIndex_1 = [...]uint8{0, 13}
|
||||
)
|
||||
|
||||
func (i StepState) String() string {
|
||||
switch {
|
||||
case 1 <= i && i <= 2:
|
||||
i -= 1
|
||||
return _StepStateName_0[_StepStateIndex_0[i]:_StepStateIndex_0[i+1]]
|
||||
case i == 4:
|
||||
return _StepStateName_1
|
||||
default:
|
||||
return fmt.Sprintf("StepState(%d)", i)
|
||||
}
|
||||
}
|
||||
|
||||
var _StepStateValues = []StepState{1, 2, 4}
|
||||
|
||||
var _StepStateNameToValueMap = map[string]StepState{
|
||||
_StepStateName_0[0:20]: 1,
|
||||
_StepStateName_0[20:43]: 2,
|
||||
_StepStateName_1[0:13]: 4,
|
||||
}
|
||||
|
||||
// StepStateString retrieves an enum value from the enum constants string name.
|
||||
// Throws an error if the param is not part of the enum.
|
||||
func StepStateString(s string) (StepState, error) {
|
||||
if val, ok := _StepStateNameToValueMap[s]; ok {
|
||||
return val, nil
|
||||
}
|
||||
return 0, fmt.Errorf("%s does not belong to StepState values", s)
|
||||
}
|
||||
|
||||
// StepStateValues returns all values of the enum
|
||||
func StepStateValues() []StepState {
|
||||
return _StepStateValues
|
||||
}
|
||||
|
||||
// IsAStepState returns "true" if the value is listed in the enum definition. "false" otherwise
|
||||
func (i StepState) IsAStepState() bool {
|
||||
for _, v := range _StepStateValues {
|
||||
if i == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
@ -1,9 +1,11 @@
|
||||
package mainfsm
|
||||
package diff
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
. "github.com/zrepl/zrepl/replication/pdu"
|
||||
. "github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
)
|
||||
|
||||
type ConflictNoCommonAncestor struct {
|
||||
@ -11,7 +13,19 @@ type ConflictNoCommonAncestor struct {
|
||||
}
|
||||
|
||||
func (c *ConflictNoCommonAncestor) Error() string {
|
||||
return "no common snapshot or suitable bookmark between sender and receiver"
|
||||
var buf strings.Builder
|
||||
buf.WriteString("no common snapshot or suitable bookmark between sender and receiver")
|
||||
if len(c.SortedReceiverVersions) > 0 || len(c.SortedSenderVersions) > 0 {
|
||||
buf.WriteString(":\n sorted sender versions:\n")
|
||||
for _, v := range c.SortedSenderVersions {
|
||||
fmt.Fprintf(&buf, " %s\n", v.RelName())
|
||||
}
|
||||
buf.WriteString(" sorted receiver versions:\n")
|
||||
for _, v := range c.SortedReceiverVersions {
|
||||
fmt.Fprintf(&buf, " %s\n", v.RelName())
|
||||
}
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
type ConflictDiverged struct {
|
||||
@ -21,7 +35,18 @@ type ConflictDiverged struct {
|
||||
}
|
||||
|
||||
func (c *ConflictDiverged) Error() string {
|
||||
return "the receiver's latest snapshot is not present on sender"
|
||||
var buf strings.Builder
|
||||
buf.WriteString("the receiver's latest snapshot is not present on sender:\n")
|
||||
fmt.Fprintf(&buf, " last common: %s\n", c.CommonAncestor.RelName())
|
||||
fmt.Fprintf(&buf, " sender-only:\n")
|
||||
for _, v := range c.SenderOnly {
|
||||
fmt.Fprintf(&buf, " %s\n", v.RelName())
|
||||
}
|
||||
fmt.Fprintf(&buf, " receiver-only:\n")
|
||||
for _, v := range c.ReceiverOnly {
|
||||
fmt.Fprintf(&buf, " %s\n", v.RelName())
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func SortVersionListByCreateTXGThenBookmarkLTSnapshot(fsvslice []*FilesystemVersion) []*FilesystemVersion {
|
130
replication/logic/diff/diff_test.go
Normal file
130
replication/logic/diff/diff_test.go
Normal file
@ -0,0 +1,130 @@
|
||||
package diff
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
. "github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
)
|
||||
|
||||
func fsvlist(fsv ...string) (r []*FilesystemVersion) {
|
||||
|
||||
r = make([]*FilesystemVersion, len(fsv))
|
||||
for i, f := range fsv {
|
||||
|
||||
// parse the id from fsvlist. it is used to derivce Guid,CreateTXG and Creation attrs
|
||||
split := strings.Split(f, ",")
|
||||
if len(split) != 2 {
|
||||
panic("invalid fsv spec")
|
||||
}
|
||||
id, err := strconv.Atoi(split[1])
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
creation := func(id int) string {
|
||||
return FilesystemVersionCreation(time.Unix(0, 0).Add(time.Duration(id) * time.Second))
|
||||
}
|
||||
if strings.HasPrefix(f, "#") {
|
||||
r[i] = &FilesystemVersion{
|
||||
Name: strings.TrimPrefix(f, "#"),
|
||||
Type: FilesystemVersion_Bookmark,
|
||||
Guid: uint64(id),
|
||||
CreateTXG: uint64(id),
|
||||
Creation: creation(id),
|
||||
}
|
||||
} else if strings.HasPrefix(f, "@") {
|
||||
r[i] = &FilesystemVersion{
|
||||
Name: strings.TrimPrefix(f, "@"),
|
||||
Type: FilesystemVersion_Snapshot,
|
||||
Guid: uint64(id),
|
||||
CreateTXG: uint64(id),
|
||||
Creation: creation(id),
|
||||
}
|
||||
} else {
|
||||
panic("invalid character")
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func doTest(receiver, sender []*FilesystemVersion, validate func(incpath []*FilesystemVersion, conflict error)) {
|
||||
p, err := IncrementalPath(receiver, sender)
|
||||
validate(p, err)
|
||||
}
|
||||
|
||||
func TestIncrementalPath_SnapshotsOnly(t *testing.T) {
|
||||
|
||||
l := fsvlist
|
||||
|
||||
// basic functionality
|
||||
doTest(l("@a,1", "@b,2"), l("@a,1", "@b,2", "@c,3", "@d,4"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("@b,2", "@c,3", "@d,4"), path)
|
||||
})
|
||||
|
||||
// no common ancestor
|
||||
doTest(l(), l("@a,1"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Nil(t, path)
|
||||
ca, ok := conflict.(*ConflictNoCommonAncestor)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, l("@a,1"), ca.SortedSenderVersions)
|
||||
})
|
||||
doTest(l("@a,1", "@b,2"), l("@c,3", "@d,4"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Nil(t, path)
|
||||
ca, ok := conflict.(*ConflictNoCommonAncestor)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, l("@a,1", "@b,2"), ca.SortedReceiverVersions)
|
||||
assert.Equal(t, l("@c,3", "@d,4"), ca.SortedSenderVersions)
|
||||
})
|
||||
|
||||
// divergence is detected
|
||||
doTest(l("@a,1", "@b1,2"), l("@a,1", "@b2,3"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Nil(t, path)
|
||||
cd, ok := conflict.(*ConflictDiverged)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, l("@a,1")[0], cd.CommonAncestor)
|
||||
assert.Equal(t, l("@b1,2"), cd.ReceiverOnly)
|
||||
assert.Equal(t, l("@b2,3"), cd.SenderOnly)
|
||||
})
|
||||
|
||||
// gaps before most recent common ancestor do not matter
|
||||
doTest(l("@a,1", "@b,2", "@c,3"), l("@a,1", "@c,3", "@d,4"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("@c,3", "@d,4"), path)
|
||||
})
|
||||
|
||||
// sender with earlier but also current version as sender is not a conflict
|
||||
doTest(l("@c,3"), l("@a,1", "@b,2", "@c,3") , func(path []*FilesystemVersion, conflict error) {
|
||||
t.Logf("path: %#v", path)
|
||||
t.Logf("conflict: %#v", conflict)
|
||||
assert.Empty(t, path)
|
||||
assert.Nil(t, conflict)
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
func TestIncrementalPath_BookmarkSupport(t *testing.T) {
|
||||
l := fsvlist
|
||||
|
||||
// bookmarks are used
|
||||
doTest(l("@a,1"), l("#a,1", "@b,2"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("#a,1", "@b,2"), path)
|
||||
})
|
||||
|
||||
// boomarks are stripped from IncrementalPath (cannot send incrementally)
|
||||
doTest(l("@a,1"), l("#a,1", "#b,2", "@c,3"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("#a,1", "@c,3"), path)
|
||||
})
|
||||
|
||||
// test that snapshots are preferred over bookmarks in IncrementalPath
|
||||
doTest(l("@a,1"), l("#a,1", "@a,1", "@b,2"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("@a,1", "@b,2"), path)
|
||||
})
|
||||
doTest(l("@a,1"), l("@a,1", "#a,1", "@b,2"), func(path []*FilesystemVersion, conflict error) {
|
||||
assert.Equal(t, l("@a,1", "@b,2"), path)
|
||||
})
|
||||
|
||||
}
|
@ -43,7 +43,7 @@ func (x FilesystemVersion_VersionType) String() string {
|
||||
return proto.EnumName(FilesystemVersion_VersionType_name, int32(x))
|
||||
}
|
||||
func (FilesystemVersion_VersionType) EnumDescriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{5, 0}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{5, 0}
|
||||
}
|
||||
|
||||
type ListFilesystemReq struct {
|
||||
@ -56,7 +56,7 @@ func (m *ListFilesystemReq) Reset() { *m = ListFilesystemReq{} }
|
||||
func (m *ListFilesystemReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*ListFilesystemReq) ProtoMessage() {}
|
||||
func (*ListFilesystemReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{0}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{0}
|
||||
}
|
||||
func (m *ListFilesystemReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ListFilesystemReq.Unmarshal(m, b)
|
||||
@ -78,7 +78,6 @@ var xxx_messageInfo_ListFilesystemReq proto.InternalMessageInfo
|
||||
|
||||
type ListFilesystemRes struct {
|
||||
Filesystems []*Filesystem `protobuf:"bytes,1,rep,name=Filesystems,proto3" json:"Filesystems,omitempty"`
|
||||
Empty bool `protobuf:"varint,2,opt,name=Empty,proto3" json:"Empty,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
@ -88,7 +87,7 @@ func (m *ListFilesystemRes) Reset() { *m = ListFilesystemRes{} }
|
||||
func (m *ListFilesystemRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*ListFilesystemRes) ProtoMessage() {}
|
||||
func (*ListFilesystemRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{1}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{1}
|
||||
}
|
||||
func (m *ListFilesystemRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ListFilesystemRes.Unmarshal(m, b)
|
||||
@ -115,16 +114,10 @@ func (m *ListFilesystemRes) GetFilesystems() []*Filesystem {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *ListFilesystemRes) GetEmpty() bool {
|
||||
if m != nil {
|
||||
return m.Empty
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type Filesystem struct {
|
||||
Path string `protobuf:"bytes,1,opt,name=Path,proto3" json:"Path,omitempty"`
|
||||
ResumeToken string `protobuf:"bytes,2,opt,name=ResumeToken,proto3" json:"ResumeToken,omitempty"`
|
||||
IsPlaceholder bool `protobuf:"varint,3,opt,name=IsPlaceholder,proto3" json:"IsPlaceholder,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
@ -134,7 +127,7 @@ func (m *Filesystem) Reset() { *m = Filesystem{} }
|
||||
func (m *Filesystem) String() string { return proto.CompactTextString(m) }
|
||||
func (*Filesystem) ProtoMessage() {}
|
||||
func (*Filesystem) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{2}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{2}
|
||||
}
|
||||
func (m *Filesystem) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_Filesystem.Unmarshal(m, b)
|
||||
@ -168,6 +161,13 @@ func (m *Filesystem) GetResumeToken() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (m *Filesystem) GetIsPlaceholder() bool {
|
||||
if m != nil {
|
||||
return m.IsPlaceholder
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type ListFilesystemVersionsReq struct {
|
||||
Filesystem string `protobuf:"bytes,1,opt,name=Filesystem,proto3" json:"Filesystem,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
@ -179,7 +179,7 @@ func (m *ListFilesystemVersionsReq) Reset() { *m = ListFilesystemVersion
|
||||
func (m *ListFilesystemVersionsReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*ListFilesystemVersionsReq) ProtoMessage() {}
|
||||
func (*ListFilesystemVersionsReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{3}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{3}
|
||||
}
|
||||
func (m *ListFilesystemVersionsReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ListFilesystemVersionsReq.Unmarshal(m, b)
|
||||
@ -217,7 +217,7 @@ func (m *ListFilesystemVersionsRes) Reset() { *m = ListFilesystemVersion
|
||||
func (m *ListFilesystemVersionsRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*ListFilesystemVersionsRes) ProtoMessage() {}
|
||||
func (*ListFilesystemVersionsRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{4}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{4}
|
||||
}
|
||||
func (m *ListFilesystemVersionsRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ListFilesystemVersionsRes.Unmarshal(m, b)
|
||||
@ -259,7 +259,7 @@ func (m *FilesystemVersion) Reset() { *m = FilesystemVersion{} }
|
||||
func (m *FilesystemVersion) String() string { return proto.CompactTextString(m) }
|
||||
func (*FilesystemVersion) ProtoMessage() {}
|
||||
func (*FilesystemVersion) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{5}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{5}
|
||||
}
|
||||
func (m *FilesystemVersion) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_FilesystemVersion.Unmarshal(m, b)
|
||||
@ -339,7 +339,7 @@ func (m *SendReq) Reset() { *m = SendReq{} }
|
||||
func (m *SendReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*SendReq) ProtoMessage() {}
|
||||
func (*SendReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{6}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{6}
|
||||
}
|
||||
func (m *SendReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_SendReq.Unmarshal(m, b)
|
||||
@ -420,7 +420,7 @@ func (m *Property) Reset() { *m = Property{} }
|
||||
func (m *Property) String() string { return proto.CompactTextString(m) }
|
||||
func (*Property) ProtoMessage() {}
|
||||
func (*Property) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{7}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{7}
|
||||
}
|
||||
func (m *Property) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_Property.Unmarshal(m, b)
|
||||
@ -470,7 +470,7 @@ func (m *SendRes) Reset() { *m = SendRes{} }
|
||||
func (m *SendRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*SendRes) ProtoMessage() {}
|
||||
func (*SendRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{8}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{8}
|
||||
}
|
||||
func (m *SendRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_SendRes.Unmarshal(m, b)
|
||||
@ -524,7 +524,7 @@ func (m *ReceiveReq) Reset() { *m = ReceiveReq{} }
|
||||
func (m *ReceiveReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReceiveReq) ProtoMessage() {}
|
||||
func (*ReceiveReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{9}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{9}
|
||||
}
|
||||
func (m *ReceiveReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReceiveReq.Unmarshal(m, b)
|
||||
@ -568,7 +568,7 @@ func (m *ReceiveRes) Reset() { *m = ReceiveRes{} }
|
||||
func (m *ReceiveRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReceiveRes) ProtoMessage() {}
|
||||
func (*ReceiveRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{10}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{10}
|
||||
}
|
||||
func (m *ReceiveRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReceiveRes.Unmarshal(m, b)
|
||||
@ -601,7 +601,7 @@ func (m *DestroySnapshotsReq) Reset() { *m = DestroySnapshotsReq{} }
|
||||
func (m *DestroySnapshotsReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*DestroySnapshotsReq) ProtoMessage() {}
|
||||
func (*DestroySnapshotsReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{11}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{11}
|
||||
}
|
||||
func (m *DestroySnapshotsReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_DestroySnapshotsReq.Unmarshal(m, b)
|
||||
@ -647,7 +647,7 @@ func (m *DestroySnapshotRes) Reset() { *m = DestroySnapshotRes{} }
|
||||
func (m *DestroySnapshotRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*DestroySnapshotRes) ProtoMessage() {}
|
||||
func (*DestroySnapshotRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{12}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{12}
|
||||
}
|
||||
func (m *DestroySnapshotRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_DestroySnapshotRes.Unmarshal(m, b)
|
||||
@ -692,7 +692,7 @@ func (m *DestroySnapshotsRes) Reset() { *m = DestroySnapshotsRes{} }
|
||||
func (m *DestroySnapshotsRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*DestroySnapshotsRes) ProtoMessage() {}
|
||||
func (*DestroySnapshotsRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{13}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{13}
|
||||
}
|
||||
func (m *DestroySnapshotsRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_DestroySnapshotsRes.Unmarshal(m, b)
|
||||
@ -734,7 +734,7 @@ func (m *ReplicationCursorReq) Reset() { *m = ReplicationCursorReq{} }
|
||||
func (m *ReplicationCursorReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReplicationCursorReq) ProtoMessage() {}
|
||||
func (*ReplicationCursorReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{14}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{14}
|
||||
}
|
||||
func (m *ReplicationCursorReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReplicationCursorReq.Unmarshal(m, b)
|
||||
@ -882,7 +882,7 @@ func (m *ReplicationCursorReq_GetOp) Reset() { *m = ReplicationCursorReq
|
||||
func (m *ReplicationCursorReq_GetOp) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReplicationCursorReq_GetOp) ProtoMessage() {}
|
||||
func (*ReplicationCursorReq_GetOp) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{14, 0}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{14, 0}
|
||||
}
|
||||
func (m *ReplicationCursorReq_GetOp) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReplicationCursorReq_GetOp.Unmarshal(m, b)
|
||||
@ -913,7 +913,7 @@ func (m *ReplicationCursorReq_SetOp) Reset() { *m = ReplicationCursorReq
|
||||
func (m *ReplicationCursorReq_SetOp) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReplicationCursorReq_SetOp) ProtoMessage() {}
|
||||
func (*ReplicationCursorReq_SetOp) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{14, 1}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{14, 1}
|
||||
}
|
||||
func (m *ReplicationCursorReq_SetOp) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReplicationCursorReq_SetOp.Unmarshal(m, b)
|
||||
@ -954,7 +954,7 @@ func (m *ReplicationCursorRes) Reset() { *m = ReplicationCursorRes{} }
|
||||
func (m *ReplicationCursorRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*ReplicationCursorRes) ProtoMessage() {}
|
||||
func (*ReplicationCursorRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_89315d819a6e0938, []int{15}
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{15}
|
||||
}
|
||||
func (m *ReplicationCursorRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_ReplicationCursorRes.Unmarshal(m, b)
|
||||
@ -1079,6 +1079,83 @@ func _ReplicationCursorRes_OneofSizer(msg proto.Message) (n int) {
|
||||
return n
|
||||
}
|
||||
|
||||
type PingReq struct {
|
||||
Message string `protobuf:"bytes,1,opt,name=Message,proto3" json:"Message,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
}
|
||||
|
||||
func (m *PingReq) Reset() { *m = PingReq{} }
|
||||
func (m *PingReq) String() string { return proto.CompactTextString(m) }
|
||||
func (*PingReq) ProtoMessage() {}
|
||||
func (*PingReq) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{16}
|
||||
}
|
||||
func (m *PingReq) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_PingReq.Unmarshal(m, b)
|
||||
}
|
||||
func (m *PingReq) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
|
||||
return xxx_messageInfo_PingReq.Marshal(b, m, deterministic)
|
||||
}
|
||||
func (dst *PingReq) XXX_Merge(src proto.Message) {
|
||||
xxx_messageInfo_PingReq.Merge(dst, src)
|
||||
}
|
||||
func (m *PingReq) XXX_Size() int {
|
||||
return xxx_messageInfo_PingReq.Size(m)
|
||||
}
|
||||
func (m *PingReq) XXX_DiscardUnknown() {
|
||||
xxx_messageInfo_PingReq.DiscardUnknown(m)
|
||||
}
|
||||
|
||||
var xxx_messageInfo_PingReq proto.InternalMessageInfo
|
||||
|
||||
func (m *PingReq) GetMessage() string {
|
||||
if m != nil {
|
||||
return m.Message
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
type PingRes struct {
|
||||
// Echo must be PingReq.Message
|
||||
Echo string `protobuf:"bytes,1,opt,name=Echo,proto3" json:"Echo,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
}
|
||||
|
||||
func (m *PingRes) Reset() { *m = PingRes{} }
|
||||
func (m *PingRes) String() string { return proto.CompactTextString(m) }
|
||||
func (*PingRes) ProtoMessage() {}
|
||||
func (*PingRes) Descriptor() ([]byte, []int) {
|
||||
return fileDescriptor_pdu_83b7e2a28d820622, []int{17}
|
||||
}
|
||||
func (m *PingRes) XXX_Unmarshal(b []byte) error {
|
||||
return xxx_messageInfo_PingRes.Unmarshal(m, b)
|
||||
}
|
||||
func (m *PingRes) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
|
||||
return xxx_messageInfo_PingRes.Marshal(b, m, deterministic)
|
||||
}
|
||||
func (dst *PingRes) XXX_Merge(src proto.Message) {
|
||||
xxx_messageInfo_PingRes.Merge(dst, src)
|
||||
}
|
||||
func (m *PingRes) XXX_Size() int {
|
||||
return xxx_messageInfo_PingRes.Size(m)
|
||||
}
|
||||
func (m *PingRes) XXX_DiscardUnknown() {
|
||||
xxx_messageInfo_PingRes.DiscardUnknown(m)
|
||||
}
|
||||
|
||||
var xxx_messageInfo_PingRes proto.InternalMessageInfo
|
||||
|
||||
func (m *PingRes) GetEcho() string {
|
||||
if m != nil {
|
||||
return m.Echo
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func init() {
|
||||
proto.RegisterType((*ListFilesystemReq)(nil), "ListFilesystemReq")
|
||||
proto.RegisterType((*ListFilesystemRes)(nil), "ListFilesystemRes")
|
||||
@ -1098,6 +1175,8 @@ func init() {
|
||||
proto.RegisterType((*ReplicationCursorReq_GetOp)(nil), "ReplicationCursorReq.GetOp")
|
||||
proto.RegisterType((*ReplicationCursorReq_SetOp)(nil), "ReplicationCursorReq.SetOp")
|
||||
proto.RegisterType((*ReplicationCursorRes)(nil), "ReplicationCursorRes")
|
||||
proto.RegisterType((*PingReq)(nil), "PingReq")
|
||||
proto.RegisterType((*PingRes)(nil), "PingRes")
|
||||
proto.RegisterEnum("FilesystemVersion_VersionType", FilesystemVersion_VersionType_name, FilesystemVersion_VersionType_value)
|
||||
}
|
||||
|
||||
@ -1113,6 +1192,7 @@ const _ = grpc.SupportPackageIsVersion4
|
||||
//
|
||||
// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream.
|
||||
type ReplicationClient interface {
|
||||
Ping(ctx context.Context, in *PingReq, opts ...grpc.CallOption) (*PingRes, error)
|
||||
ListFilesystems(ctx context.Context, in *ListFilesystemReq, opts ...grpc.CallOption) (*ListFilesystemRes, error)
|
||||
ListFilesystemVersions(ctx context.Context, in *ListFilesystemVersionsReq, opts ...grpc.CallOption) (*ListFilesystemVersionsRes, error)
|
||||
DestroySnapshots(ctx context.Context, in *DestroySnapshotsReq, opts ...grpc.CallOption) (*DestroySnapshotsRes, error)
|
||||
@ -1127,6 +1207,15 @@ func NewReplicationClient(cc *grpc.ClientConn) ReplicationClient {
|
||||
return &replicationClient{cc}
|
||||
}
|
||||
|
||||
func (c *replicationClient) Ping(ctx context.Context, in *PingReq, opts ...grpc.CallOption) (*PingRes, error) {
|
||||
out := new(PingRes)
|
||||
err := c.cc.Invoke(ctx, "/Replication/Ping", in, out, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *replicationClient) ListFilesystems(ctx context.Context, in *ListFilesystemReq, opts ...grpc.CallOption) (*ListFilesystemRes, error) {
|
||||
out := new(ListFilesystemRes)
|
||||
err := c.cc.Invoke(ctx, "/Replication/ListFilesystems", in, out, opts...)
|
||||
@ -1165,6 +1254,7 @@ func (c *replicationClient) ReplicationCursor(ctx context.Context, in *Replicati
|
||||
|
||||
// ReplicationServer is the server API for Replication service.
|
||||
type ReplicationServer interface {
|
||||
Ping(context.Context, *PingReq) (*PingRes, error)
|
||||
ListFilesystems(context.Context, *ListFilesystemReq) (*ListFilesystemRes, error)
|
||||
ListFilesystemVersions(context.Context, *ListFilesystemVersionsReq) (*ListFilesystemVersionsRes, error)
|
||||
DestroySnapshots(context.Context, *DestroySnapshotsReq) (*DestroySnapshotsRes, error)
|
||||
@ -1175,6 +1265,24 @@ func RegisterReplicationServer(s *grpc.Server, srv ReplicationServer) {
|
||||
s.RegisterService(&_Replication_serviceDesc, srv)
|
||||
}
|
||||
|
||||
func _Replication_Ping_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||
in := new(PingReq)
|
||||
if err := dec(in); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if interceptor == nil {
|
||||
return srv.(ReplicationServer).Ping(ctx, in)
|
||||
}
|
||||
info := &grpc.UnaryServerInfo{
|
||||
Server: srv,
|
||||
FullMethod: "/Replication/Ping",
|
||||
}
|
||||
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
|
||||
return srv.(ReplicationServer).Ping(ctx, req.(*PingReq))
|
||||
}
|
||||
return interceptor(ctx, in, info, handler)
|
||||
}
|
||||
|
||||
func _Replication_ListFilesystems_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||
in := new(ListFilesystemReq)
|
||||
if err := dec(in); err != nil {
|
||||
@ -1251,6 +1359,10 @@ var _Replication_serviceDesc = grpc.ServiceDesc{
|
||||
ServiceName: "Replication",
|
||||
HandlerType: (*ReplicationServer)(nil),
|
||||
Methods: []grpc.MethodDesc{
|
||||
{
|
||||
MethodName: "Ping",
|
||||
Handler: _Replication_Ping_Handler,
|
||||
},
|
||||
{
|
||||
MethodName: "ListFilesystems",
|
||||
Handler: _Replication_ListFilesystems_Handler,
|
||||
@ -1272,54 +1384,58 @@ var _Replication_serviceDesc = grpc.ServiceDesc{
|
||||
Metadata: "pdu.proto",
|
||||
}
|
||||
|
||||
func init() { proto.RegisterFile("pdu.proto", fileDescriptor_pdu_89315d819a6e0938) }
|
||||
func init() { proto.RegisterFile("pdu.proto", fileDescriptor_pdu_83b7e2a28d820622) }
|
||||
|
||||
var fileDescriptor_pdu_89315d819a6e0938 = []byte{
|
||||
// 735 bytes of a gzipped FileDescriptorProto
|
||||
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x55, 0xdd, 0x6e, 0xda, 0x4a,
|
||||
0x10, 0xc6, 0x60, 0xc0, 0x0c, 0x51, 0x42, 0x36, 0x9c, 0xc8, 0xc7, 0xe7, 0x28, 0x42, 0xdb, 0x1b,
|
||||
0x52, 0xa9, 0x6e, 0x45, 0x7b, 0x53, 0x55, 0xaa, 0x54, 0x42, 0x7e, 0xa4, 0x56, 0x69, 0xb4, 0xd0,
|
||||
0x28, 0xca, 0x1d, 0x0d, 0xa3, 0xc4, 0x0a, 0xb0, 0xce, 0xee, 0xba, 0x0a, 0xbd, 0xec, 0x7b, 0xf4,
|
||||
0x41, 0xfa, 0x0e, 0xbd, 0xec, 0x03, 0x55, 0xbb, 0x60, 0xe3, 0x60, 0x23, 0x71, 0xe5, 0xfd, 0xbe,
|
||||
0x9d, 0x9d, 0x9d, 0xf9, 0x76, 0x66, 0x0c, 0xb5, 0x70, 0x14, 0xf9, 0xa1, 0xe0, 0x8a, 0xd3, 0x3d,
|
||||
0xd8, 0xfd, 0x14, 0x48, 0x75, 0x12, 0x8c, 0x51, 0xce, 0xa4, 0xc2, 0x09, 0xc3, 0x07, 0x7a, 0x95,
|
||||
0x25, 0x25, 0x79, 0x01, 0xf5, 0x25, 0x21, 0x5d, 0xab, 0x55, 0x6a, 0xd7, 0x3b, 0x75, 0x3f, 0x65,
|
||||
0x94, 0xde, 0x27, 0x4d, 0x28, 0x1f, 0x4f, 0x42, 0x35, 0x73, 0x8b, 0x2d, 0xab, 0xed, 0xb0, 0x39,
|
||||
0xa0, 0x5d, 0x80, 0xa5, 0x11, 0x21, 0x60, 0x5f, 0x0c, 0xd5, 0x9d, 0x6b, 0xb5, 0xac, 0x76, 0x8d,
|
||||
0x99, 0x35, 0x69, 0x41, 0x9d, 0xa1, 0x8c, 0x26, 0x38, 0xe0, 0xf7, 0x38, 0x35, 0xa7, 0x6b, 0x2c,
|
||||
0x4d, 0xd1, 0x77, 0xf0, 0xef, 0xd3, 0xe8, 0x2e, 0x51, 0xc8, 0x80, 0x4f, 0x25, 0xc3, 0x07, 0x72,
|
||||
0x90, 0xbe, 0x60, 0xe1, 0x38, 0xc5, 0xd0, 0x8f, 0xeb, 0x0f, 0x4b, 0xe2, 0x83, 0x13, 0xc3, 0x45,
|
||||
0x7e, 0xc4, 0xcf, 0x58, 0xb2, 0xc4, 0x86, 0xfe, 0xb1, 0x60, 0x37, 0xb3, 0x4f, 0x3a, 0x60, 0x0f,
|
||||
0x66, 0x21, 0x9a, 0xcb, 0xb7, 0x3b, 0x07, 0x59, 0x0f, 0xfe, 0xe2, 0xab, 0xad, 0x98, 0xb1, 0xd5,
|
||||
0x4a, 0x9c, 0x0f, 0x27, 0xb8, 0x48, 0xd7, 0xac, 0x35, 0x77, 0x1a, 0x05, 0x23, 0xb7, 0xd4, 0xb2,
|
||||
0xda, 0x36, 0x33, 0x6b, 0xf2, 0x3f, 0xd4, 0x8e, 0x04, 0x0e, 0x15, 0x0e, 0xae, 0x4e, 0x5d, 0xdb,
|
||||
0x6c, 0x2c, 0x09, 0xe2, 0x81, 0x63, 0x40, 0xc0, 0xa7, 0x6e, 0xd9, 0x78, 0x4a, 0x30, 0x3d, 0x84,
|
||||
0x7a, 0xea, 0x5a, 0xb2, 0x05, 0x4e, 0x7f, 0x3a, 0x0c, 0xe5, 0x1d, 0x57, 0x8d, 0x82, 0x46, 0x5d,
|
||||
0xce, 0xef, 0x27, 0x43, 0x71, 0xdf, 0xb0, 0xe8, 0x2f, 0x0b, 0xaa, 0x7d, 0x9c, 0x8e, 0x36, 0xd0,
|
||||
0x53, 0x07, 0x79, 0x22, 0xf8, 0x24, 0x0e, 0x5c, 0xaf, 0xc9, 0x36, 0x14, 0x07, 0xdc, 0x84, 0x5d,
|
||||
0x63, 0xc5, 0x01, 0x5f, 0x7d, 0x52, 0x3b, 0xf3, 0xa4, 0x26, 0x70, 0x3e, 0x09, 0x05, 0x4a, 0x69,
|
||||
0x02, 0x77, 0x58, 0x82, 0x75, 0x21, 0xf5, 0x70, 0x14, 0x85, 0x6e, 0x65, 0x5e, 0x48, 0x06, 0x90,
|
||||
0x7d, 0xa8, 0xf4, 0xc4, 0x8c, 0x45, 0x53, 0xb7, 0x6a, 0xe8, 0x05, 0xa2, 0x6f, 0xc0, 0xb9, 0x10,
|
||||
0x3c, 0x44, 0xa1, 0x66, 0x89, 0xa8, 0x56, 0x4a, 0xd4, 0x26, 0x94, 0x2f, 0x87, 0xe3, 0x28, 0x56,
|
||||
0x7a, 0x0e, 0xe8, 0x8f, 0x24, 0x63, 0x49, 0xda, 0xb0, 0xf3, 0x45, 0xe2, 0x68, 0xb5, 0x08, 0x1d,
|
||||
0xb6, 0x4a, 0x13, 0x0a, 0x5b, 0xc7, 0x8f, 0x21, 0xde, 0x28, 0x1c, 0xf5, 0x83, 0xef, 0x68, 0x32,
|
||||
0x2e, 0xb1, 0x27, 0x1c, 0x39, 0x04, 0x58, 0xc4, 0x13, 0xa0, 0x74, 0x6d, 0x53, 0x54, 0x35, 0x3f,
|
||||
0x0e, 0x91, 0xa5, 0x36, 0xe9, 0x15, 0x00, 0xc3, 0x1b, 0x0c, 0xbe, 0xe1, 0x26, 0xc2, 0x3f, 0x87,
|
||||
0xc6, 0xd1, 0x18, 0x87, 0x22, 0x1b, 0x67, 0x86, 0xa7, 0x5b, 0x29, 0xcf, 0x92, 0xde, 0xc2, 0x5e,
|
||||
0x0f, 0xa5, 0x12, 0x7c, 0x16, 0x57, 0xc0, 0x26, 0x9d, 0x43, 0x5e, 0x41, 0x2d, 0xb1, 0x77, 0x8b,
|
||||
0x6b, 0xbb, 0x63, 0x69, 0x44, 0xaf, 0x81, 0xac, 0x5c, 0xb4, 0x68, 0xb2, 0x18, 0x9a, 0x5b, 0xd6,
|
||||
0x34, 0x59, 0x6c, 0x63, 0x06, 0x89, 0x10, 0x5c, 0xc4, 0x2f, 0x66, 0x00, 0xed, 0xe5, 0x25, 0xa1,
|
||||
0x87, 0x54, 0x55, 0x27, 0x3e, 0x56, 0x71, 0x03, 0xef, 0xf9, 0xd9, 0x10, 0x58, 0x6c, 0x43, 0x7f,
|
||||
0x5b, 0xd0, 0x64, 0x18, 0x8e, 0x83, 0x1b, 0xd3, 0x24, 0x47, 0x91, 0x90, 0x5c, 0x6c, 0x22, 0xc6,
|
||||
0x4b, 0x28, 0xdd, 0xa2, 0x32, 0x21, 0xd5, 0x3b, 0xff, 0xf9, 0x79, 0x3e, 0xfc, 0x53, 0x54, 0x9f,
|
||||
0xc3, 0xb3, 0x02, 0xd3, 0x96, 0xfa, 0x80, 0x44, 0x65, 0x4a, 0x64, 0xed, 0x81, 0x7e, 0x7c, 0x40,
|
||||
0xa2, 0xf2, 0xaa, 0x50, 0x36, 0x0e, 0xbc, 0x67, 0x50, 0x36, 0x1b, 0xba, 0x49, 0x12, 0xe1, 0xe6,
|
||||
0x5a, 0x24, 0xb8, 0x6b, 0x43, 0x91, 0x87, 0x74, 0x90, 0x9b, 0x8d, 0x6e, 0xa1, 0xf9, 0x24, 0xd1,
|
||||
0x79, 0xd8, 0x67, 0x85, 0x64, 0x96, 0x38, 0xe7, 0x5c, 0xe1, 0x63, 0x20, 0xe7, 0xfe, 0x9c, 0xb3,
|
||||
0x02, 0x4b, 0x98, 0xae, 0x03, 0x95, 0xb9, 0x4a, 0x9d, 0x9f, 0x45, 0xdd, 0xbf, 0x89, 0x5b, 0xf2,
|
||||
0x16, 0x76, 0x9e, 0x8e, 0x50, 0x49, 0x88, 0x9f, 0xf9, 0x89, 0x78, 0x59, 0x4e, 0x92, 0x0b, 0xd8,
|
||||
0xcf, 0x9f, 0xbe, 0xc4, 0xf3, 0xd7, 0xce, 0x74, 0x6f, 0xfd, 0x9e, 0x24, 0xef, 0xa1, 0xb1, 0x5a,
|
||||
0x07, 0xa4, 0xe9, 0xe7, 0xd4, 0xb7, 0x97, 0xc7, 0x4a, 0xf2, 0x01, 0x76, 0x33, 0x92, 0x91, 0x7f,
|
||||
0x72, 0xdf, 0xc7, 0xcb, 0xa5, 0x65, 0xb7, 0x7c, 0x5d, 0x0a, 0x47, 0xd1, 0xd7, 0x8a, 0xf9, 0xa1,
|
||||
0xbe, 0xfe, 0x1b, 0x00, 0x00, 0xff, 0xff, 0xa3, 0xba, 0x8e, 0x63, 0x5d, 0x07, 0x00, 0x00,
|
||||
var fileDescriptor_pdu_83b7e2a28d820622 = []byte{
|
||||
// 785 bytes of a gzipped FileDescriptorProto
|
||||
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x55, 0xd1, 0x8e, 0xe3, 0x34,
|
||||
0x14, 0x9d, 0xb4, 0x69, 0x9b, 0xde, 0x0e, 0xbb, 0x1d, 0x4f, 0x59, 0x85, 0x00, 0xab, 0xca, 0xcb,
|
||||
0x43, 0x17, 0x89, 0x80, 0x0a, 0x2f, 0x08, 0x09, 0x89, 0x4e, 0x67, 0x67, 0x10, 0xb0, 0x54, 0x6e,
|
||||
0x59, 0xad, 0xf6, 0x2d, 0x34, 0x57, 0x6d, 0x34, 0x6d, 0x9d, 0xb5, 0x13, 0xb4, 0xe5, 0x91, 0xbf,
|
||||
0x9a, 0x7f, 0xe0, 0x91, 0x0f, 0x42, 0x76, 0xe3, 0x34, 0x6d, 0x52, 0xa9, 0x4f, 0xf1, 0x39, 0xf7,
|
||||
0xda, 0x3e, 0xf7, 0xd8, 0xd7, 0x81, 0x76, 0x1c, 0xa6, 0x7e, 0x2c, 0x78, 0xc2, 0xe9, 0x35, 0x5c,
|
||||
0xfd, 0x1a, 0xc9, 0xe4, 0x55, 0xb4, 0x42, 0xb9, 0x95, 0x09, 0xae, 0x19, 0xbe, 0xa7, 0xa3, 0x32,
|
||||
0x29, 0xc9, 0x57, 0xd0, 0xd9, 0x13, 0xd2, 0xb5, 0xfa, 0xf5, 0x41, 0x67, 0xd8, 0xf1, 0x0b, 0x49,
|
||||
0xc5, 0x38, 0x5d, 0x02, 0xec, 0x21, 0x21, 0x60, 0x4f, 0x82, 0x64, 0xe9, 0x5a, 0x7d, 0x6b, 0xd0,
|
||||
0x66, 0x7a, 0x4c, 0xfa, 0xd0, 0x61, 0x28, 0xd3, 0x35, 0xce, 0xf8, 0x03, 0x6e, 0xdc, 0x9a, 0x0e,
|
||||
0x15, 0x29, 0xf2, 0x05, 0x7c, 0xf4, 0xb3, 0x9c, 0xac, 0x82, 0x39, 0x2e, 0xf9, 0x2a, 0x44, 0xe1,
|
||||
0xd6, 0xfb, 0xd6, 0xc0, 0x61, 0x87, 0x24, 0xfd, 0x01, 0x3e, 0x39, 0x54, 0xfb, 0x06, 0x85, 0x8c,
|
||||
0xf8, 0x46, 0x32, 0x7c, 0x4f, 0x9e, 0x17, 0x65, 0x64, 0xdb, 0x17, 0x18, 0xfa, 0xcb, 0xe9, 0xc9,
|
||||
0x92, 0xf8, 0xe0, 0x18, 0x98, 0xd5, 0x4b, 0xfc, 0x52, 0x26, 0xcb, 0x73, 0xe8, 0x7f, 0x16, 0x5c,
|
||||
0x95, 0xe2, 0x64, 0x08, 0xf6, 0x6c, 0x1b, 0xa3, 0xde, 0xfc, 0xc9, 0xf0, 0x79, 0x79, 0x05, 0x3f,
|
||||
0xfb, 0xaa, 0x2c, 0xa6, 0x73, 0x95, 0x5f, 0xaf, 0x83, 0x35, 0x66, 0xa6, 0xe8, 0xb1, 0xe2, 0xee,
|
||||
0xd2, 0x28, 0xd4, 0x26, 0xd8, 0x4c, 0x8f, 0xc9, 0x67, 0xd0, 0xbe, 0x11, 0x18, 0x24, 0x38, 0x7b,
|
||||
0x7b, 0xe7, 0xda, 0x3a, 0xb0, 0x27, 0x88, 0x07, 0x8e, 0x06, 0x11, 0xdf, 0xb8, 0x0d, 0xbd, 0x52,
|
||||
0x8e, 0xe9, 0x4b, 0xe8, 0x14, 0xb6, 0x25, 0x97, 0xe0, 0x4c, 0x37, 0x41, 0x2c, 0x97, 0x3c, 0xe9,
|
||||
0x5e, 0x28, 0x34, 0xe2, 0xfc, 0x61, 0x1d, 0x88, 0x87, 0xae, 0x45, 0x1f, 0x2d, 0x68, 0x4d, 0x71,
|
||||
0x13, 0x9e, 0xe1, 0xa7, 0x12, 0xf9, 0x4a, 0xf0, 0xb5, 0x11, 0xae, 0xc6, 0xe4, 0x09, 0xd4, 0x66,
|
||||
0x5c, 0xcb, 0x6e, 0xb3, 0xda, 0x8c, 0x1f, 0x1f, 0xbc, 0x5d, 0x3e, 0x78, 0x25, 0x9c, 0xaf, 0x63,
|
||||
0x81, 0x52, 0x6a, 0xe1, 0x0e, 0xcb, 0x31, 0xe9, 0x41, 0x63, 0x8c, 0x61, 0x1a, 0xbb, 0x4d, 0x1d,
|
||||
0xd8, 0x01, 0xf2, 0x0c, 0x9a, 0x63, 0xb1, 0x65, 0xe9, 0xc6, 0x6d, 0x69, 0x3a, 0x43, 0xf4, 0x3b,
|
||||
0x70, 0x26, 0x82, 0xc7, 0x28, 0x92, 0x6d, 0x6e, 0xaa, 0x55, 0x30, 0xb5, 0x07, 0x8d, 0x37, 0xc1,
|
||||
0x2a, 0x35, 0x4e, 0xef, 0x00, 0xfd, 0x27, 0xaf, 0x58, 0x92, 0x01, 0x3c, 0xfd, 0x43, 0x62, 0x78,
|
||||
0x7c, 0x55, 0x1d, 0x76, 0x4c, 0x13, 0x0a, 0x97, 0xb7, 0x1f, 0x62, 0x9c, 0x27, 0x18, 0x4e, 0xa3,
|
||||
0xbf, 0x51, 0x57, 0x5c, 0x67, 0x07, 0x1c, 0x79, 0x09, 0x90, 0xe9, 0x89, 0x50, 0xba, 0xb6, 0xbe,
|
||||
0x54, 0x6d, 0xdf, 0x48, 0x64, 0x85, 0x20, 0x7d, 0x0b, 0xc0, 0x70, 0x8e, 0xd1, 0x5f, 0x78, 0x8e,
|
||||
0xf1, 0x5f, 0x42, 0xf7, 0x66, 0x85, 0x81, 0x28, 0xeb, 0x2c, 0xf1, 0xf4, 0xb2, 0xb0, 0xb2, 0xa4,
|
||||
0x0b, 0xb8, 0x1e, 0xa3, 0x4c, 0x04, 0xdf, 0x9a, 0x1b, 0x70, 0x4e, 0xe7, 0x90, 0x6f, 0xa0, 0x9d,
|
||||
0xe7, 0xbb, 0xb5, 0x93, 0xdd, 0xb1, 0x4f, 0xa2, 0xef, 0x80, 0x1c, 0x6d, 0x94, 0x35, 0x99, 0x81,
|
||||
0x7a, 0x97, 0x13, 0x4d, 0x66, 0x72, 0xd4, 0x89, 0xdd, 0x0a, 0xc1, 0x85, 0x39, 0x31, 0x0d, 0xe8,
|
||||
0xb8, 0xaa, 0x08, 0xf5, 0x68, 0xb5, 0x54, 0xe1, 0xab, 0xc4, 0x34, 0xf0, 0xb5, 0x5f, 0x96, 0xc0,
|
||||
0x4c, 0x0e, 0xfd, 0xd7, 0x82, 0x1e, 0xc3, 0x78, 0x15, 0xcd, 0x75, 0x93, 0xdc, 0xa4, 0x42, 0x72,
|
||||
0x71, 0x8e, 0x19, 0x5f, 0x43, 0x7d, 0x81, 0x89, 0x96, 0xd4, 0x19, 0x7e, 0xea, 0x57, 0xad, 0xe1,
|
||||
0xdf, 0x61, 0xf2, 0x7b, 0x7c, 0x7f, 0xc1, 0x54, 0xa6, 0x9a, 0x20, 0x31, 0xd1, 0x57, 0xe4, 0xe4,
|
||||
0x84, 0xa9, 0x99, 0x20, 0x31, 0xf1, 0x5a, 0xd0, 0xd0, 0x0b, 0x78, 0x2f, 0xa0, 0xa1, 0x03, 0xaa,
|
||||
0x49, 0x72, 0xe3, 0x76, 0x5e, 0xe4, 0x78, 0x64, 0x43, 0x8d, 0xc7, 0x74, 0x56, 0x59, 0x8d, 0x6a,
|
||||
0xa1, 0xdd, 0x4b, 0xa2, 0xea, 0xb0, 0xef, 0x2f, 0xf2, 0xb7, 0xc4, 0x79, 0xcd, 0x13, 0xfc, 0x10,
|
||||
0xc9, 0xdd, 0x7a, 0xce, 0xfd, 0x05, 0xcb, 0x99, 0x91, 0x03, 0xcd, 0x9d, 0x4b, 0xf4, 0x05, 0xb4,
|
||||
0x26, 0xd1, 0x66, 0xa1, 0x6c, 0x71, 0xa1, 0xf5, 0x1b, 0x4a, 0x19, 0x2c, 0x4c, 0x53, 0x19, 0x48,
|
||||
0x3f, 0x37, 0x49, 0x52, 0xb5, 0xdd, 0xed, 0x7c, 0xc9, 0x4d, 0xdb, 0xa9, 0xf1, 0xf0, 0xb1, 0xa6,
|
||||
0xde, 0x80, 0x5c, 0x1a, 0xf1, 0xc0, 0x56, 0xe9, 0xc4, 0xf1, 0xb3, 0xa5, 0x3d, 0x33, 0x92, 0xe4,
|
||||
0x7b, 0x78, 0x7a, 0xf8, 0x44, 0x4b, 0x42, 0xfc, 0xd2, 0x4f, 0xcb, 0x2b, 0x73, 0x92, 0x4c, 0xe0,
|
||||
0x59, 0xf5, 0xeb, 0x4e, 0x3c, 0xff, 0xe4, 0x3f, 0xc3, 0x3b, 0x1d, 0x93, 0xe4, 0x47, 0xe8, 0x1e,
|
||||
0xdf, 0x33, 0xd2, 0xf3, 0x2b, 0xfa, 0xc7, 0xab, 0x62, 0x25, 0xf9, 0x09, 0xae, 0x4a, 0x47, 0x42,
|
||||
0x3e, 0xae, 0x3c, 0x7f, 0xaf, 0x92, 0x96, 0xa3, 0xc6, 0xbb, 0x7a, 0x1c, 0xa6, 0x7f, 0x36, 0xf5,
|
||||
0x0f, 0xfc, 0xdb, 0xff, 0x03, 0x00, 0x00, 0xff, 0xff, 0x37, 0x0e, 0xf2, 0xe4, 0xcd, 0x07, 0x00,
|
||||
0x00,
|
||||
}
|
@ -2,6 +2,7 @@ syntax = "proto3";
|
||||
option go_package = "pdu";
|
||||
|
||||
service Replication {
|
||||
rpc Ping (PingReq) returns (PingRes);
|
||||
rpc ListFilesystems (ListFilesystemReq) returns (ListFilesystemRes);
|
||||
rpc ListFilesystemVersions (ListFilesystemVersionsReq) returns (ListFilesystemVersionsRes);
|
||||
rpc DestroySnapshots (DestroySnapshotsReq) returns (DestroySnapshotsRes);
|
||||
@ -13,12 +14,12 @@ message ListFilesystemReq {}
|
||||
|
||||
message ListFilesystemRes {
|
||||
repeated Filesystem Filesystems = 1;
|
||||
bool Empty = 2;
|
||||
}
|
||||
|
||||
message Filesystem {
|
||||
string Path = 1;
|
||||
string ResumeToken = 2;
|
||||
bool IsPlaceholder = 3;
|
||||
}
|
||||
|
||||
message ListFilesystemVersionsReq {
|
||||
@ -120,3 +121,12 @@ message ReplicationCursorRes {
|
||||
bool Notexist = 2;
|
||||
}
|
||||
}
|
||||
|
||||
message PingReq {
|
||||
string Message = 1;
|
||||
}
|
||||
|
||||
message PingRes {
|
||||
// Echo must be PingReq.Message
|
||||
string Echo = 1;
|
||||
}
|
495
replication/logic/replication_logic.go
Normal file
495
replication/logic/replication_logic.go
Normal file
@ -0,0 +1,495 @@
|
||||
package logic
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"github.com/zrepl/zrepl/replication/driver"
|
||||
. "github.com/zrepl/zrepl/replication/logic/diff"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/replication/report"
|
||||
"github.com/zrepl/zrepl/util/bytecounter"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
)
|
||||
|
||||
// Endpoint represents one side of the replication.
|
||||
//
|
||||
// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
|
||||
// named interfaces defined in this package.
|
||||
type Endpoint interface {
|
||||
// Does not include placeholder filesystems
|
||||
ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
|
||||
ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
|
||||
DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
|
||||
WaitForConnectivity(ctx context.Context) (error)
|
||||
}
|
||||
|
||||
type Sender interface {
|
||||
Endpoint
|
||||
// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
|
||||
// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
|
||||
// If the send request is for dry run the io.ReadCloser will be nil
|
||||
Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
|
||||
ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
|
||||
}
|
||||
|
||||
type Receiver interface {
|
||||
Endpoint
|
||||
// Receive sends r and sendStream (the latter containing a ZFS send stream)
|
||||
// to the parent github.com/zrepl/zrepl/replication.Endpoint.
|
||||
Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
|
||||
}
|
||||
|
||||
type Planner struct {
|
||||
sender Sender
|
||||
receiver Receiver
|
||||
|
||||
promSecsPerState *prometheus.HistogramVec // labels: state
|
||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||
}
|
||||
|
||||
func (p *Planner) Plan(ctx context.Context) ([]driver.FS, error) {
|
||||
fss, err := p.doPlanning(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dfss := make([]driver.FS, len(fss))
|
||||
for i := range dfss {
|
||||
dfss[i] = fss[i]
|
||||
}
|
||||
return dfss, nil
|
||||
}
|
||||
|
||||
func (p *Planner) WaitForConnectivity(ctx context.Context) error {
|
||||
var wg sync.WaitGroup
|
||||
doPing := func(endpoint Endpoint, errOut *error) {
|
||||
defer wg.Done()
|
||||
err := endpoint.WaitForConnectivity(ctx)
|
||||
if err != nil {
|
||||
*errOut = err
|
||||
} else {
|
||||
*errOut = nil
|
||||
}
|
||||
}
|
||||
wg.Add(2)
|
||||
var senderErr, receiverErr error
|
||||
go doPing(p.sender, &senderErr)
|
||||
go doPing(p.receiver, &receiverErr)
|
||||
wg.Wait()
|
||||
if senderErr == nil && receiverErr == nil {
|
||||
return nil
|
||||
} else if senderErr != nil && receiverErr != nil {
|
||||
if senderErr.Error() == receiverErr.Error() {
|
||||
return fmt.Errorf("sender and receiver are not reachable: %s", senderErr.Error())
|
||||
} else {
|
||||
return fmt.Errorf("sender and receiver are not reachable:\n sender: %s\n receiver: %s", senderErr, receiverErr)
|
||||
}
|
||||
} else {
|
||||
var side string
|
||||
var err *error
|
||||
if senderErr != nil {
|
||||
side = "sender"
|
||||
err = &senderErr
|
||||
} else {
|
||||
side = "receiver"
|
||||
err = &receiverErr
|
||||
}
|
||||
return fmt.Errorf("%s is not reachable: %s", side, *err)
|
||||
}
|
||||
}
|
||||
|
||||
type Filesystem struct {
|
||||
sender Sender
|
||||
receiver Receiver
|
||||
|
||||
Path string // compat
|
||||
receiverFS *pdu.Filesystem
|
||||
promBytesReplicated prometheus.Counter // compat
|
||||
}
|
||||
|
||||
func (f *Filesystem) EqualToPreviousAttempt(other driver.FS) bool {
|
||||
g, ok := other.(*Filesystem)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
// TODO: use GUIDs (issued by zrepl, not those from ZFS)
|
||||
return f.Path == g.Path
|
||||
}
|
||||
|
||||
func (f *Filesystem) PlanFS(ctx context.Context) ([]driver.Step, error) {
|
||||
steps, err := f.doPlanning(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dsteps := make([]driver.Step, len(steps))
|
||||
for i := range dsteps {
|
||||
dsteps[i] = steps[i]
|
||||
}
|
||||
return dsteps, nil
|
||||
}
|
||||
func (f *Filesystem) ReportInfo() *report.FilesystemInfo {
|
||||
return &report.FilesystemInfo{Name: f.Path} // FIXME compat name
|
||||
}
|
||||
|
||||
type Step struct {
|
||||
sender Sender
|
||||
receiver Receiver
|
||||
|
||||
parent *Filesystem
|
||||
from, to *pdu.FilesystemVersion // compat
|
||||
|
||||
byteCounter bytecounter.StreamCopier
|
||||
expectedSize int64 // 0 means no size estimate present / possible
|
||||
}
|
||||
|
||||
func (s *Step) TargetEquals(other driver.Step) bool {
|
||||
t, ok := other.(*Step)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
if !s.parent.EqualToPreviousAttempt(t.parent) {
|
||||
panic("Step interface promise broken: parent filesystems must be same")
|
||||
}
|
||||
return s.from.GetGuid() == t.from.GetGuid() &&
|
||||
s.to.GetGuid() == t.to.GetGuid()
|
||||
}
|
||||
|
||||
func (s *Step) TargetDate() time.Time {
|
||||
return s.to.SnapshotTime() // FIXME compat name
|
||||
}
|
||||
|
||||
func (s *Step) Step(ctx context.Context) error {
|
||||
return s.doReplication(ctx)
|
||||
}
|
||||
|
||||
func (s *Step) ReportInfo() *report.StepInfo {
|
||||
var byteCounter int64
|
||||
if s.byteCounter != nil {
|
||||
byteCounter = s.byteCounter.Count()
|
||||
}
|
||||
// FIXME stick to zfs convention of from and to
|
||||
from := ""
|
||||
if s.from != nil {
|
||||
from = s.from.RelName()
|
||||
}
|
||||
return &report.StepInfo{
|
||||
From: from,
|
||||
To: s.to.RelName(),
|
||||
BytesExpected: s.expectedSize,
|
||||
BytesReplicated: byteCounter,
|
||||
}
|
||||
}
|
||||
|
||||
func NewPlanner(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec, sender Sender, receiver Receiver) *Planner {
|
||||
return &Planner{
|
||||
sender: sender,
|
||||
receiver: receiver,
|
||||
promSecsPerState: secsPerState,
|
||||
promBytesReplicated: bytesReplicated,
|
||||
}
|
||||
}
|
||||
func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
|
||||
if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
|
||||
if len(noCommonAncestor.SortedReceiverVersions) == 0 {
|
||||
// TODO this is hard-coded replication policy: most recent snapshot as source
|
||||
var mostRecentSnap *pdu.FilesystemVersion
|
||||
for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
|
||||
if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
|
||||
mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
|
||||
break
|
||||
}
|
||||
}
|
||||
if mostRecentSnap == nil {
|
||||
return nil, "no snapshots available on sender side"
|
||||
}
|
||||
return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
|
||||
}
|
||||
}
|
||||
return nil, "no automated way to handle conflict type"
|
||||
}
|
||||
|
||||
func (p *Planner) doPlanning(ctx context.Context) ([]*Filesystem, error) {
|
||||
|
||||
log := getLogger(ctx)
|
||||
|
||||
log.Info("start planning")
|
||||
|
||||
slfssres, err := p.sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
|
||||
return nil, err
|
||||
}
|
||||
sfss := slfssres.GetFilesystems()
|
||||
// no progress here since we could run in a live-lock on connectivity issues
|
||||
|
||||
rlfssres, err := p.receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
|
||||
return nil, err
|
||||
}
|
||||
rfss := rlfssres.GetFilesystems()
|
||||
|
||||
q := make([]*Filesystem, 0, len(sfss))
|
||||
for _, fs := range sfss {
|
||||
|
||||
var receiverFS *pdu.Filesystem
|
||||
for _, rfs := range rfss {
|
||||
if rfs.Path == fs.Path {
|
||||
receiverFS = rfs
|
||||
}
|
||||
}
|
||||
|
||||
ctr := p.promBytesReplicated.WithLabelValues(fs.Path)
|
||||
|
||||
q = append(q, &Filesystem{
|
||||
sender: p.sender,
|
||||
receiver: p.receiver,
|
||||
Path: fs.Path,
|
||||
receiverFS: receiverFS,
|
||||
promBytesReplicated: ctr,
|
||||
})
|
||||
}
|
||||
|
||||
return q, nil
|
||||
}
|
||||
|
||||
func (fs *Filesystem) doPlanning(ctx context.Context) ([]*Step, error) {
|
||||
|
||||
log := getLogger(ctx).WithField("filesystem", fs.Path)
|
||||
|
||||
log.Debug("assessing filesystem")
|
||||
|
||||
sfsvsres, err := fs.sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
|
||||
if err != nil {
|
||||
log.WithError(err).Error("cannot get remote filesystem versions")
|
||||
return nil, err
|
||||
}
|
||||
sfsvs := sfsvsres.GetVersions()
|
||||
|
||||
if len(sfsvs) < 1 {
|
||||
err := errors.New("sender does not have any versions")
|
||||
log.Error(err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var rfsvs []*pdu.FilesystemVersion
|
||||
if fs.receiverFS != nil && !fs.receiverFS.GetIsPlaceholder() {
|
||||
rfsvsres, err := fs.receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
|
||||
if err != nil {
|
||||
log.WithError(err).Error("receiver error")
|
||||
return nil, err
|
||||
}
|
||||
rfsvs = rfsvsres.GetVersions()
|
||||
} else {
|
||||
rfsvs = []*pdu.FilesystemVersion{}
|
||||
}
|
||||
|
||||
path, conflict := IncrementalPath(rfsvs, sfsvs)
|
||||
if conflict != nil {
|
||||
var msg string
|
||||
path, msg = resolveConflict(conflict) // no shadowing allowed!
|
||||
if path != nil {
|
||||
log.WithField("conflict", conflict).Info("conflict")
|
||||
log.WithField("resolution", msg).Info("automatically resolved")
|
||||
} else {
|
||||
log.WithField("conflict", conflict).Error("conflict")
|
||||
log.WithField("problem", msg).Error("cannot resolve conflict")
|
||||
}
|
||||
}
|
||||
if len(path) == 0 {
|
||||
return nil, conflict
|
||||
}
|
||||
|
||||
steps := make([]*Step, 0, len(path))
|
||||
// FIXME unify struct declarations => initializer?
|
||||
if len(path) == 1 {
|
||||
steps = append(steps, &Step{
|
||||
parent: fs,
|
||||
sender: fs.sender,
|
||||
receiver: fs.receiver,
|
||||
from: nil,
|
||||
to: path[0],
|
||||
})
|
||||
} else {
|
||||
for i := 0; i < len(path)-1; i++ {
|
||||
steps = append(steps, &Step{
|
||||
parent: fs,
|
||||
sender: fs.sender,
|
||||
receiver: fs.receiver,
|
||||
from: path[i],
|
||||
to: path[i+1],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug("compute send size estimate")
|
||||
errs := make(chan error, len(steps))
|
||||
var wg sync.WaitGroup
|
||||
fanOutCtx, fanOutCancel := context.WithCancel(ctx)
|
||||
defer fanOutCancel()
|
||||
for _, step := range steps {
|
||||
wg.Add(1)
|
||||
go func(step *Step) {
|
||||
defer wg.Done()
|
||||
err := step.updateSizeEstimate(fanOutCtx)
|
||||
if err != nil {
|
||||
log.WithError(err).WithField("step", step).Error("error computing size estimate")
|
||||
fanOutCancel()
|
||||
}
|
||||
errs <- err
|
||||
}(step)
|
||||
}
|
||||
wg.Wait()
|
||||
close(errs)
|
||||
var significantErr error = nil
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
if significantErr == nil || significantErr == context.Canceled {
|
||||
significantErr = err
|
||||
}
|
||||
}
|
||||
}
|
||||
if significantErr != nil {
|
||||
return nil, significantErr
|
||||
}
|
||||
|
||||
log.Debug("filesystem planning finished")
|
||||
return steps, nil
|
||||
}
|
||||
|
||||
// type FilesystemsReplicationFailedError struct {
|
||||
// FilesystemsWithError []*fsrep.Replication
|
||||
// }
|
||||
|
||||
// func (e FilesystemsReplicationFailedError) Error() string {
|
||||
// allSame := true
|
||||
// lastErr := e.FilesystemsWithError[0].Err().Error()
|
||||
// for _, fs := range e.FilesystemsWithError {
|
||||
// fsErr := fs.Err().Error()
|
||||
// allSame = allSame && lastErr == fsErr
|
||||
// }
|
||||
|
||||
// fsstr := "multiple filesystems"
|
||||
// if len(e.FilesystemsWithError) == 1 {
|
||||
// fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
|
||||
// }
|
||||
// errorStr := lastErr
|
||||
// if !allSame {
|
||||
// errorStr = "multiple different errors"
|
||||
// }
|
||||
// return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
|
||||
// }
|
||||
|
||||
func (s *Step) updateSizeEstimate(ctx context.Context) error {
|
||||
|
||||
log := getLogger(ctx)
|
||||
|
||||
sr := s.buildSendRequest(true)
|
||||
|
||||
log.Debug("initiate dry run send request")
|
||||
sres, _, err := s.sender.Send(ctx, sr)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("dry run send request failed")
|
||||
return err
|
||||
}
|
||||
s.expectedSize = sres.ExpectedSize
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Step) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
|
||||
fs := s.parent.Path
|
||||
if s.from == nil {
|
||||
sr = &pdu.SendReq{
|
||||
Filesystem: fs,
|
||||
To: s.to.RelName(),
|
||||
DryRun: dryRun,
|
||||
}
|
||||
} else {
|
||||
sr = &pdu.SendReq{
|
||||
Filesystem: fs,
|
||||
From: s.from.RelName(),
|
||||
To: s.to.RelName(),
|
||||
DryRun: dryRun,
|
||||
}
|
||||
}
|
||||
return sr
|
||||
}
|
||||
|
||||
func (s *Step) doReplication(ctx context.Context) error {
|
||||
|
||||
fs := s.parent.Path
|
||||
|
||||
log := getLogger(ctx)
|
||||
sr := s.buildSendRequest(false)
|
||||
|
||||
log.Debug("initiate send request")
|
||||
sres, sstreamCopier, err := s.sender.Send(ctx, sr)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("send request failed")
|
||||
return err
|
||||
}
|
||||
if sstreamCopier == nil {
|
||||
err := errors.New("send request did not return a stream, broken endpoint implementation")
|
||||
return err
|
||||
}
|
||||
defer sstreamCopier.Close()
|
||||
|
||||
// Install a byte counter to track progress + for status report
|
||||
s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
|
||||
defer func() {
|
||||
s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
|
||||
}()
|
||||
|
||||
rr := &pdu.ReceiveReq{
|
||||
Filesystem: fs,
|
||||
ClearResumeToken: !sres.UsedResumeToken,
|
||||
}
|
||||
log.Debug("initiate receive request")
|
||||
_, err = s.receiver.Receive(ctx, rr, s.byteCounter)
|
||||
if err != nil {
|
||||
log.
|
||||
WithError(err).
|
||||
WithField("errType", fmt.Sprintf("%T", err)).
|
||||
Error("receive request failed (might also be error on sender)")
|
||||
// This failure could be due to
|
||||
// - an unexpected exit of ZFS on the sending side
|
||||
// - an unexpected exit of ZFS on the receiving side
|
||||
// - a connectivity issue
|
||||
return err
|
||||
}
|
||||
log.Debug("receive finished")
|
||||
|
||||
log.Debug("advance replication cursor")
|
||||
req := &pdu.ReplicationCursorReq{
|
||||
Filesystem: fs,
|
||||
Op: &pdu.ReplicationCursorReq_Set{
|
||||
Set: &pdu.ReplicationCursorReq_SetOp{
|
||||
Snapshot: s.to.GetName(),
|
||||
},
|
||||
},
|
||||
}
|
||||
_, err = s.sender.ReplicationCursor(ctx, req)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error advancing replication cursor")
|
||||
// If this fails and replication planning restarts, the diff algorithm will find
|
||||
// that cursor out of place. This is not a problem because then, it would just use another FS
|
||||
// However, we FIXME have no means to just update the cursor in a
|
||||
// second replication attempt right after this one where we don't have new snaps yet
|
||||
return err
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Step) String() string {
|
||||
if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
|
||||
return fmt.Sprintf("%s%s (full)", s.parent.Path, s.to.RelName())
|
||||
} else {
|
||||
return fmt.Sprintf("%s(%s => %s)", s.parent.Path, s.from.RelName(), s.to.RelName())
|
||||
}
|
||||
}
|
@ -1,9 +1,9 @@
|
||||
package replication
|
||||
package logic
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/replication/fsrep"
|
||||
)
|
||||
|
||||
type contextKey int
|
||||
@ -16,7 +16,6 @@ type Logger = logger.Logger
|
||||
|
||||
func WithLogger(ctx context.Context, l Logger) context.Context {
|
||||
ctx = context.WithValue(ctx, contextKeyLog, l)
|
||||
ctx = fsrep.WithLogger(ctx, l)
|
||||
return ctx
|
||||
}
|
||||
|
@ -1,560 +0,0 @@
|
||||
// Package replication implements replication of filesystems with existing
|
||||
// versions (snapshots) from a sender to a receiver.
|
||||
package replication
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
||||
"github.com/zrepl/zrepl/util/envconst"
|
||||
"github.com/zrepl/zrepl/util/watchdog"
|
||||
"math/bits"
|
||||
"net"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/zrepl/zrepl/replication/fsrep"
|
||||
. "github.com/zrepl/zrepl/replication/internal/diff"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
)
|
||||
|
||||
//go:generate enumer -type=State
|
||||
type State uint
|
||||
|
||||
const (
|
||||
Planning State = 1 << iota
|
||||
PlanningError
|
||||
Working
|
||||
WorkingWait
|
||||
Completed
|
||||
PermanentError
|
||||
)
|
||||
|
||||
func (s State) rsf() state {
|
||||
idx := bits.TrailingZeros(uint(s))
|
||||
if idx == bits.UintSize {
|
||||
panic(s) // invalid value
|
||||
}
|
||||
m := []state{
|
||||
statePlanning,
|
||||
statePlanningError,
|
||||
stateWorking,
|
||||
stateWorkingWait,
|
||||
nil,
|
||||
nil,
|
||||
}
|
||||
return m[idx]
|
||||
}
|
||||
|
||||
func (s State) IsTerminal() bool {
|
||||
return s.rsf() == nil
|
||||
}
|
||||
|
||||
// Replication implements the replication of multiple file systems from a Sender to a Receiver.
|
||||
//
|
||||
// It is a state machine that is driven by the Drive method
|
||||
// and provides asynchronous reporting via the Report method (i.e. from another goroutine).
|
||||
type Replication struct {
|
||||
// not protected by lock
|
||||
promSecsPerState *prometheus.HistogramVec // labels: state
|
||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||
|
||||
Progress watchdog.KeepAlive
|
||||
|
||||
// lock protects all fields of this struct (but not the fields behind pointers!)
|
||||
lock sync.Mutex
|
||||
|
||||
state State
|
||||
|
||||
// Working, WorkingWait, Completed, ContextDone
|
||||
queue []*fsrep.Replication
|
||||
completed []*fsrep.Replication
|
||||
active *fsrep.Replication // == queue[0] or nil, unlike in Report
|
||||
|
||||
// for PlanningError, WorkingWait and ContextError and Completed
|
||||
err error
|
||||
|
||||
// PlanningError, WorkingWait
|
||||
sleepUntil time.Time
|
||||
}
|
||||
|
||||
type Report struct {
|
||||
Status string
|
||||
Problem string
|
||||
SleepUntil time.Time
|
||||
Completed []*fsrep.Report
|
||||
Pending []*fsrep.Report
|
||||
Active *fsrep.Report // not contained in Pending, unlike in struct Replication
|
||||
}
|
||||
|
||||
func NewReplication(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec) *Replication {
|
||||
r := Replication{
|
||||
promSecsPerState: secsPerState,
|
||||
promBytesReplicated: bytesReplicated,
|
||||
state: Planning,
|
||||
}
|
||||
return &r
|
||||
}
|
||||
|
||||
// Endpoint represents one side of the replication.
|
||||
//
|
||||
// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
|
||||
// named interfaces defined in this package.
|
||||
type Endpoint interface {
|
||||
// Does not include placeholder filesystems
|
||||
ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
|
||||
ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
|
||||
DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
|
||||
}
|
||||
|
||||
type Sender interface {
|
||||
Endpoint
|
||||
fsrep.Sender
|
||||
}
|
||||
|
||||
type Receiver interface {
|
||||
Endpoint
|
||||
fsrep.Receiver
|
||||
}
|
||||
|
||||
type FilteredError struct{ fs string }
|
||||
|
||||
func NewFilteredError(fs string) *FilteredError {
|
||||
return &FilteredError{fs}
|
||||
}
|
||||
|
||||
func (f FilteredError) Error() string { return "endpoint does not allow access to filesystem " + f.fs }
|
||||
|
||||
type updater func(func(*Replication)) (newState State)
|
||||
type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
|
||||
|
||||
// Drive starts the state machine and returns only after replication has finished (with or without errors).
|
||||
// The Logger in ctx is used for both debug and error logging, but is not guaranteed to be stable
|
||||
// or end-user friendly.
|
||||
// User-facing replication progress reports and can be obtained using the Report method,
|
||||
// whose output will not change after Drive returns.
|
||||
//
|
||||
// FIXME: Drive may be only called once per instance of Replication
|
||||
func (r *Replication) Drive(ctx context.Context, sender Sender, receiver Receiver) {
|
||||
|
||||
var u updater = func(f func(*Replication)) State {
|
||||
r.lock.Lock()
|
||||
defer r.lock.Unlock()
|
||||
if f != nil {
|
||||
f(r)
|
||||
}
|
||||
return r.state
|
||||
}
|
||||
|
||||
var s state = statePlanning
|
||||
var pre, post State
|
||||
for s != nil {
|
||||
preTime := time.Now()
|
||||
pre = u(nil)
|
||||
s = s(ctx, &r.Progress, sender, receiver, u)
|
||||
delta := time.Now().Sub(preTime)
|
||||
r.promSecsPerState.WithLabelValues(pre.String()).Observe(delta.Seconds())
|
||||
post = u(nil)
|
||||
getLogger(ctx).
|
||||
WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
|
||||
WithField("duration", delta).
|
||||
Debug("main state transition")
|
||||
if post == Working && pre != post {
|
||||
getLogger(ctx).Info("start working")
|
||||
}
|
||||
}
|
||||
|
||||
getLogger(ctx).
|
||||
WithField("final_state", post).
|
||||
Debug("main final state")
|
||||
}
|
||||
|
||||
func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
|
||||
if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
|
||||
if len(noCommonAncestor.SortedReceiverVersions) == 0 {
|
||||
// TODO this is hard-coded replication policy: most recent snapshot as source
|
||||
var mostRecentSnap *pdu.FilesystemVersion
|
||||
for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
|
||||
if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
|
||||
mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
|
||||
break
|
||||
}
|
||||
}
|
||||
if mostRecentSnap == nil {
|
||||
return nil, "no snapshots available on sender side"
|
||||
}
|
||||
return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
|
||||
}
|
||||
}
|
||||
return nil, "no automated way to handle conflict type"
|
||||
}
|
||||
|
||||
var RetryInterval = envconst.Duration("ZREPL_REPLICATION_RETRY_INTERVAL", 10 * time.Second)
|
||||
|
||||
type Error interface {
|
||||
error
|
||||
Temporary() bool
|
||||
}
|
||||
|
||||
var _ Error = fsrep.Error(nil)
|
||||
var _ Error = net.Error(nil)
|
||||
|
||||
func isPermanent(err error) bool {
|
||||
if e, ok := err.(Error); ok {
|
||||
return !e.Temporary()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func statePlanning(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
|
||||
|
||||
log := getLogger(ctx)
|
||||
|
||||
log.Info("start planning")
|
||||
|
||||
handlePlanningError := func(err error) state {
|
||||
return u(func(r *Replication) {
|
||||
ge := GlobalError{Err: err, Temporary: !isPermanent(err)}
|
||||
log.WithError(ge).Error("encountered global error while planning replication")
|
||||
r.err = ge
|
||||
if !ge.Temporary {
|
||||
r.state = PermanentError
|
||||
} else {
|
||||
r.sleepUntil = time.Now().Add(RetryInterval)
|
||||
r.state = PlanningError
|
||||
}
|
||||
}).rsf()
|
||||
}
|
||||
|
||||
slfssres, err := sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
|
||||
return handlePlanningError(err)
|
||||
}
|
||||
sfss := slfssres.GetFilesystems()
|
||||
// no progress here since we could run in a live-lock on connectivity issues
|
||||
|
||||
rlfssres, err := receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
|
||||
if err != nil {
|
||||
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
|
||||
return handlePlanningError(err)
|
||||
}
|
||||
rfss := rlfssres.GetFilesystems()
|
||||
ka.MadeProgress() // for both sender and receiver
|
||||
|
||||
q := make([]*fsrep.Replication, 0, len(sfss))
|
||||
mainlog := log
|
||||
for _, fs := range sfss {
|
||||
|
||||
log := mainlog.WithField("filesystem", fs.Path)
|
||||
|
||||
log.Debug("assessing filesystem")
|
||||
|
||||
sfsvsres, err := sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
|
||||
if err != nil {
|
||||
log.WithError(err).Error("cannot get remote filesystem versions")
|
||||
return handlePlanningError(err)
|
||||
}
|
||||
sfsvs := sfsvsres.GetVersions()
|
||||
ka.MadeProgress()
|
||||
|
||||
if len(sfsvs) < 1 {
|
||||
err := errors.New("sender does not have any versions")
|
||||
log.Error(err.Error())
|
||||
q = append(q, fsrep.NewReplicationConflictError(fs.Path, err))
|
||||
continue
|
||||
}
|
||||
|
||||
receiverFSExists := false
|
||||
for _, rfs := range rfss {
|
||||
if rfs.Path == fs.Path {
|
||||
receiverFSExists = true
|
||||
}
|
||||
}
|
||||
|
||||
var rfsvs []*pdu.FilesystemVersion
|
||||
if receiverFSExists {
|
||||
rfsvsres, err := receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
|
||||
if err != nil {
|
||||
if _, ok := err.(*FilteredError); ok {
|
||||
log.Info("receiver ignores filesystem")
|
||||
continue
|
||||
}
|
||||
log.WithError(err).Error("receiver error")
|
||||
return handlePlanningError(err)
|
||||
}
|
||||
rfsvs = rfsvsres.GetVersions()
|
||||
} else {
|
||||
rfsvs = []*pdu.FilesystemVersion{}
|
||||
}
|
||||
ka.MadeProgress()
|
||||
|
||||
path, conflict := IncrementalPath(rfsvs, sfsvs)
|
||||
if conflict != nil {
|
||||
var msg string
|
||||
path, msg = resolveConflict(conflict) // no shadowing allowed!
|
||||
if path != nil {
|
||||
log.WithField("conflict", conflict).Info("conflict")
|
||||
log.WithField("resolution", msg).Info("automatically resolved")
|
||||
} else {
|
||||
log.WithField("conflict", conflict).Error("conflict")
|
||||
log.WithField("problem", msg).Error("cannot resolve conflict")
|
||||
}
|
||||
}
|
||||
ka.MadeProgress()
|
||||
if path == nil {
|
||||
q = append(q, fsrep.NewReplicationConflictError(fs.Path, conflict))
|
||||
continue
|
||||
}
|
||||
|
||||
var promBytesReplicated *prometheus.CounterVec
|
||||
u(func(replication *Replication) { // FIXME args struct like in pruner (also use for sender and receiver)
|
||||
promBytesReplicated = replication.promBytesReplicated
|
||||
})
|
||||
fsrfsm := fsrep.BuildReplication(fs.Path, promBytesReplicated.WithLabelValues(fs.Path))
|
||||
if len(path) == 1 {
|
||||
fsrfsm.AddStep(nil, path[0])
|
||||
} else {
|
||||
for i := 0; i < len(path)-1; i++ {
|
||||
fsrfsm.AddStep(path[i], path[i+1])
|
||||
}
|
||||
}
|
||||
qitem := fsrfsm.Done()
|
||||
ka.MadeProgress()
|
||||
|
||||
log.Debug("compute send size estimate")
|
||||
if err = qitem.UpdateSizeEsitmate(ctx, sender); err != nil {
|
||||
log.WithError(err).Error("error computing size estimate")
|
||||
return handlePlanningError(err)
|
||||
}
|
||||
ka.MadeProgress()
|
||||
|
||||
q = append(q, qitem)
|
||||
}
|
||||
|
||||
ka.MadeProgress()
|
||||
|
||||
return u(func(r *Replication) {
|
||||
r.completed = nil
|
||||
r.queue = q
|
||||
r.err = nil
|
||||
r.state = Working
|
||||
}).rsf()
|
||||
}
|
||||
|
||||
func statePlanningError(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
|
||||
var sleepUntil time.Time
|
||||
u(func(r *Replication) {
|
||||
sleepUntil = r.sleepUntil
|
||||
})
|
||||
t := time.NewTimer(sleepUntil.Sub(time.Now()))
|
||||
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
|
||||
defer t.Stop()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return u(func(r *Replication) {
|
||||
r.state = PermanentError
|
||||
r.err = ctx.Err()
|
||||
}).rsf()
|
||||
case <-t.C:
|
||||
case <-wakeup.Wait(ctx):
|
||||
}
|
||||
return u(func(r *Replication) {
|
||||
r.state = Planning
|
||||
}).rsf()
|
||||
}
|
||||
|
||||
type GlobalError struct {
|
||||
Err error
|
||||
Temporary bool
|
||||
}
|
||||
|
||||
func (e GlobalError) Error() string {
|
||||
errClass := "temporary"
|
||||
if !e.Temporary {
|
||||
errClass = "permanent"
|
||||
}
|
||||
return fmt.Sprintf("%s global error: %s", errClass, e.Err)
|
||||
}
|
||||
|
||||
type FilesystemsReplicationFailedError struct {
|
||||
FilesystemsWithError []*fsrep.Replication
|
||||
}
|
||||
|
||||
func (e FilesystemsReplicationFailedError) Error() string {
|
||||
allSame := true
|
||||
lastErr := e.FilesystemsWithError[0].Err().Error()
|
||||
for _, fs := range e.FilesystemsWithError {
|
||||
fsErr := fs.Err().Error()
|
||||
allSame = allSame && lastErr == fsErr
|
||||
}
|
||||
|
||||
fsstr := "multiple filesystems"
|
||||
if len(e.FilesystemsWithError) == 1 {
|
||||
fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
|
||||
}
|
||||
errorStr := lastErr
|
||||
if !allSame {
|
||||
errorStr = "multiple different errors"
|
||||
}
|
||||
return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
|
||||
}
|
||||
|
||||
func stateWorking(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
|
||||
|
||||
var active *fsrep.Replication
|
||||
rsfNext := u(func(r *Replication) {
|
||||
|
||||
r.err = nil
|
||||
|
||||
newq := make([]*fsrep.Replication, 0, len(r.queue))
|
||||
for i := range r.queue {
|
||||
if r.queue[i].CanRetry() {
|
||||
newq = append(newq, r.queue[i])
|
||||
} else {
|
||||
r.completed = append(r.completed, r.queue[i])
|
||||
}
|
||||
}
|
||||
sort.SliceStable(newq, func(i, j int) bool {
|
||||
return newq[i].NextStepDate().Before(newq[j].NextStepDate())
|
||||
})
|
||||
r.queue = newq
|
||||
|
||||
if len(r.queue) == 0 {
|
||||
r.state = Completed
|
||||
fsWithErr := FilesystemsReplicationFailedError{ // prepare it
|
||||
FilesystemsWithError: make([]*fsrep.Replication, 0, len(r.completed)),
|
||||
}
|
||||
for _, fs := range r.completed {
|
||||
if fs.CanRetry() {
|
||||
panic(fmt.Sprintf("implementation error: completed contains retryable FS %s %#v",
|
||||
fs.FS(), fs.Err()))
|
||||
}
|
||||
if fs.Err() != nil {
|
||||
fsWithErr.FilesystemsWithError = append(fsWithErr.FilesystemsWithError, fs)
|
||||
}
|
||||
}
|
||||
if len(fsWithErr.FilesystemsWithError) > 0 {
|
||||
r.err = fsWithErr
|
||||
r.state = PermanentError
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
active = r.queue[0] // do not dequeue: if it's done, it will be sorted the next time we check for more work
|
||||
r.active = active
|
||||
}).rsf()
|
||||
|
||||
if active == nil {
|
||||
return rsfNext
|
||||
}
|
||||
|
||||
activeCtx := fsrep.WithLogger(ctx, getLogger(ctx).WithField("fs", active.FS()))
|
||||
err := active.Retry(activeCtx, ka, sender, receiver)
|
||||
u(func(r *Replication) {
|
||||
r.active = nil
|
||||
}).rsf()
|
||||
|
||||
if err != nil {
|
||||
if err.ContextErr() && ctx.Err() != nil {
|
||||
getLogger(ctx).WithError(err).
|
||||
Info("filesystem replication was cancelled")
|
||||
u(func(r*Replication) {
|
||||
r.err = GlobalError{Err: err, Temporary: false}
|
||||
r.state = PermanentError
|
||||
})
|
||||
} else if err.LocalToFS() {
|
||||
getLogger(ctx).WithError(err).
|
||||
Error("filesystem replication encountered a filesystem-specific error")
|
||||
// we stay in this state and let the queuing logic above de-prioritize this failing FS
|
||||
} else if err.Temporary() {
|
||||
getLogger(ctx).WithError(err).
|
||||
Error("filesystem encountered a non-filesystem-specific temporary error, enter retry-wait")
|
||||
u(func(r *Replication) {
|
||||
r.err = GlobalError{Err: err, Temporary: true}
|
||||
r.sleepUntil = time.Now().Add(RetryInterval)
|
||||
r.state = WorkingWait
|
||||
}).rsf()
|
||||
} else {
|
||||
getLogger(ctx).WithError(err).
|
||||
Error("encountered a permanent non-filesystem-specific error")
|
||||
u(func(r *Replication) {
|
||||
r.err = GlobalError{Err: err, Temporary: false}
|
||||
r.state = PermanentError
|
||||
}).rsf()
|
||||
}
|
||||
}
|
||||
|
||||
return u(nil).rsf()
|
||||
}
|
||||
|
||||
func stateWorkingWait(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
|
||||
var sleepUntil time.Time
|
||||
u(func(r *Replication) {
|
||||
sleepUntil = r.sleepUntil
|
||||
})
|
||||
t := time.NewTimer(RetryInterval)
|
||||
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after error")
|
||||
defer t.Stop()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return u(func(r *Replication) {
|
||||
r.state = PermanentError
|
||||
r.err = ctx.Err()
|
||||
}).rsf()
|
||||
|
||||
case <-t.C:
|
||||
case <-wakeup.Wait(ctx):
|
||||
}
|
||||
return u(func(r *Replication) {
|
||||
r.state = Working
|
||||
}).rsf()
|
||||
}
|
||||
|
||||
// Report provides a summary of the progress of the Replication,
|
||||
// i.e., a condensed dump of the internal state machine.
|
||||
// Report is safe to be called asynchronously while Drive is running.
|
||||
func (r *Replication) Report() *Report {
|
||||
r.lock.Lock()
|
||||
defer r.lock.Unlock()
|
||||
|
||||
rep := Report{
|
||||
Status: r.state.String(),
|
||||
SleepUntil: r.sleepUntil,
|
||||
}
|
||||
|
||||
if r.err != nil {
|
||||
rep.Problem = r.err.Error()
|
||||
}
|
||||
|
||||
if r.state&(Planning|PlanningError) != 0 {
|
||||
return &rep
|
||||
}
|
||||
|
||||
rep.Pending = make([]*fsrep.Report, 0, len(r.queue))
|
||||
rep.Completed = make([]*fsrep.Report, 0, len(r.completed)) // room for active (potentially)
|
||||
|
||||
// since r.active == r.queue[0], do not contain it in pending output
|
||||
pending := r.queue
|
||||
if r.active != nil {
|
||||
rep.Active = r.active.Report()
|
||||
pending = r.queue[1:]
|
||||
}
|
||||
for _, fsr := range pending {
|
||||
rep.Pending= append(rep.Pending, fsr.Report())
|
||||
}
|
||||
for _, fsr := range r.completed {
|
||||
rep.Completed = append(rep.Completed, fsr.Report())
|
||||
}
|
||||
|
||||
return &rep
|
||||
}
|
||||
|
||||
func (r *Replication) State() State {
|
||||
r.lock.Lock()
|
||||
defer r.lock.Unlock()
|
||||
return r.state
|
||||
}
|
13
replication/replication.go
Normal file
13
replication/replication.go
Normal file
@ -0,0 +1,13 @@
|
||||
// Package replication implements replication of filesystems with existing
|
||||
// versions (snapshots) from a sender to a receiver.
|
||||
package replication
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/zrepl/zrepl/replication/driver"
|
||||
)
|
||||
|
||||
func Do(ctx context.Context, planner driver.Planner) (driver.ReportFunc, driver.WaitFunc) {
|
||||
return driver.Do(ctx, planner)
|
||||
}
|
152
replication/report/replication_report.go
Normal file
152
replication/report/replication_report.go
Normal file
@ -0,0 +1,152 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Report struct {
|
||||
StartAt, FinishAt time.Time
|
||||
WaitReconnectSince, WaitReconnectUntil time.Time
|
||||
WaitReconnectError *TimedError
|
||||
Attempts []*AttemptReport
|
||||
}
|
||||
|
||||
var _, _ = json.Marshal(&Report{})
|
||||
|
||||
type TimedError struct {
|
||||
Err string
|
||||
Time time.Time
|
||||
}
|
||||
|
||||
func NewTimedError(err string, t time.Time) *TimedError {
|
||||
if err == "" {
|
||||
panic("error must be empty")
|
||||
}
|
||||
if t.IsZero() {
|
||||
panic("t must be non-zero")
|
||||
}
|
||||
return &TimedError{err, t}
|
||||
}
|
||||
|
||||
func (s *TimedError) Error() string {
|
||||
return s.Err
|
||||
}
|
||||
|
||||
var _, _ = json.Marshal(&TimedError{})
|
||||
|
||||
type AttemptReport struct {
|
||||
State AttemptState
|
||||
StartAt, FinishAt time.Time
|
||||
PlanError *TimedError
|
||||
Filesystems []*FilesystemReport
|
||||
}
|
||||
|
||||
type AttemptState string
|
||||
|
||||
const (
|
||||
AttemptPlanning AttemptState = "planning"
|
||||
AttemptPlanningError AttemptState = "planning-error"
|
||||
AttemptFanOutFSs AttemptState = "fan-out-filesystems"
|
||||
AttemptFanOutError AttemptState = "filesystem-error"
|
||||
AttemptDone AttemptState = "done"
|
||||
)
|
||||
|
||||
type FilesystemState string
|
||||
|
||||
const (
|
||||
FilesystemPlanning FilesystemState = "planning"
|
||||
FilesystemPlanningErrored FilesystemState = "planning-error"
|
||||
FilesystemStepping FilesystemState = "stepping"
|
||||
FilesystemSteppingErrored FilesystemState = "step-error"
|
||||
FilesystemDone FilesystemState = "done"
|
||||
)
|
||||
|
||||
type FilesystemReport struct {
|
||||
Info *FilesystemInfo
|
||||
|
||||
State FilesystemState
|
||||
|
||||
// Valid in State = FilesystemPlanningErrored
|
||||
PlanError *TimedError
|
||||
// Valid in State = FilesystemSteppingErrored
|
||||
StepError *TimedError
|
||||
|
||||
// Valid in State = FilesystemStepping
|
||||
CurrentStep int
|
||||
Steps []*StepReport
|
||||
}
|
||||
|
||||
type FilesystemInfo struct {
|
||||
Name string
|
||||
}
|
||||
|
||||
type StepReport struct {
|
||||
Info *StepInfo
|
||||
}
|
||||
|
||||
type StepInfo struct {
|
||||
From, To string
|
||||
BytesExpected int64
|
||||
BytesReplicated int64
|
||||
}
|
||||
|
||||
func (a *AttemptReport) BytesSum() (expected, replicated int64) {
|
||||
for _, fs := range a.Filesystems {
|
||||
e, r := fs.BytesSum()
|
||||
expected += e
|
||||
replicated += r
|
||||
}
|
||||
return expected, replicated
|
||||
}
|
||||
|
||||
func (f *FilesystemReport) BytesSum() (expected, replicated int64) {
|
||||
for _, step := range f.Steps {
|
||||
expected += step.Info.BytesExpected
|
||||
replicated += step.Info.BytesReplicated
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (f *AttemptReport) FilesystemsByState() map[FilesystemState][]*FilesystemReport {
|
||||
r := make(map[FilesystemState][]*FilesystemReport, 4)
|
||||
for _, fs := range f.Filesystems {
|
||||
l := r[fs.State]
|
||||
l = append(l, fs)
|
||||
r[fs.State] = l
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func (f *FilesystemReport) Error() *TimedError {
|
||||
switch f.State {
|
||||
case FilesystemPlanningErrored:
|
||||
return f.PlanError
|
||||
case FilesystemSteppingErrored:
|
||||
return f.StepError
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// may return nil
|
||||
func (f *FilesystemReport) NextStep() *StepReport {
|
||||
switch f.State {
|
||||
case FilesystemDone:
|
||||
return nil
|
||||
case FilesystemPlanningErrored:
|
||||
return nil
|
||||
case FilesystemSteppingErrored:
|
||||
return nil
|
||||
case FilesystemPlanning:
|
||||
return nil
|
||||
case FilesystemStepping:
|
||||
// invariant is that this is always correct
|
||||
// TODO what about 0-length Steps but short intermediary state?
|
||||
return f.Steps[f.CurrentStep]
|
||||
}
|
||||
panic("unreachable")
|
||||
}
|
||||
|
||||
func (f *StepReport) IsIncremental() bool {
|
||||
return f.Info.From != "" // FIXME change to ZFS semantics (To != "")
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
// Code generated by "enumer -type=State"; DO NOT EDIT.
|
||||
|
||||
package replication
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const (
|
||||
_StateName_0 = "PlanningPlanningError"
|
||||
_StateName_1 = "Working"
|
||||
_StateName_2 = "WorkingWait"
|
||||
_StateName_3 = "Completed"
|
||||
_StateName_4 = "PermanentError"
|
||||
)
|
||||
|
||||
var (
|
||||
_StateIndex_0 = [...]uint8{0, 8, 21}
|
||||
_StateIndex_1 = [...]uint8{0, 7}
|
||||
_StateIndex_2 = [...]uint8{0, 11}
|
||||
_StateIndex_3 = [...]uint8{0, 9}
|
||||
_StateIndex_4 = [...]uint8{0, 14}
|
||||
)
|
||||
|
||||
func (i State) String() string {
|
||||
switch {
|
||||
case 1 <= i && i <= 2:
|
||||
i -= 1
|
||||
return _StateName_0[_StateIndex_0[i]:_StateIndex_0[i+1]]
|
||||
case i == 4:
|
||||
return _StateName_1
|
||||
case i == 8:
|
||||
return _StateName_2
|
||||
case i == 16:
|
||||
return _StateName_3
|
||||
case i == 32:
|
||||
return _StateName_4
|
||||
default:
|
||||
return fmt.Sprintf("State(%d)", i)
|
||||
}
|
||||
}
|
||||
|
||||
var _StateValues = []State{1, 2, 4, 8, 16, 32}
|
||||
|
||||
var _StateNameToValueMap = map[string]State{
|
||||
_StateName_0[0:8]: 1,
|
||||
_StateName_0[8:21]: 2,
|
||||
_StateName_1[0:7]: 4,
|
||||
_StateName_2[0:11]: 8,
|
||||
_StateName_3[0:9]: 16,
|
||||
_StateName_4[0:14]: 32,
|
||||
}
|
||||
|
||||
// StateString retrieves an enum value from the enum constants string name.
|
||||
// Throws an error if the param is not part of the enum.
|
||||
func StateString(s string) (State, error) {
|
||||
if val, ok := _StateNameToValueMap[s]; ok {
|
||||
return val, nil
|
||||
}
|
||||
return 0, fmt.Errorf("%s does not belong to State values", s)
|
||||
}
|
||||
|
||||
// StateValues returns all values of the enum
|
||||
func StateValues() []State {
|
||||
return _StateValues
|
||||
}
|
||||
|
||||
// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
|
||||
func (i State) IsAState() bool {
|
||||
for _, v := range _StateValues {
|
||||
if i == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
@ -7,7 +7,7 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/golang/protobuf/proto"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn/stream"
|
||||
"github.com/zrepl/zrepl/transport"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
@ -213,3 +213,23 @@ func (c *Client) ReqRecv(ctx context.Context, req *pdu.ReceiveReq, streamCopier
|
||||
|
||||
return res.res, cause
|
||||
}
|
||||
|
||||
|
||||
func (c *Client) ReqPing(ctx context.Context, req *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
conn, err := c.getWire(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer c.putWire(conn)
|
||||
|
||||
if err := c.send(ctx, conn, EndpointPing, req, nil); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var res pdu.PingRes
|
||||
if err := c.recv(ctx, conn, &res); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &res, nil
|
||||
}
|
@ -7,7 +7,7 @@ import (
|
||||
|
||||
"github.com/golang/protobuf/proto"
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn/stream"
|
||||
"github.com/zrepl/zrepl/transport"
|
||||
"github.com/zrepl/zrepl/zfs"
|
||||
@ -25,6 +25,8 @@ type Handler interface {
|
||||
// It is guaranteed that Server calls Receive with a stream that holds the IdleConnTimeout
|
||||
// configured in ServerConfig.Shared.IdleConnTimeout.
|
||||
Receive(ctx context.Context, r *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
|
||||
// PingDataconn handles a PingReq
|
||||
PingDataconn(ctx context.Context, r *pdu.PingReq) (*pdu.PingRes, error)
|
||||
}
|
||||
|
||||
type Logger = logger.Logger
|
||||
@ -125,6 +127,13 @@ func (s *Server) serveConn(nc *transport.AuthConn) {
|
||||
return
|
||||
}
|
||||
res, handlerErr = s.h.Receive(ctx, &req, &streamCopier{streamConn: c, closeStreamOnClose: false}) // SHADOWING
|
||||
case EndpointPing:
|
||||
var req pdu.PingReq
|
||||
if err := proto.Unmarshal(reqStructured, &req); err != nil {
|
||||
s.log.WithError(err).Error("cannot unmarshal ping request")
|
||||
return
|
||||
}
|
||||
res, handlerErr = s.h.PingDataconn(ctx, &req) // SHADOWING
|
||||
default:
|
||||
s.log.WithField("endpoint", endpoint).Error("unknown endpoint")
|
||||
handlerErr = fmt.Errorf("requested endpoint does not exist")
|
||||
@ -137,12 +146,17 @@ func (s *Server) serveConn(nc *transport.AuthConn) {
|
||||
// if marshaling fails. We consider failed marshaling a handler error
|
||||
var protobuf *bytes.Buffer
|
||||
if handlerErr == nil {
|
||||
protobufBytes, err := proto.Marshal(res)
|
||||
if err != nil {
|
||||
s.log.WithError(err).Error("cannot marshal handler protobuf")
|
||||
handlerErr = err
|
||||
if res == nil {
|
||||
handlerErr = fmt.Errorf("implementation error: handler for endpoint %q returns nil error and nil result", endpoint)
|
||||
s.log.WithError(err).Error("handle implementation error")
|
||||
} else {
|
||||
protobufBytes, err := proto.Marshal(res)
|
||||
if err != nil {
|
||||
s.log.WithError(err).Error("cannot marshal handler protobuf")
|
||||
handlerErr = err
|
||||
}
|
||||
protobuf = bytes.NewBuffer(protobufBytes) // SHADOWING
|
||||
}
|
||||
protobuf = bytes.NewBuffer(protobufBytes) // SHADOWING
|
||||
}
|
||||
|
||||
var resHeaderBuf bytes.Buffer
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
EndpointPing string = "/v1/ping"
|
||||
EndpointSend string = "/v1/send"
|
||||
EndpointRecv string = "/v1/recv"
|
||||
)
|
||||
|
@ -24,7 +24,7 @@ import (
|
||||
"github.com/pkg/profile"
|
||||
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn/timeoutconn"
|
||||
"github.com/zrepl/zrepl/transport"
|
||||
@ -77,6 +77,12 @@ func (devNullHandler) Receive(ctx context.Context, r *pdu.ReceiveReq, stream zfs
|
||||
return &res, err
|
||||
}
|
||||
|
||||
func (devNullHandler) PingDataconn(ctx context.Context, r *pdu.PingReq) (*pdu.PingRes, error) {
|
||||
return &pdu.PingRes{
|
||||
Echo: r.GetMessage(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type tcpConnecter struct {
|
||||
addr string
|
||||
}
|
||||
|
@ -105,11 +105,12 @@ func NewInterceptors(logger Logger, clientIdentityKey interface{}) (unary grpc.U
|
||||
if !ok {
|
||||
panic("peer.FromContext expected to return a peer in grpc.UnaryServerInterceptor")
|
||||
}
|
||||
logger.WithField("peer", fmt.Sprintf("%v", p)).Debug("peer")
|
||||
logger.WithField("peer_addr", fmt.Sprintf("%s", p.Addr)).Debug("peer addr")
|
||||
a, ok := p.AuthInfo.(*authConnAuthType)
|
||||
if !ok {
|
||||
panic(fmt.Sprintf("NewInterceptors must be used in combination with grpc.NewTransportCredentials, but got auth type %T", p.AuthInfo))
|
||||
}
|
||||
logger.WithField("peer_client_identity", a.clientIdentity).Debug("peer client identity")
|
||||
ctx = context.WithValue(ctx, clientIdentityKey, a.clientIdentity)
|
||||
return handler(ctx, req)
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ func ClientConn(cn transport.Connecter, log Logger) *grpc.ClientConn {
|
||||
})
|
||||
dialerOption := grpc.WithDialer(grpcclientidentity.NewDialer(log, cn))
|
||||
cred := grpc.WithTransportCredentials(grpcclientidentity.NewTransportCredentials(log))
|
||||
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
|
||||
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second) // FIXME constant
|
||||
defer cancel()
|
||||
cc, err := grpc.DialContext(ctx, "doesn't matter done by dialer", dialerOption, cred, ka)
|
||||
if err != nil {
|
||||
|
@ -2,13 +2,18 @@ package rpc
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
|
||||
"github.com/zrepl/zrepl/replication"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/google/uuid"
|
||||
"github.com/zrepl/zrepl/replication/logic"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn"
|
||||
"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
|
||||
"github.com/zrepl/zrepl/rpc/versionhandshake"
|
||||
@ -24,11 +29,12 @@ type Client struct {
|
||||
controlClient pdu.ReplicationClient // this the grpc client instance, see constructor
|
||||
controlConn *grpc.ClientConn
|
||||
loggers Loggers
|
||||
closed chan struct{}
|
||||
}
|
||||
|
||||
var _ replication.Endpoint = &Client{}
|
||||
var _ replication.Sender = &Client{}
|
||||
var _ replication.Receiver = &Client{}
|
||||
var _ logic.Endpoint = &Client{}
|
||||
var _ logic.Sender = &Client{}
|
||||
var _ logic.Receiver = &Client{}
|
||||
|
||||
type DialContextFunc = func(ctx context.Context, network string, addr string) (net.Conn, error)
|
||||
|
||||
@ -41,14 +47,21 @@ func NewClient(cn transport.Connecter, loggers Loggers) *Client {
|
||||
|
||||
c := &Client{
|
||||
loggers: loggers,
|
||||
closed: make(chan struct{}),
|
||||
}
|
||||
grpcConn := grpchelper.ClientConn(muxedConnecter.control, loggers.Control)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go func() {
|
||||
<-c.closed
|
||||
cancel()
|
||||
}()
|
||||
defer cancel()
|
||||
for ctx.Err() == nil {
|
||||
state := grpcConn.GetState()
|
||||
loggers.General.WithField("grpc_state", state.String()).Debug("grpc state change")
|
||||
grpcConn.WaitForStateChange(context.TODO(), state)
|
||||
grpcConn.WaitForStateChange(ctx, state)
|
||||
}
|
||||
}()
|
||||
c.controlClient = pdu.NewReplicationClient(grpcConn)
|
||||
@ -59,8 +72,9 @@ func NewClient(cn transport.Connecter, loggers Loggers) *Client {
|
||||
}
|
||||
|
||||
func (c *Client) Close() {
|
||||
close(c.closed)
|
||||
if err := c.controlConn.Close(); err != nil {
|
||||
c.loggers.General.WithError(err).Error("cannot cloe control connection")
|
||||
c.loggers.General.WithError(err).Error("cannot close control connection")
|
||||
}
|
||||
// TODO c.dataClient should have Close()
|
||||
}
|
||||
@ -101,6 +115,72 @@ func (c *Client) ReplicationCursor(ctx context.Context, in *pdu.ReplicationCurso
|
||||
return c.controlClient.ReplicationCursor(ctx, in)
|
||||
}
|
||||
|
||||
func (c *Client) WaitForConnectivity(ctx context.Context) error {
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
msg := uuid.New().String()
|
||||
req := pdu.PingReq{Message: msg}
|
||||
var ctrlOk, dataOk int32
|
||||
loggers := GetLoggersOrPanic(ctx)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
checkRes := func(res *pdu.PingRes, err error, logger Logger, okVar *int32) {
|
||||
if err == nil && res.GetEcho() != req.GetMessage() {
|
||||
err = errors.New("pilot message not echoed correctly")
|
||||
}
|
||||
if err == context.Canceled {
|
||||
err = nil
|
||||
}
|
||||
if err != nil {
|
||||
logger.WithError(err).Error("ping failed")
|
||||
atomic.StoreInt32(okVar, 0)
|
||||
cancel()
|
||||
} else {
|
||||
atomic.StoreInt32(okVar, 1)
|
||||
}
|
||||
}
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
ctrl, ctrlErr := c.controlClient.Ping(ctx, &req, grpc.FailFast(false))
|
||||
checkRes(ctrl, ctrlErr, loggers.Control, &ctrlOk)
|
||||
}()
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for ctx.Err() == nil {
|
||||
data, dataErr := c.dataClient.ReqPing(ctx, &req)
|
||||
// dataClient uses transport.Connecter, which doesn't expose FailFast(false)
|
||||
// => we need to mask dial timeouts
|
||||
if err, ok := dataErr.(interface{ Temporary() bool }); ok && err.Temporary() {
|
||||
// Rate-limit pings here in case Temporary() is a mis-classification
|
||||
// or returns immediately (this is a tight loop in that case)
|
||||
// TODO keep this in lockstep with controlClient
|
||||
// => don't use FailFast for control, but check that both control and data worked
|
||||
time.Sleep(envconst.Duration("ZREPL_RPC_DATACONN_PING_SLEEP", 1*time.Second))
|
||||
continue
|
||||
}
|
||||
// it's not a dial timeout,
|
||||
checkRes(data, dataErr, loggers.Data, &dataOk)
|
||||
return
|
||||
}
|
||||
}()
|
||||
wg.Wait()
|
||||
var what string
|
||||
if ctrlOk == 1 && dataOk == 1 {
|
||||
return nil
|
||||
}
|
||||
if ctrlOk == 0 {
|
||||
what += "control"
|
||||
}
|
||||
if dataOk == 0 {
|
||||
if len(what) > 0 {
|
||||
what += " and data"
|
||||
} else {
|
||||
what += "data"
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("%s rpc failed to respond to ping rpcs", what)
|
||||
}
|
||||
|
||||
func (c *Client) ResetConnectBackoff() {
|
||||
c.controlConn.ResetConnectBackoff()
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ import (
|
||||
"google.golang.org/grpc"
|
||||
|
||||
"github.com/zrepl/zrepl/endpoint"
|
||||
"github.com/zrepl/zrepl/replication/pdu"
|
||||
"github.com/zrepl/zrepl/replication/logic/pdu"
|
||||
"github.com/zrepl/zrepl/rpc/dataconn"
|
||||
"github.com/zrepl/zrepl/rpc/grpcclientidentity"
|
||||
"github.com/zrepl/zrepl/rpc/netadaptor"
|
||||
|
@ -7,10 +7,10 @@ package transportmux
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"time"
|
||||
"fmt"
|
||||
|
||||
"github.com/zrepl/zrepl/logger"
|
||||
"github.com/zrepl/zrepl/transport"
|
||||
@ -111,7 +111,7 @@ func Demux(ctx context.Context, rawListener transport.AuthenticatedListener, lab
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
getLog(ctx).WithError(err).Error("accept error")
|
||||
getLog(ctx).WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("accept error")
|
||||
continue
|
||||
}
|
||||
closeConn := func() {
|
||||
|
@ -26,14 +26,22 @@ type HandshakeError struct {
|
||||
msg string
|
||||
// If not nil, the underlying IO error that caused the handshake to fail.
|
||||
IOError error
|
||||
isAcceptError bool
|
||||
}
|
||||
|
||||
var _ net.Error = &HandshakeError{}
|
||||
|
||||
func (e HandshakeError) Error() string { return e.msg }
|
||||
|
||||
// Always true to enable usage in a net.Listener.
|
||||
func (e HandshakeError) Temporary() bool { return true }
|
||||
// Like with net.OpErr (Go issue 6163), a client failing to handshake
|
||||
// should be a temporary Accept error toward the Listener .
|
||||
func (e HandshakeError) Temporary() bool {
|
||||
if e.isAcceptError {
|
||||
return true
|
||||
}
|
||||
te, ok := e.IOError.(interface{ Temporary() bool });
|
||||
return ok && te.Temporary()
|
||||
}
|
||||
|
||||
// If the underlying IOError was net.Error.Timeout(), Timeout() returns that value.
|
||||
// Otherwise false.
|
||||
@ -142,14 +150,14 @@ func (m *HandshakeMessage) DecodeReader(r io.Reader, maxLen int) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func DoHandshakeCurrentVersion(conn net.Conn, deadline time.Time) error {
|
||||
func DoHandshakeCurrentVersion(conn net.Conn, deadline time.Time) *HandshakeError {
|
||||
// current protocol version is hardcoded here
|
||||
return DoHandshakeVersion(conn, deadline, 1)
|
||||
}
|
||||
|
||||
const HandshakeMessageMaxLen = 16 * 4096
|
||||
|
||||
func DoHandshakeVersion(conn net.Conn, deadline time.Time, version int) error {
|
||||
func DoHandshakeVersion(conn net.Conn, deadline time.Time, version int) *HandshakeError {
|
||||
ours := HandshakeMessage{
|
||||
ProtocolVersion: version,
|
||||
Extensions: nil,
|
||||
|
@ -55,6 +55,7 @@ func (l HandshakeListener) Accept(ctx context.Context) (*transport.AuthConn, err
|
||||
dl = time.Now().Add(l.timeout) // shadowing
|
||||
}
|
||||
if err := DoHandshakeCurrentVersion(conn, dl); err != nil {
|
||||
err.isAcceptError = true
|
||||
conn.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
42
util/chainlock/chainlock.go
Normal file
42
util/chainlock/chainlock.go
Normal file
@ -0,0 +1,42 @@
|
||||
// package chainlock implements a mutex whose Lock and Unlock
|
||||
// methods return the lock itself, to enable chaining.
|
||||
//
|
||||
// Intended Usage
|
||||
//
|
||||
// defer s.lock().unlock()
|
||||
// // drop lock while waiting for wait group
|
||||
// func() {
|
||||
// defer a.l.Unlock().Lock()
|
||||
// fssesDone.Wait()
|
||||
// }()
|
||||
//
|
||||
package chainlock
|
||||
|
||||
import "sync"
|
||||
|
||||
type L struct {
|
||||
mtx sync.Mutex
|
||||
}
|
||||
|
||||
func New() *L {
|
||||
return &L{}
|
||||
}
|
||||
|
||||
func (l *L) Lock() *L {
|
||||
l.mtx.Lock()
|
||||
return l
|
||||
}
|
||||
|
||||
func (l *L) Unlock() *L {
|
||||
l.mtx.Unlock()
|
||||
return l
|
||||
}
|
||||
|
||||
func (l *L) NewCond() *sync.Cond {
|
||||
return sync.NewCond(&l.mtx)
|
||||
}
|
||||
|
||||
func (l *L) DropWhile(f func()) {
|
||||
defer l.Unlock().Lock()
|
||||
f()
|
||||
}
|
@ -40,3 +40,19 @@ func Int64(varname string, def int64) int64 {
|
||||
cache.Store(varname, d)
|
||||
return d
|
||||
}
|
||||
|
||||
func Bool(varname string, def bool) bool {
|
||||
if v, ok := cache.Load(varname); ok {
|
||||
return v.(bool)
|
||||
}
|
||||
e := os.Getenv(varname)
|
||||
if e == "" {
|
||||
return def
|
||||
}
|
||||
d, err := strconv.ParseBool(e)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
cache.Store(varname, d)
|
||||
return d
|
||||
}
|
||||
|
@ -1,16 +0,0 @@
|
||||
// Code generated by "stringer -type=Conflict"; DO NOT EDIT.
|
||||
|
||||
package zfs
|
||||
|
||||
import "strconv"
|
||||
|
||||
const _Conflict_name = "ConflictIncrementalConflictAllRightConflictNoCommonAncestorConflictDiverged"
|
||||
|
||||
var _Conflict_index = [...]uint8{0, 19, 35, 59, 75}
|
||||
|
||||
func (i Conflict) String() string {
|
||||
if i < 0 || i >= Conflict(len(_Conflict_index)-1) {
|
||||
return "Conflict(" + strconv.FormatInt(int64(i), 10) + ")"
|
||||
}
|
||||
return _Conflict_name[_Conflict_index[i]:_Conflict_index[i+1]]
|
||||
}
|
284
zfs/diff.go
284
zfs/diff.go
@ -1,284 +0,0 @@
|
||||
package zfs
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha512"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
"sort"
|
||||
)
|
||||
|
||||
type fsbyCreateTXG []FilesystemVersion
|
||||
|
||||
func (l fsbyCreateTXG) Len() int { return len(l) }
|
||||
func (l fsbyCreateTXG) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
|
||||
func (l fsbyCreateTXG) Less(i, j int) bool {
|
||||
return l[i].CreateTXG < l[j].CreateTXG
|
||||
}
|
||||
|
||||
//go:generate stringer -type=Conflict
|
||||
type Conflict int
|
||||
|
||||
const (
|
||||
ConflictIncremental Conflict = iota // no conflict, incremental repl possible
|
||||
ConflictAllRight // no conflict, initial repl possible
|
||||
ConflictNoCommonAncestor
|
||||
ConflictDiverged
|
||||
)
|
||||
|
||||
/* The receiver (left) wants to know if the sender (right) has more recent versions
|
||||
|
||||
Left : | C |
|
||||
Right: | A | B | C | D | E |
|
||||
=> : | C | D | E |
|
||||
|
||||
Left: | C |
|
||||
Right: | D | E |
|
||||
=> : <empty list>, no common ancestor
|
||||
|
||||
Left : | C | D | E |
|
||||
Right: | A | B | C |
|
||||
=> : <empty list>, the left has newer versions
|
||||
|
||||
Left : | A | B | C | | F |
|
||||
Right: | C | D | E |
|
||||
=> : | C | | F | => diverged => <empty list>
|
||||
|
||||
IMPORTANT: since ZFS currently does not export dataset UUIDs, the best heuristic to
|
||||
identify a filesystem version is the tuple (name,creation)
|
||||
*/
|
||||
type FilesystemDiff struct {
|
||||
|
||||
// Which kind of conflict / "way forward" is possible.
|
||||
// Check this first to determine the semantics of this struct's remaining members
|
||||
Conflict Conflict
|
||||
|
||||
// Conflict = Incremental | AllRight
|
||||
// The incremental steps required to get left up to right's most recent version
|
||||
// 0th element is the common ancestor, ordered by birthtime, oldest first
|
||||
// If len() < 2, left and right are at same most recent version
|
||||
// Conflict = otherwise
|
||||
// nil; there is no incremental path for left to get to right's most recent version
|
||||
IncrementalPath []FilesystemVersion
|
||||
|
||||
// Conflict = Incremental | AllRight: nil
|
||||
// Conflict = NoCommonAncestor: left as passed as input
|
||||
// Conflict = Diverged: contains path from left most recent common ancestor (mrca) to most
|
||||
// recent version on left
|
||||
MRCAPathLeft []FilesystemVersion
|
||||
// Conflict = Incremental | AllRight: nil
|
||||
// Conflict = NoCommonAncestor: right as passed as input
|
||||
// Conflict = Diverged: contains path from right most recent common ancestor (mrca)
|
||||
// to most recent version on right
|
||||
MRCAPathRight []FilesystemVersion
|
||||
}
|
||||
|
||||
func (f FilesystemDiff) String() (str string) {
|
||||
var b bytes.Buffer
|
||||
|
||||
fmt.Fprintf(&b, "%s, ", f.Conflict)
|
||||
|
||||
switch f.Conflict {
|
||||
case ConflictIncremental:
|
||||
fmt.Fprintf(&b, "incremental path length %v, common ancestor at %s", len(f.IncrementalPath)-1, f.IncrementalPath[0])
|
||||
case ConflictAllRight:
|
||||
fmt.Fprintf(&b, "%v versions, most recent is %s", len(f.MRCAPathRight)-1, f.MRCAPathRight[len(f.MRCAPathRight)-1])
|
||||
case ConflictDiverged:
|
||||
fmt.Fprintf(&b, "diverged at %s", f.MRCAPathRight[0]) // right always has at least one snap...?
|
||||
case ConflictNoCommonAncestor:
|
||||
fmt.Fprintf(&b, "no diff to show")
|
||||
default:
|
||||
fmt.Fprintf(&b, "unknown conflict type, likely a bug")
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// we must assume left and right are ordered ascendingly by ZFS_PROP_CREATETXG and that
|
||||
// names are unique (bas ZFS_PROP_GUID replacement)
|
||||
func MakeFilesystemDiff(left, right []FilesystemVersion) (diff FilesystemDiff) {
|
||||
|
||||
if right == nil {
|
||||
panic("right must not be nil")
|
||||
}
|
||||
if left == nil {
|
||||
diff = FilesystemDiff{
|
||||
IncrementalPath: nil,
|
||||
Conflict: ConflictAllRight,
|
||||
MRCAPathLeft: left,
|
||||
MRCAPathRight: right,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Assert both left and right are sorted by createtxg
|
||||
{
|
||||
var leftSorted, rightSorted fsbyCreateTXG
|
||||
leftSorted = left
|
||||
rightSorted = right
|
||||
if !sort.IsSorted(leftSorted) {
|
||||
panic("cannot make filesystem diff: unsorted left")
|
||||
}
|
||||
if !sort.IsSorted(rightSorted) {
|
||||
panic("cannot make filesystem diff: unsorted right")
|
||||
}
|
||||
}
|
||||
|
||||
// Find most recent common ancestor by name, preferring snapshots over bookmarks
|
||||
mrcaLeft := len(left) - 1
|
||||
var mrcaRight int
|
||||
outer:
|
||||
for ; mrcaLeft >= 0; mrcaLeft-- {
|
||||
for i := len(right) - 1; i >= 0; i-- {
|
||||
if left[mrcaLeft].Guid == right[i].Guid {
|
||||
mrcaRight = i
|
||||
if i-1 >= 0 && right[i-1].Guid == right[i].Guid && right[i-1].Type == Snapshot {
|
||||
// prefer snapshots over bookmarks
|
||||
mrcaRight = i - 1
|
||||
}
|
||||
break outer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no common ancestor?
|
||||
if mrcaLeft == -1 {
|
||||
diff = FilesystemDiff{
|
||||
IncrementalPath: nil,
|
||||
Conflict: ConflictNoCommonAncestor,
|
||||
MRCAPathLeft: left,
|
||||
MRCAPathRight: right,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// diverged?
|
||||
if mrcaLeft != len(left)-1 {
|
||||
diff = FilesystemDiff{
|
||||
IncrementalPath: nil,
|
||||
Conflict: ConflictDiverged,
|
||||
MRCAPathLeft: left[mrcaLeft:],
|
||||
MRCAPathRight: right[mrcaRight:],
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if mrcaLeft != len(left)-1 {
|
||||
panic("invariant violated: mrca on left must be the last item in the left list")
|
||||
}
|
||||
|
||||
// incPath must not contain bookmarks except initial one,
|
||||
// and only if that initial bookmark's snapshot is gone
|
||||
incPath := make([]FilesystemVersion, 0, len(right))
|
||||
incPath = append(incPath, right[mrcaRight])
|
||||
// right[mrcaRight] may be a bookmark if there's no equally named snapshot
|
||||
for i := mrcaRight + 1; i < len(right); i++ {
|
||||
if right[i].Type != Bookmark {
|
||||
incPath = append(incPath, right[i])
|
||||
}
|
||||
}
|
||||
|
||||
diff = FilesystemDiff{
|
||||
IncrementalPath: incPath,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const ZREPL_PLACEHOLDER_PROPERTY_NAME string = "zrepl:placeholder"
|
||||
|
||||
type FilesystemState struct {
|
||||
Placeholder bool
|
||||
// TODO extend with resume token when that feature is finally added
|
||||
}
|
||||
|
||||
// A somewhat efficient way to determine if a filesystem exists on this host.
|
||||
// Particularly useful if exists is called more than once (will only fork exec once and cache the result)
|
||||
func ZFSListFilesystemState() (localState map[string]FilesystemState, err error) {
|
||||
|
||||
var actual [][]string
|
||||
if actual, err = ZFSList([]string{"name", ZREPL_PLACEHOLDER_PROPERTY_NAME}, "-t", "filesystem,volume"); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
localState = make(map[string]FilesystemState, len(actual))
|
||||
for _, e := range actual {
|
||||
dp, err := NewDatasetPath(e[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ZFS does not return parseable dataset path: %s", e[0])
|
||||
}
|
||||
placeholder, _ := IsPlaceholder(dp, e[1])
|
||||
localState[e[0]] = FilesystemState{
|
||||
placeholder,
|
||||
}
|
||||
}
|
||||
return
|
||||
|
||||
}
|
||||
|
||||
// Computes the value for the ZREPL_PLACEHOLDER_PROPERTY_NAME ZFS user property
|
||||
// to mark the given DatasetPath p as a placeholder
|
||||
//
|
||||
// We cannot simply use booleans here since user properties are always
|
||||
// inherited.
|
||||
//
|
||||
// We hash the DatasetPath and use it to check for a given path if it is the
|
||||
// one originally marked as placeholder.
|
||||
//
|
||||
// However, this prohibits moving datasets around via `zfs rename`. The
|
||||
// placeholder attribute must be re-computed for the dataset path after the
|
||||
// move.
|
||||
//
|
||||
// TODO better solution available?
|
||||
func PlaceholderPropertyValue(p *DatasetPath) string {
|
||||
ps := []byte(p.ToString())
|
||||
sum := sha512.Sum512_256(ps)
|
||||
return hex.EncodeToString(sum[:])
|
||||
}
|
||||
|
||||
func IsPlaceholder(p *DatasetPath, placeholderPropertyValue string) (isPlaceholder bool, err error) {
|
||||
expected := PlaceholderPropertyValue(p)
|
||||
isPlaceholder = expected == placeholderPropertyValue
|
||||
if !isPlaceholder {
|
||||
err = fmt.Errorf("expected %s, has %s", expected, placeholderPropertyValue)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// for nonexistent FS, isPlaceholder == false && err == nil
|
||||
func ZFSIsPlaceholderFilesystem(p *DatasetPath) (isPlaceholder bool, err error) {
|
||||
props, err := zfsGet(p.ToString(), []string{ZREPL_PLACEHOLDER_PROPERTY_NAME}, sourceAny)
|
||||
if err == io.ErrUnexpectedEOF {
|
||||
// interpret this as an early exit of the zfs binary due to the fs not existing
|
||||
return false, nil
|
||||
} else if err != nil {
|
||||
return false, err
|
||||
}
|
||||
isPlaceholder, _ = IsPlaceholder(p, props.Get(ZREPL_PLACEHOLDER_PROPERTY_NAME))
|
||||
return
|
||||
}
|
||||
|
||||
func ZFSCreatePlaceholderFilesystem(p *DatasetPath) (err error) {
|
||||
v := PlaceholderPropertyValue(p)
|
||||
cmd := exec.Command(ZFS_BINARY, "create",
|
||||
"-o", fmt.Sprintf("%s=%s", ZREPL_PLACEHOLDER_PROPERTY_NAME, v),
|
||||
"-o", "mountpoint=none",
|
||||
p.ToString())
|
||||
|
||||
stderr := bytes.NewBuffer(make([]byte, 0, 1024))
|
||||
cmd.Stderr = stderr
|
||||
|
||||
if err = cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = cmd.Wait(); err != nil {
|
||||
err = &ZFSError{
|
||||
Stderr: stderr.Bytes(),
|
||||
WaitErr: err,
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
112
zfs/diff_test.go
112
zfs/diff_test.go
@ -1,112 +0,0 @@
|
||||
package zfs
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func fsvlist(fsv ...string) (r []FilesystemVersion) {
|
||||
|
||||
r = make([]FilesystemVersion, len(fsv))
|
||||
for i, f := range fsv {
|
||||
|
||||
// parse the id from fsvlist. it is used to derivce Guid,CreateTXG and Creation attrs
|
||||
split := strings.Split(f, ",")
|
||||
if len(split) != 2 {
|
||||
panic("invalid fsv spec")
|
||||
}
|
||||
id, err := strconv.Atoi(split[1])
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if strings.HasPrefix(f, "#") {
|
||||
r[i] = FilesystemVersion{
|
||||
Name: strings.TrimPrefix(f, "#"),
|
||||
Type: Bookmark,
|
||||
Guid: uint64(id),
|
||||
CreateTXG: uint64(id),
|
||||
Creation: time.Unix(0, 0).Add(time.Duration(id) * time.Second),
|
||||
}
|
||||
} else if strings.HasPrefix(f, "@") {
|
||||
r[i] = FilesystemVersion{
|
||||
Name: strings.TrimPrefix(f, "@"),
|
||||
Type: Snapshot,
|
||||
Guid: uint64(id),
|
||||
CreateTXG: uint64(id),
|
||||
Creation: time.Unix(0, 0).Add(time.Duration(id) * time.Second),
|
||||
}
|
||||
} else {
|
||||
panic("invalid character")
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func doTest(left, right []FilesystemVersion, validate func(d FilesystemDiff)) {
|
||||
var d FilesystemDiff
|
||||
d = MakeFilesystemDiff(left, right)
|
||||
validate(d)
|
||||
}
|
||||
|
||||
func TestMakeFilesystemDiff_IncrementalSnapshots(t *testing.T) {
|
||||
|
||||
l := fsvlist
|
||||
|
||||
// basic functionality
|
||||
doTest(l("@a,1", "@b,2"), l("@a,1", "@b,2", "@c,3", "@d,4"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("@b,2", "@c,3", "@d,4"), d.IncrementalPath)
|
||||
})
|
||||
|
||||
// no common ancestor
|
||||
doTest(l(), l("@a,1"), func(d FilesystemDiff) {
|
||||
assert.Nil(t, d.IncrementalPath)
|
||||
assert.EqualValues(t, d.Conflict, ConflictNoCommonAncestor)
|
||||
assert.Equal(t, l("@a,1"), d.MRCAPathRight)
|
||||
})
|
||||
doTest(l("@a,1", "@b,2"), l("@c,3", "@d,4"), func(d FilesystemDiff) {
|
||||
assert.Nil(t, d.IncrementalPath)
|
||||
assert.EqualValues(t, d.Conflict, ConflictNoCommonAncestor)
|
||||
assert.Equal(t, l("@c,3", "@d,4"), d.MRCAPathRight)
|
||||
})
|
||||
|
||||
// divergence is detected
|
||||
doTest(l("@a,1", "@b1,2"), l("@a,1", "@b2,3"), func(d FilesystemDiff) {
|
||||
assert.Nil(t, d.IncrementalPath)
|
||||
assert.EqualValues(t, d.Conflict, ConflictDiverged)
|
||||
assert.Equal(t, l("@a,1", "@b1,2"), d.MRCAPathLeft)
|
||||
assert.Equal(t, l("@a,1", "@b2,3"), d.MRCAPathRight)
|
||||
})
|
||||
|
||||
// gaps before most recent common ancestor do not matter
|
||||
doTest(l("@a,1", "@b,2", "@c,3"), l("@a,1", "@c,3", "@d,4"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("@c,3", "@d,4"), d.IncrementalPath)
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
func TestMakeFilesystemDiff_BookmarksSupport(t *testing.T) {
|
||||
l := fsvlist
|
||||
|
||||
// bookmarks are used
|
||||
doTest(l("@a,1"), l("#a,1", "@b,2"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("#a,1", "@b,2"), d.IncrementalPath)
|
||||
})
|
||||
|
||||
// boomarks are stripped from IncrementalPath (cannot send incrementally)
|
||||
doTest(l("@a,1"), l("#a,1", "#b,2", "@c,3"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("#a,1", "@c,3"), d.IncrementalPath)
|
||||
})
|
||||
|
||||
// test that snapshots are preferred over bookmarks in IncrementalPath
|
||||
doTest(l("@a,1"), l("#a,1", "@a,1", "@b,2"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("@a,1", "@b,2"), d.IncrementalPath)
|
||||
})
|
||||
doTest(l("@a,1"), l("@a,1", "#a,1", "@b,2"), func(d FilesystemDiff) {
|
||||
assert.Equal(t, l("@a,1", "@b,2"), d.IncrementalPath)
|
||||
})
|
||||
|
||||
}
|
113
zfs/placeholder.go
Normal file
113
zfs/placeholder.go
Normal file
@ -0,0 +1,113 @@
|
||||
package zfs
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha512"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
const ZREPL_PLACEHOLDER_PROPERTY_NAME string = "zrepl:placeholder"
|
||||
|
||||
type FilesystemState struct {
|
||||
Placeholder bool
|
||||
// TODO extend with resume token when that feature is finally added
|
||||
}
|
||||
|
||||
// A somewhat efficient way to determine if a filesystem exists on this host.
|
||||
// Particularly useful if exists is called more than once (will only fork exec once and cache the result)
|
||||
func ZFSListFilesystemState() (localState map[string]FilesystemState, err error) {
|
||||
|
||||
var actual [][]string
|
||||
if actual, err = ZFSList([]string{"name", ZREPL_PLACEHOLDER_PROPERTY_NAME}, "-t", "filesystem,volume"); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
localState = make(map[string]FilesystemState, len(actual))
|
||||
for _, e := range actual {
|
||||
dp, err := NewDatasetPath(e[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ZFS does not return parseable dataset path: %s", e[0])
|
||||
}
|
||||
placeholder, _ := IsPlaceholder(dp, e[1])
|
||||
localState[e[0]] = FilesystemState{
|
||||
placeholder,
|
||||
}
|
||||
}
|
||||
return
|
||||
|
||||
}
|
||||
|
||||
// Computes the value for the ZREPL_PLACEHOLDER_PROPERTY_NAME ZFS user property
|
||||
// to mark the given DatasetPath p as a placeholder
|
||||
//
|
||||
// We cannot simply use booleans here since user properties are always
|
||||
// inherited.
|
||||
//
|
||||
// We hash the DatasetPath and use it to check for a given path if it is the
|
||||
// one originally marked as placeholder.
|
||||
//
|
||||
// However, this prohibits moving datasets around via `zfs rename`. The
|
||||
// placeholder attribute must be re-computed for the dataset path after the
|
||||
// move.
|
||||
//
|
||||
// TODO better solution available?
|
||||
func PlaceholderPropertyValue(p *DatasetPath) string {
|
||||
ps := []byte(p.ToString())
|
||||
sum := sha512.Sum512_256(ps)
|
||||
return hex.EncodeToString(sum[:])
|
||||
}
|
||||
|
||||
func IsPlaceholder(p *DatasetPath, placeholderPropertyValue string) (isPlaceholder bool, err error) {
|
||||
expected := PlaceholderPropertyValue(p)
|
||||
isPlaceholder = expected == placeholderPropertyValue
|
||||
if !isPlaceholder {
|
||||
err = fmt.Errorf("expected %s, has %s", expected, placeholderPropertyValue)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// for nonexistent FS, isPlaceholder == false && err == nil
|
||||
func ZFSIsPlaceholderFilesystem(p *DatasetPath) (isPlaceholder bool, err error) {
|
||||
props, err := zfsGet(p.ToString(), []string{ZREPL_PLACEHOLDER_PROPERTY_NAME}, sourceAny)
|
||||
if err == io.ErrUnexpectedEOF {
|
||||
// interpret this as an early exit of the zfs binary due to the fs not existing
|
||||
return false, nil
|
||||
} else if err != nil {
|
||||
return false, err
|
||||
}
|
||||
isPlaceholder, _ = IsPlaceholder(p, props.Get(ZREPL_PLACEHOLDER_PROPERTY_NAME))
|
||||
return
|
||||
}
|
||||
|
||||
func ZFSCreatePlaceholderFilesystem(p *DatasetPath) (err error) {
|
||||
v := PlaceholderPropertyValue(p)
|
||||
cmd := exec.Command(ZFS_BINARY, "create",
|
||||
"-o", fmt.Sprintf("%s=%s", ZREPL_PLACEHOLDER_PROPERTY_NAME, v),
|
||||
"-o", "mountpoint=none",
|
||||
p.ToString())
|
||||
|
||||
stderr := bytes.NewBuffer(make([]byte, 0, 1024))
|
||||
cmd.Stderr = stderr
|
||||
|
||||
if err = cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = cmd.Wait(); err != nil {
|
||||
err = &ZFSError{
|
||||
Stderr: stderr.Bytes(),
|
||||
WaitErr: err,
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func ZFSSetNoPlaceholder(p *DatasetPath) error {
|
||||
props := NewZFSProperties()
|
||||
props.Set(ZREPL_PLACEHOLDER_PROPERTY_NAME, "off")
|
||||
return zfsSet(p.ToString(), props)
|
||||
}
|
82
zfs/zfs.go
82
zfs/zfs.go
@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@ -691,17 +692,62 @@ type StreamCopier interface {
|
||||
Close() error
|
||||
}
|
||||
|
||||
type RecvOptions struct {
|
||||
// Rollback to the oldest snapshot, destroy it, then perform `recv -F`.
|
||||
// Note that this doesn't change property values, i.e. an existing local property value will be kept.
|
||||
RollbackAndForceRecv bool
|
||||
}
|
||||
|
||||
func ZFSRecv(ctx context.Context, fs string, streamCopier StreamCopier, additionalArgs ...string) (err error) {
|
||||
func ZFSRecv(ctx context.Context, fs string, streamCopier StreamCopier, opts RecvOptions) (err error) {
|
||||
|
||||
if err := validateZFSFilesystem(fs); err != nil {
|
||||
return err
|
||||
}
|
||||
fsdp, err := NewDatasetPath(fs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if opts.RollbackAndForceRecv {
|
||||
// destroy all snapshots before `recv -F` because `recv -F`
|
||||
// does not perform a rollback unless `send -R` was used (which we assume hasn't been the case)
|
||||
var snaps []FilesystemVersion
|
||||
{
|
||||
vs, err := ZFSListFilesystemVersions(fsdp, nil)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot list versions to rollback is required: %s", err)
|
||||
}
|
||||
for _, v := range vs {
|
||||
if v.Type == Snapshot {
|
||||
snaps = append(snaps, v)
|
||||
}
|
||||
}
|
||||
sort.Slice(snaps, func(i, j int) bool {
|
||||
return snaps[i].CreateTXG < snaps[j].CreateTXG
|
||||
})
|
||||
}
|
||||
// bookmarks are rolled back automatically
|
||||
if len(snaps) > 0 {
|
||||
// use rollback to efficiently destroy all but the earliest snapshot
|
||||
// then destroy that earliest snapshot
|
||||
// afterwards, `recv -F` will work
|
||||
rollbackTarget := snaps[0]
|
||||
rollbackTargetAbs := rollbackTarget.ToAbsPath(fsdp)
|
||||
debug("recv: rollback to %q", rollbackTargetAbs)
|
||||
if err := ZFSRollback(fsdp, rollbackTarget, "-r"); err != nil {
|
||||
return fmt.Errorf("cannot rollback %s to %s for forced receive: %s", fsdp.ToString(), rollbackTarget, err)
|
||||
}
|
||||
debug("recv: destroy %q", rollbackTargetAbs)
|
||||
if err := ZFSDestroy(rollbackTargetAbs); err != nil {
|
||||
return fmt.Errorf("cannot destroy %s for forced receive: %s", rollbackTargetAbs, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
args := make([]string, 0)
|
||||
args = append(args, "recv")
|
||||
if len(args) > 0 {
|
||||
args = append(args, additionalArgs...)
|
||||
if opts.RollbackAndForceRecv {
|
||||
args = append(args, "-F")
|
||||
}
|
||||
args = append(args, fs)
|
||||
|
||||
@ -1038,3 +1084,33 @@ func ZFSBookmark(fs *DatasetPath, snapshot, bookmark string) (err error) {
|
||||
return
|
||||
|
||||
}
|
||||
|
||||
func ZFSRollback(fs *DatasetPath, snapshot FilesystemVersion, rollbackArgs ...string) (err error) {
|
||||
|
||||
snapabs := snapshot.ToAbsPath(fs)
|
||||
if snapshot.Type != Snapshot {
|
||||
return fmt.Errorf("can only rollback to snapshots, got %s", snapabs)
|
||||
}
|
||||
|
||||
args := []string{"rollback"}
|
||||
args = append(args, rollbackArgs...)
|
||||
args = append(args, snapabs)
|
||||
|
||||
cmd := exec.Command(ZFS_BINARY, args...)
|
||||
|
||||
stderr := bytes.NewBuffer(make([]byte, 0, 1024))
|
||||
cmd.Stderr = stderr
|
||||
|
||||
if err = cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = cmd.Wait(); err != nil {
|
||||
err = &ZFSError{
|
||||
Stderr: stderr.Bytes(),
|
||||
WaitErr: err,
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user