mirror of
https://github.com/zrepl/zrepl.git
synced 2025-01-18 12:18:19 +01:00
parent
082335df5d
commit
d6304f4f1d
@ -222,6 +222,10 @@ func (c *Conn) writeFrame(payload []byte, frameType uint32) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Conn) ResetWriteTimeout() error {
|
||||
return c.nc.RenewWriteDeadline()
|
||||
}
|
||||
|
||||
func (c *Conn) Shutdown(deadline time.Time) error {
|
||||
// TCP connection teardown is a bit wonky if we are in a situation
|
||||
// where there is still data in flight (DIF) to our side:
|
||||
|
@ -123,7 +123,17 @@ func (c *Conn) readFrameFiltered() (frameconn.Frame, error) {
|
||||
return frameconn.Frame{}, fmt.Errorf("unknown frame type %x", f.Header.Type)
|
||||
}
|
||||
// drop heartbeat frame
|
||||
debug("received heartbeat")
|
||||
debug("received heartbeat, resetting write timeout")
|
||||
// the peer's heartbeat proves to us that the peer is still live
|
||||
// => trust the peer at this point (DoS risks are ignored ATM)
|
||||
// => we assume that the connection is symmetric duplex, i.e., if receiving works for us,
|
||||
// sending works for us, too.
|
||||
// So, let's grant the peer another write timeout.
|
||||
err = c.fc.ResetWriteTimeout()
|
||||
debug("renew frameconn write timeout returned errT=%T err=%s", err, err)
|
||||
if err != nil {
|
||||
return frameconn.Frame{}, err
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,144 @@
|
||||
// This integration test exercises the behavior of heartbeatconn
|
||||
// where the server is slow at handling the data received over the connection.
|
||||
// Note that the server is still sending heartbeats to the client, it's just the
|
||||
// data handling (usually I/O in case of zrepl endpoint.Receiver) that is slow.
|
||||
//
|
||||
// In commit 082335df5d85e1b0b9faa35ff182c71886142d3e and earlier, heartbeatconn would fail
|
||||
// this benchmark with a writev I/O timeout (here the ss(8) output at the time of failure)
|
||||
//
|
||||
// ESTAB 33369 0 127.0.0.1:12345 127.0.0.1:57282 users:(("heartbeatconn_i",pid=25953,fd=5))
|
||||
// cubic wscale:7,7 rto:203 rtt:2.992/5.849 ato:162 mss:32768 pmtu:65535 rcvmss:32741 advmss:65483 cwnd:10 bytes_sent:48 bytes_acked:48 bytes_received:195401 segs_out:44 segs_in:57 data_segs_out:6 data_segs_in:34 send 876.1Mbps lastsnd:125 lastrcv:9390 lastack:125 pacing_rate 1752.0Mbps delivery_rate 6393.8Mbps delivered:7 app_limited busy:42ms rcv_rtt:1 rcv_space:65483 rcv_ssthresh:65483 minrtt:0.029
|
||||
// --
|
||||
// ESTAB 0 3956805 127.0.0.1:57282 127.0.0.1:12345 users:(("heartbeatconn_i",pid=26100,fd=3))
|
||||
// cubic wscale:7,7 rto:211 backoff:5 rtt:10.38/16.937 ato:40 mss:32768 pmtu:65535 rcvmss:536 advmss:65483 cwnd:10 bytes_sent:195401 bytes_acked:195402 bytes_received:48 segs_out:57 segs_in:45 data_segs_out:34 data_segs_in:6 send 252.5Mbps lastsnd:9390 lastrcv:125 lastack:125 pacing_rate 505.1Mbps delivery_rate 1971.0Mbps delivered:35 busy:30127ms rwnd_limited:30086ms(99.9%) rcv_space:65495 rcv_ssthresh:65495 notsent:3956805 minrtt:0.007
|
||||
// panic: writev tcp 127.0.0.1:57282->127.0.0.1:12345: i/o timeout
|
||||
//
|
||||
// The assumed reason for those writev timeouts is the following:
|
||||
// - Sporadic server stalls (sever data handling, usually I/O) cause TCP exponential backoff on the client for client->server
|
||||
// - Go runtime unblocks after the deadline expires, resultin gin writev I/O timeout
|
||||
// - That is, even though the client observed heartbeats from the server
|
||||
// -> TCP doesn't assume symmetric connection behavior, but our implementation does.
|
||||
//
|
||||
// The fix contained in the commit this message was committed with resets the deadline whenever
|
||||
// a heartbeat is received from the server.
|
||||
//
|
||||
//
|
||||
// How to run this integration test:
|
||||
//
|
||||
//
|
||||
// Terminal 1:
|
||||
// $ ZREPL_RPC_DATACONN_HEARTBEATCONN_DEBUG=1 go run heartbeatconn_integration_variablereceiverate.go -mode server -addr 127.0.0.1:12345
|
||||
// rpc/dataconn/heartbeatconn: send heartbeat
|
||||
// rpc/dataconn/heartbeatconn: send heartbeat
|
||||
// ...
|
||||
//
|
||||
// Terminal 2:
|
||||
// $ ZREPL_RPC_DATACONN_HEARTBEATCONN_DEBUG=1 go run heartbeatconn_integration_variablereceiverate.go -mode client -addr 127.0.0.1:12345
|
||||
// rpc/dataconn/heartbeatconn: received heartbeat, resetting write timeout
|
||||
// rpc/dataconn/heartbeatconn: renew frameconn write timeout returned errT=<nil> err=%!s(<nil>)
|
||||
// rpc/dataconn/heartbeatconn: send heartbeat
|
||||
// rpc/dataconn/heartbeatconn: received heartbeat, resetting write timeout
|
||||
// rpc/dataconn/heartbeatconn: renew frameconn write timeout returned errT=<nil> err=%!s(<nil>)
|
||||
// rpc/dataconn/heartbeatconn: received heartbeat, resetting write timeout
|
||||
// ...
|
||||
//
|
||||
// You should observe
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"time"
|
||||
|
||||
"github.com/zrepl/zrepl/util/devnoop"
|
||||
|
||||
"github.com/zrepl/zrepl/rpc/dataconn/heartbeatconn"
|
||||
)
|
||||
|
||||
func orDie(err error) {
|
||||
if err != nil {
|
||||
grepfield := path.Base(os.Args[0])[:10]
|
||||
fmt.Fprintf(os.Stderr, "grepping for %s\n", grepfield)
|
||||
sh := fmt.Sprintf("ss -ntpi | grep -A1 %s", grepfield)
|
||||
cmd := exec.Command("bash", "-c", sh)
|
||||
o, _ := cmd.CombinedOutput()
|
||||
buf := bytes.NewBuffer(o)
|
||||
_, _ = io.Copy(os.Stderr, buf)
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
var mode string
|
||||
var addr string
|
||||
|
||||
func main() {
|
||||
|
||||
flag.StringVar(&mode, "mode", "", "server|client")
|
||||
flag.StringVar(&addr, "addr", "INVALID", "")
|
||||
flag.Parse()
|
||||
|
||||
modemap := map[string]func(){
|
||||
"server": server,
|
||||
"client": client,
|
||||
}
|
||||
modemap[mode]()
|
||||
|
||||
}
|
||||
|
||||
func server() {
|
||||
ln, err := net.Listen("tcp", addr)
|
||||
orDie(err)
|
||||
l := ln.(*net.TCPListener)
|
||||
for {
|
||||
c, err := l.AcceptTCP()
|
||||
if err != nil {
|
||||
log.Printf("accept err: %s", err)
|
||||
continue
|
||||
}
|
||||
hc := heartbeatconn.Wrap(c, 5*time.Second, 10*time.Second)
|
||||
|
||||
for {
|
||||
f, err := hc.ReadFrame()
|
||||
orDie(err)
|
||||
// _, err = buf.Write(f.Buffer.Bytes())
|
||||
// orDie(err)
|
||||
sleep := time.Duration(rand.NormFloat64()*500) * time.Millisecond
|
||||
time.Sleep(sleep)
|
||||
f.Buffer.Free()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func client() {
|
||||
c, err := net.Dial("tcp", addr)
|
||||
orDie(err)
|
||||
hc := heartbeatconn.Wrap(c.(*net.TCPConn), 5*time.Second, 10*time.Second)
|
||||
|
||||
// follow API requirements to always ReadFrame
|
||||
go func() {
|
||||
for {
|
||||
f, err := hc.ReadFrame()
|
||||
orDie(err)
|
||||
f.Buffer.Free()
|
||||
}
|
||||
}()
|
||||
|
||||
dn := devnoop.Get()
|
||||
var buf [1 << 10]byte
|
||||
for {
|
||||
n, err := dn.Read(buf[:])
|
||||
orDie(err)
|
||||
err = hc.WriteFrame(buf[:n], 23)
|
||||
orDie(err)
|
||||
}
|
||||
}
|
@ -80,7 +80,7 @@ func (c *Conn) renewReadDeadline() error {
|
||||
return c.SetReadDeadline(time.Now().Add(c.idleTimeout))
|
||||
}
|
||||
|
||||
func (c *Conn) renewWriteDeadline() error {
|
||||
func (c *Conn) RenewWriteDeadline() error {
|
||||
if atomic.LoadInt32(&c.renewDeadlinesDisabled) != 0 {
|
||||
return nil
|
||||
}
|
||||
@ -107,7 +107,7 @@ restart:
|
||||
func (c Conn) Write(p []byte) (n int, err error) {
|
||||
n = 0
|
||||
restart:
|
||||
if err := c.renewWriteDeadline(); err != nil {
|
||||
if err := c.RenewWriteDeadline(); err != nil {
|
||||
return n, err
|
||||
}
|
||||
var nCurWrite int
|
||||
@ -127,7 +127,7 @@ restart:
|
||||
func (c Conn) WritevFull(bufs net.Buffers) (n int64, err error) {
|
||||
n = 0
|
||||
restart:
|
||||
if err := c.renewWriteDeadline(); err != nil {
|
||||
if err := c.RenewWriteDeadline(); err != nil {
|
||||
return n, err
|
||||
}
|
||||
var nCurWrite int64
|
||||
|
Loading…
Reference in New Issue
Block a user