zrepl/rpc/rpc_server.go
Christian Schwarz 2927d0ca15 rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error
Symptom: zrepl log message:

    rpc error: code = Unavailable desc = transport is closing

Underlying Problem:

* rpc.NewServer was not using grpchelper.NewServer and did not set the server-side KeepaliveParams itself
* and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy
* However, a KeepaliveEnforcementPolicy is necessary if the client's keepalive is configured with non-default values
* .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient

* rpc.Client was sending pings
* the lack of a server-side KeepaliveEnforcementPolicy caused gRPC's hard-coded `pingStrikes` counter to exceed its limit of 2 (see the sketch below this list):
  021bd5734e/internal/transport/http2_server.go (L726)
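
For illustration, a minimal sketch of a client keepalive configuration in the spirit of grpchelper.ClientConn (hypothetical code, the helper name and durations are made up, not zrepl's actual values). Against a server that keeps grpc-go's defaults (enforcement MinTime of 5 minutes, PermitWithoutStream = false), each ping this client sends "too early" counts as a strike; past 2 strikes the server tears the transport down with a "too_many_pings" GOAWAY, which the client surfaces as "transport is closing":

    package keepalivedemo

    import (
        "time"

        "google.golang.org/grpc"
        "google.golang.org/grpc/keepalive"
    )

    // dialWithAggressiveKeepalive is a hypothetical helper, not zrepl code.
    func dialWithAggressiveKeepalive(addr string) (*grpc.ClientConn, error) {
        return grpc.Dial(addr,
            grpc.WithInsecure(),
            grpc.WithKeepaliveParams(keepalive.ClientParameters{
                Time:                10 * time.Second, // ping after 10s of inactivity
                Timeout:             5 * time.Second,  // wait this long for the ping ack
                PermitWithoutStream: true,             // ping even when no RPC is active
            }),
        )
    }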

How this was debugged:
* GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon
* with a patch on the grpc package that logs each pingStrikes increase:

    diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go
    index 8b04b039..f68f55ea 100644
    --- a/internal/transport/http2_server.go
    +++ b/internal/transport/http2_server.go
    @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err
            if kep.MinTime == 0 {
                    kep.MinTime = defaultKeepalivePolicyMinTime
            }
    +       errorf("effective keepalive enforcement policy: %#v", kep)
            done := make(chan struct{})
            t := &http2Server{
                    ctx:               context.Background(),
    @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) {
            t.controlBuf.put(pingAck)

            now := time.Now()
    +       errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt))
            defer func() {
                    t.lastPingAt = now
            }()
    @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) {
                    // Keepalive shouldn't be active thus, this new ping should
                    // have come after at least defaultPingTimeout.
                    if t.lastPingAt.Add(defaultPingTimeout).After(now) {
    +                       errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream")
                            t.pingStrikes++
                    }
            } else {
                    // Check if keepalive policy is respected.
                    if t.lastPingAt.Add(t.kep.MinTime).After(now) {
    +                       errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns)
                            t.pingStrikes++
                    }
            }
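
The fix, per the commit title, is for the server to opt in to the client's ping cadence via grpc.KeepaliveEnforcementPolicy. A sketch of the shape of the change (illustrative values, not the ones grpchelper actually uses):

    package keepalivedemo

    import (
        "time"

        "google.golang.org/grpc"
        "google.golang.org/grpc/keepalive"
    )

    // newServerWithEnforcementPolicy sketches the server-side fix.
    func newServerWithEnforcementPolicy() *grpc.Server {
        return grpc.NewServer(
            grpc.KeepaliveParams(keepalive.ServerParameters{
                Time: 20 * time.Second, // server-initiated keepalive pings
            }),
            grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
                MinTime:             5 * time.Second, // tolerate client pings this frequent
                PermitWithoutStream: true,            // required when clients ping without active RPCs
            }),
        )
    }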

fixes #181
2020-01-04 21:10:41 +01:00

package rpc

import (
	"context"
	"time"

	"github.com/zrepl/zrepl/endpoint"
	"github.com/zrepl/zrepl/replication/logic/pdu"
	"github.com/zrepl/zrepl/rpc/dataconn"
	"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
	"github.com/zrepl/zrepl/rpc/versionhandshake"
	"github.com/zrepl/zrepl/transport"
	"github.com/zrepl/zrepl/util/envconst"
)
// Handler is what the passive side must implement: the gRPC replication
// service for the control connection and the dataconn handler for the
// data connection.
type Handler interface {
	pdu.ReplicationServer
	dataconn.Handler
}

// serveFunc runs one of the demultiplexed sub-servers until ctx is done.
type serveFunc func(ctx context.Context, demuxedListener transport.AuthenticatedListener, errOut chan<- error)

// Server abstracts the accept and request routing infrastructure for the
// passive side of a replication setup.
type Server struct {
	logger             Logger
	handler            Handler
	controlServerServe serveFunc
	dataServer         *dataconn.Server
	dataServerServe    serveFunc
}

// HandlerContextInterceptor lets the caller attach additional values to the
// per-connection context before it reaches the Handler.
type HandlerContextInterceptor func(ctx context.Context) context.Context
// config must be valid (use its Validate function).
func NewServer(handler Handler, loggers Loggers, ctxInterceptor HandlerContextInterceptor) *Server {

	controlServerServe := func(ctx context.Context, controlListener transport.AuthenticatedListener, errOut chan<- error) {
		controlServer, serve := grpchelper.NewServer(controlListener, endpoint.ClientIdentityKey, loggers.Control)
		pdu.RegisterReplicationServer(controlServer, handler)

		// give time for graceful stop until deadline expires, then hard stop
		go func() {
			<-ctx.Done()
			if dl, ok := ctx.Deadline(); ok {
				time.AfterFunc(time.Until(dl), controlServer.Stop) // hard stop once the deadline expires
			}
			loggers.Control.Debug("shutting down control server")
			controlServer.GracefulStop()
		}()

		errOut <- serve()
	}

	// setup data server
	dataServerClientIdentitySetter := func(ctx context.Context, wire *transport.AuthConn) (context.Context, *transport.AuthConn) {
		ci := wire.ClientIdentity()
		ctx = context.WithValue(ctx, endpoint.ClientIdentityKey, ci)
		if ctxInterceptor != nil {
			ctx = ctxInterceptor(ctx) // SHADOWING
		}
		return ctx, wire
	}
	dataServer := dataconn.NewServer(dataServerClientIdentitySetter, loggers.Data, handler)
	dataServerServe := func(ctx context.Context, dataListener transport.AuthenticatedListener, errOut chan<- error) {
		dataServer.Serve(ctx, dataListener)
		errOut <- nil // TODO bad design of dataServer?
	}

	server := &Server{
		logger:             loggers.General,
		handler:            handler,
		controlServerServe: controlServerServe,
		dataServer:         dataServer,
		dataServerServe:    dataServerServe,
	}

	return server
}
// The context is used for cancellation only.
// Serve never returns an error, it logs them to the Server's logger.
func (s *Server) Serve(ctx context.Context, l transport.AuthenticatedListener) {
	ctx, cancel := context.WithCancel(ctx)

	l = versionhandshake.Listener(l, envconst.Duration("ZREPL_RPC_SERVER_VERSIONHANDSHAKE_TIMEOUT", 10*time.Second))

	// it is important that demux's context is cancelled,
	// it has background goroutines attached
	demuxListener := demux(ctx, l)

	serveErrors := make(chan error, 2)
	go s.controlServerServe(ctx, demuxListener.control, serveErrors)
	go s.dataServerServe(ctx, demuxListener.data, serveErrors)
	select {
	case serveErr := <-serveErrors:
		s.logger.WithError(serveErr).Error("serve error")
		s.logger.Debug("wait for other server to shut down")
		cancel()
		secondServeErr := <-serveErrors
		s.logger.WithError(secondServeErr).Error("serve error")
	case <-ctx.Done():
		s.logger.Debug("context cancelled, wait for control and data servers")
		cancel()
		for i := 0; i < 2; i++ {
			<-serveErrors
		}
		s.logger.Debug("control and data server shut down, returning from Serve")
	}
}