rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error

Symptom: zrepl log message:

    rpc error: code = Unavailable desc = transport is closing

Underlying Problem:

* rpc.NewServer was not using grpchelper.NewServer and not setting Server KeepaliveParams by itself
* and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy
* However, KeepaliveEnforcementPolicy is necessary if the client keepalive is configured with non-default values
* .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient

* rpc.Client was sending pings
* lacking server-side KeepaliveEnforcementPolicy caused grpc-hard-coded `pingStrikes` counter to go past limit of 2:
  021bd5734e/internal/transport/http2_server.go (L726)

How was this debugged:
* GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon
* with a patch on grpc package to get more log messages on pingStrikes increases:

    diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go
    index 8b04b039..f68f55ea 100644
    --- a/internal/transport/http2_server.go
    +++ b/internal/transport/http2_server.go
    @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err
            if kep.MinTime == 0 {
                    kep.MinTime = defaultKeepalivePolicyMinTime
            }
    +       errorf("effective keepalive enforcement policy: %#v", kep)
            done := make(chan struct{})
            t := &http2Server{
                    ctx:               context.Background(),
    @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) {
            t.controlBuf.put(pingAck)

            now := time.Now()
    +       errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt))
            defer func() {
                    t.lastPingAt = now
            }()
    @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) {
                    // Keepalive shouldn't be active thus, this new ping should
                    // have come after at least defaultPingTimeout.
                    if t.lastPingAt.Add(defaultPingTimeout).After(now) {
    +                       errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream")
                            t.pingStrikes++
                    }
            } else {
                    // Check if keepalive policy is respected.
                    if t.lastPingAt.Add(t.kep.MinTime).After(now) {
    +                       errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns)
                            t.pingStrikes++
                    }
            }

fixes #181
This commit is contained in:
Christian Schwarz 2020-01-04 21:00:00 +01:00
parent d35e2400b2
commit 2927d0ca15
3 changed files with 19 additions and 22 deletions

View File

@ -87,10 +87,13 @@ func server() {
log := logger.NewStderrDebugLogger()
srv, serve, err := grpchelper.NewServer(authListenerFactory, clientIdentityKey, log)
authListener, err := authListenerFactory()
if err != nil {
onErr(err, "new server")
onErr(err, "cannot listen")
}
srv, serve := grpchelper.NewServer(authListener, clientIdentityKey, log)
svc := &greeter{"hello "}
pdu.RegisterGreeterServer(srv, svc)

View File

@ -50,25 +50,25 @@ func ClientConn(cn transport.Connecter, log Logger) *grpc.ClientConn {
}
// NewServer is a convenience interface around the TransportCredentials and Interceptors interface.
func NewServer(authListenerFactory transport.AuthenticatedListenerFactory, clientIdentityKey interface{}, logger grpcclientidentity.Logger) (srv *grpc.Server, serve func() error, err error) {
func NewServer(authListener transport.AuthenticatedListener, clientIdentityKey interface{}, logger grpcclientidentity.Logger) (srv *grpc.Server, serve func() error) {
ka := grpc.KeepaliveParams(keepalive.ServerParameters{
Time: StartKeepalivesAfterInactivityDuration,
Timeout: KeepalivePeerTimeout,
})
ep := grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
MinTime: StartKeepalivesAfterInactivityDuration / 2, // avoid skew
PermitWithoutStream: true,
})
tcs := grpcclientidentity.NewTransportCredentials(logger)
unary, stream := grpcclientidentity.NewInterceptors(logger, clientIdentityKey)
srv = grpc.NewServer(grpc.Creds(tcs), grpc.UnaryInterceptor(unary), grpc.StreamInterceptor(stream), ka)
srv = grpc.NewServer(grpc.Creds(tcs), grpc.UnaryInterceptor(unary), grpc.StreamInterceptor(stream), ka, ep)
serve = func() error {
l, err := authListenerFactory()
if err != nil {
return err
}
if err := srv.Serve(netadaptor.New(l, logger)); err != nil {
if err := srv.Serve(netadaptor.New(authListener, logger)); err != nil {
return err
}
return nil
}
return srv, serve, nil
return srv, serve
}

View File

@ -4,13 +4,10 @@ import (
"context"
"time"
"google.golang.org/grpc"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn"
"github.com/zrepl/zrepl/rpc/grpcclientidentity"
"github.com/zrepl/zrepl/rpc/netadaptor"
"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
"github.com/zrepl/zrepl/rpc/versionhandshake"
"github.com/zrepl/zrepl/transport"
"github.com/zrepl/zrepl/util/envconst"
@ -28,7 +25,6 @@ type serveFunc func(ctx context.Context, demuxedListener transport.Authenticated
type Server struct {
logger Logger
handler Handler
controlServer *grpc.Server
controlServerServe serveFunc
dataServer *dataconn.Server
dataServerServe serveFunc
@ -39,12 +35,11 @@ type HandlerContextInterceptor func(ctx context.Context) context.Context
// config must be valid (use its Validate function).
func NewServer(handler Handler, loggers Loggers, ctxInterceptor HandlerContextInterceptor) *Server {
// setup control server
tcs := grpcclientidentity.NewTransportCredentials(loggers.Control) // TODO different subsystem for log
unary, stream := grpcclientidentity.NewInterceptors(loggers.Control, endpoint.ClientIdentityKey)
controlServer := grpc.NewServer(grpc.Creds(tcs), grpc.UnaryInterceptor(unary), grpc.StreamInterceptor(stream))
pdu.RegisterReplicationServer(controlServer, handler)
controlServerServe := func(ctx context.Context, controlListener transport.AuthenticatedListener, errOut chan<- error) {
controlServer, serve := grpchelper.NewServer(controlListener, endpoint.ClientIdentityKey, loggers.Control)
pdu.RegisterReplicationServer(controlServer, handler)
// give time for graceful stop until deadline expires, then hard stop
go func() {
<-ctx.Done()
@ -55,7 +50,7 @@ func NewServer(handler Handler, loggers Loggers, ctxInterceptor HandlerContextIn
controlServer.GracefulStop()
}()
errOut <- controlServer.Serve(netadaptor.New(controlListener, loggers.Control))
errOut <- serve()
}
// setup data server
@ -76,7 +71,6 @@ func NewServer(handler Handler, loggers Loggers, ctxInterceptor HandlerContextIn
server := &Server{
logger: loggers.General,
handler: handler,
controlServer: controlServer,
controlServerServe: controlServerServe,
dataServer: dataServer,
dataServerServe: dataServerServe,