zrepl/rpc/rpc_server.go

113 lines
3.6 KiB
Go
Raw Normal View History

package rpc
import (
"context"
"time"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn"
rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error Symptom: zrepl log message: rpc error: code = Unavailable desc = transport is closing Underlying Problem: * rpc.NewServer was not using grpchelper.NewServer and not setting Server KeepaliveParams by itself * and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy * However, KeepaliveEnforcementPolicy is necessary if the client keepalive is configured with non-default values * .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient * rpc.Client was sending pings * lacking server-side KeepaliveEnforcementPolicy caused grpc-hard-coded `pingStrikes` counter to go past limit of 2: https://github.com/grpc/grpc-go/blob/021bd5734e43b1b11073ea326de29af4de3dfa9b/internal/transport/http2_server.go#L726 How was this debugged: * GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon * with a patch on grpc package to get more log messages on pingStrikes increases: diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go index 8b04b039..f68f55ea 100644 --- a/internal/transport/http2_server.go +++ b/internal/transport/http2_server.go @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err if kep.MinTime == 0 { kep.MinTime = defaultKeepalivePolicyMinTime } + errorf("effective keepalive enforcement policy: %#v", kep) done := make(chan struct{}) t := &http2Server{ ctx: context.Background(), @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { t.controlBuf.put(pingAck) now := time.Now() + errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt)) defer func() { t.lastPingAt = now }() @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { // Keepalive shouldn't be active thus, this new ping should // have come after at least defaultPingTimeout. if t.lastPingAt.Add(defaultPingTimeout).After(now) { + errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream") t.pingStrikes++ } } else { // Check if keepalive policy is respected. if t.lastPingAt.Add(t.kep.MinTime).After(now) { + errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns) t.pingStrikes++ } } fixes #181
2020-01-04 21:00:00 +01:00
"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
"github.com/zrepl/zrepl/rpc/versionhandshake"
"github.com/zrepl/zrepl/transport"
"github.com/zrepl/zrepl/util/envconst"
)
type Handler interface {
pdu.ReplicationServer
dataconn.Handler
}
type serveFunc func(ctx context.Context, demuxedListener transport.AuthenticatedListener, errOut chan<- error)
// Server abstracts the accept and request routing infrastructure for the
// passive side of a replication setup.
type Server struct {
logger Logger
handler Handler
controlServerServe serveFunc
dataServer *dataconn.Server
dataServerServe serveFunc
}
type HandlerContextInterceptor func(ctx context.Context) context.Context
// config must be valid (use its Validate function).
func NewServer(handler Handler, loggers Loggers, ctxInterceptor HandlerContextInterceptor) *Server {
// setup control server
controlServerServe := func(ctx context.Context, controlListener transport.AuthenticatedListener, errOut chan<- error) {
rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error Symptom: zrepl log message: rpc error: code = Unavailable desc = transport is closing Underlying Problem: * rpc.NewServer was not using grpchelper.NewServer and not setting Server KeepaliveParams by itself * and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy * However, KeepaliveEnforcementPolicy is necessary if the client keepalive is configured with non-default values * .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient * rpc.Client was sending pings * lacking server-side KeepaliveEnforcementPolicy caused grpc-hard-coded `pingStrikes` counter to go past limit of 2: https://github.com/grpc/grpc-go/blob/021bd5734e43b1b11073ea326de29af4de3dfa9b/internal/transport/http2_server.go#L726 How was this debugged: * GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon * with a patch on grpc package to get more log messages on pingStrikes increases: diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go index 8b04b039..f68f55ea 100644 --- a/internal/transport/http2_server.go +++ b/internal/transport/http2_server.go @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err if kep.MinTime == 0 { kep.MinTime = defaultKeepalivePolicyMinTime } + errorf("effective keepalive enforcement policy: %#v", kep) done := make(chan struct{}) t := &http2Server{ ctx: context.Background(), @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { t.controlBuf.put(pingAck) now := time.Now() + errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt)) defer func() { t.lastPingAt = now }() @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { // Keepalive shouldn't be active thus, this new ping should // have come after at least defaultPingTimeout. if t.lastPingAt.Add(defaultPingTimeout).After(now) { + errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream") t.pingStrikes++ } } else { // Check if keepalive policy is respected. if t.lastPingAt.Add(t.kep.MinTime).After(now) { + errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns) t.pingStrikes++ } } fixes #181
2020-01-04 21:00:00 +01:00
controlServer, serve := grpchelper.NewServer(controlListener, endpoint.ClientIdentityKey, loggers.Control, ctxInterceptor)
rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error Symptom: zrepl log message: rpc error: code = Unavailable desc = transport is closing Underlying Problem: * rpc.NewServer was not using grpchelper.NewServer and not setting Server KeepaliveParams by itself * and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy * However, KeepaliveEnforcementPolicy is necessary if the client keepalive is configured with non-default values * .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient * rpc.Client was sending pings * lacking server-side KeepaliveEnforcementPolicy caused grpc-hard-coded `pingStrikes` counter to go past limit of 2: https://github.com/grpc/grpc-go/blob/021bd5734e43b1b11073ea326de29af4de3dfa9b/internal/transport/http2_server.go#L726 How was this debugged: * GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon * with a patch on grpc package to get more log messages on pingStrikes increases: diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go index 8b04b039..f68f55ea 100644 --- a/internal/transport/http2_server.go +++ b/internal/transport/http2_server.go @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err if kep.MinTime == 0 { kep.MinTime = defaultKeepalivePolicyMinTime } + errorf("effective keepalive enforcement policy: %#v", kep) done := make(chan struct{}) t := &http2Server{ ctx: context.Background(), @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { t.controlBuf.put(pingAck) now := time.Now() + errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt)) defer func() { t.lastPingAt = now }() @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { // Keepalive shouldn't be active thus, this new ping should // have come after at least defaultPingTimeout. if t.lastPingAt.Add(defaultPingTimeout).After(now) { + errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream") t.pingStrikes++ } } else { // Check if keepalive policy is respected. if t.lastPingAt.Add(t.kep.MinTime).After(now) { + errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns) t.pingStrikes++ } } fixes #181
2020-01-04 21:00:00 +01:00
pdu.RegisterReplicationServer(controlServer, handler)
// give time for graceful stop until deadline expires, then hard stop
go func() {
<-ctx.Done()
if dl, ok := ctx.Deadline(); ok {
go time.AfterFunc(dl.Sub(dl), controlServer.Stop)
}
loggers.Control.Debug("shutting down control server")
controlServer.GracefulStop()
}()
rpc: use grpchelper package, add grpc.KeepaliveEnforcementPolicy, fix 'transport is closing' error Symptom: zrepl log message: rpc error: code = Unavailable desc = transport is closing Underlying Problem: * rpc.NewServer was not using grpchelper.NewServer and not setting Server KeepaliveParams by itself * and even grpchelper.NewServer didn't set a KeepaliveEnforcementPolicy * However, KeepaliveEnforcementPolicy is necessary if the client keepalive is configured with non-default values * .. which grpchelper.ClientConn does, and that method is used by rpc.NewClient * rpc.Client was sending pings * lacking server-side KeepaliveEnforcementPolicy caused grpc-hard-coded `pingStrikes` counter to go past limit of 2: https://github.com/grpc/grpc-go/blob/021bd5734e43b1b11073ea326de29af4de3dfa9b/internal/transport/http2_server.go#L726 How was this debugged: * GRPC_GO_LOG_VERBOSITY_LEVEL=99 GRPC_GO_LOG_SEVERITY_LEVEL=info PATH=/root/mockpath:$PATH zrepl daemon * with a patch on grpc package to get more log messages on pingStrikes increases: diff --git a/internal/transport/http2_server.go b/internal/transport/http2_server.go index 8b04b039..f68f55ea 100644 --- a/internal/transport/http2_server.go +++ b/internal/transport/http2_server.go @@ -214,6 +214,7 @@ func newHTTP2Server(conn net.Conn, config *ServerConfig) (_ ServerTransport, err if kep.MinTime == 0 { kep.MinTime = defaultKeepalivePolicyMinTime } + errorf("effective keepalive enforcement policy: %#v", kep) done := make(chan struct{}) t := &http2Server{ ctx: context.Background(), @@ -696,6 +697,7 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { t.controlBuf.put(pingAck) now := time.Now() + errorf("transport:ping handlePing, last ping %s ago", now.Sub(t.lastPingAt)) defer func() { t.lastPingAt = now }() @@ -713,11 +715,13 @@ func (t *http2Server) handlePing(f *http2.PingFrame) { // Keepalive shouldn't be active thus, this new ping should // have come after at least defaultPingTimeout. if t.lastPingAt.Add(defaultPingTimeout).After(now) { + errorf("transport:ping strike ns < 1 && !t.kep.PermitWithoutStream") t.pingStrikes++ } } else { // Check if keepalive policy is respected. if t.lastPingAt.Add(t.kep.MinTime).After(now) { + errorf("transport:ping strike !(ns < 1 && !t.kep.PermitWithoutStream) kep.MinTime=%s ns=%d", t.kep.MinTime, ns) t.pingStrikes++ } } fixes #181
2020-01-04 21:00:00 +01:00
errOut <- serve()
}
// setup data server
dataServerClientIdentitySetter := func(ctx context.Context, wire *transport.AuthConn) (context.Context, *transport.AuthConn) {
ci := wire.ClientIdentity()
ctx = context.WithValue(ctx, endpoint.ClientIdentityKey, ci)
if ctxInterceptor != nil {
ctx = ctxInterceptor(ctx) // SHADOWING
}
return ctx, wire
}
dataServer := dataconn.NewServer(dataServerClientIdentitySetter, loggers.Data, handler)
dataServerServe := func(ctx context.Context, dataListener transport.AuthenticatedListener, errOut chan<- error) {
dataServer.Serve(ctx, dataListener)
errOut <- nil // TODO bad design of dataServer?
}
server := &Server{
logger: loggers.General,
handler: handler,
controlServerServe: controlServerServe,
dataServer: dataServer,
dataServerServe: dataServerServe,
}
return server
}
// The context is used for cancellation only.
// Serve never returns an error, it logs them to the Server's logger.
func (s *Server) Serve(ctx context.Context, l transport.AuthenticatedListener) {
ctx, cancel := context.WithCancel(ctx)
l = versionhandshake.Listener(l, envconst.Duration("ZREPL_RPC_SERVER_VERSIONHANDSHAKE_TIMEOUT", 10*time.Second))
// it is important that demux's context is cancelled,
// it has background goroutines attached
demuxListener := demux(ctx, l)
serveErrors := make(chan error, 2)
go s.controlServerServe(ctx, demuxListener.control, serveErrors)
go s.dataServerServe(ctx, demuxListener.data, serveErrors)
select {
case serveErr := <-serveErrors:
s.logger.WithError(serveErr).Error("serve error")
s.logger.Debug("wait for other server to shut down")
cancel()
secondServeErr := <-serveErrors
s.logger.WithError(secondServeErr).Error("serve error")
case <-ctx.Done():
s.logger.Debug("context cancelled, wait for control and data servers")
cancel()
for i := 0; i < 2; i++ {
<-serveErrors
}
s.logger.Debug("control and data server shut down, returning from Serve")
}
}