zrepl/internal/rpc/dataconn/frameconn/frameconn.go
2024-10-18 19:21:17 +02:00

361 lines
12 KiB
Go

package frameconn
import (
"encoding/binary"
"errors"
"fmt"
"io"
"net"
"sync"
"syscall"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/internal/rpc/dataconn/base2bufpool"
"github.com/zrepl/zrepl/internal/rpc/dataconn/timeoutconn"
)
type FrameHeader struct {
Type uint32
PayloadLen uint32
}
// The 4 MSBs of ft are reserved for frameconn.
func IsPublicFrameType(ft uint32) bool {
return (0xf<<28)&ft == 0
}
const (
rstFrameType uint32 = 1<<28 + iota
)
func assertPublicFrameType(frameType uint32) {
if !IsPublicFrameType(frameType) {
panic(fmt.Sprintf("frameconn: frame type %v cannot be used by consumers of this package", frameType))
}
}
func (f *FrameHeader) Unmarshal(buf []byte) {
if len(buf) != 8 {
panic("frame header is 8 bytes long")
}
f.Type = binary.BigEndian.Uint32(buf[0:4])
f.PayloadLen = binary.BigEndian.Uint32(buf[4:8])
}
type Conn struct {
readMtx, writeMtx sync.Mutex
nc *timeoutconn.Conn
readNextValid bool
readNext FrameHeader
nextReadErr error
bufPool *base2bufpool.Pool // no need for sync around it
shutdown shutdownFSM
}
func Wrap(nc *timeoutconn.Conn) *Conn {
return &Conn{
nc: nc,
// ncBuf: bufio.NewReadWriter(bufio.NewReaderSize(nc, 1<<23), bufio.NewWriterSize(nc, 1<<23)),
bufPool: base2bufpool.New(15, 22, base2bufpool.Allocate), // FIXME switch to Panic, but need to enforce the limits in recv for that. => need frameconn config
readNext: FrameHeader{},
readNextValid: false,
}
}
var ErrReadFrameLengthShort = errors.New("read frame length too short")
var ErrFixedFrameLengthMismatch = errors.New("read frame length mismatch")
type Buffer struct {
bufpoolBuffer base2bufpool.Buffer
payloadLen uint32
}
func (b *Buffer) Free() {
b.bufpoolBuffer.Free()
}
func (b *Buffer) Bytes() []byte {
return b.bufpoolBuffer.Bytes()[0:b.payloadLen]
}
type Frame struct {
Header FrameHeader
Buffer Buffer
}
var ErrShutdown = fmt.Errorf("frameconn: shutting down")
// ReadFrame reads a frame from the connection.
//
// Due to an internal optimization (Readv, specifically), it is not guaranteed that a single call to
// WriteFrame unblocks a pending ReadFrame on an otherwise idle (empty) connection.
// The only way to guarantee that all previously written frames can reach the peer's layers on top
// of frameconn is to send an empty frame (no payload) and to ignore empty frames on the receiving side.
func (c *Conn) ReadFrame() (Frame, error) {
if c.shutdown.IsShuttingDown() {
return Frame{}, ErrShutdown
}
// only acquire readMtx now to prioritize the draining in Shutdown()
// over external callers (= drain public callers)
c.readMtx.Lock()
defer c.readMtx.Unlock()
f, err := c.readFrame()
if f.Header.Type == rstFrameType {
c.shutdown.Begin()
return Frame{}, ErrShutdown
}
return f, err
}
// callers must have readMtx locked
func (c *Conn) readFrame() (Frame, error) {
if c.nextReadErr != nil {
ret := c.nextReadErr
c.nextReadErr = nil
return Frame{}, ret
}
if !c.readNextValid {
var buf [8]byte
if _, err := io.ReadFull(c.nc, buf[:]); err != nil {
return Frame{}, err
}
c.readNext.Unmarshal(buf[:])
c.readNextValid = true
}
// read payload + next header
var nextHdrBuf [8]byte
buffer := c.bufPool.Get(uint(c.readNext.PayloadLen))
bufferBytes := buffer.Bytes()
if c.readNext.PayloadLen == 0 {
// This if statement implements the unlock-by-sending-empty-frame behavior
// documented in ReadFrame's public docs.
//
// It is crucial that we return this empty frame now:
// Consider the following plot with x-axis being time,
// P being a frame with payload, E one without, X either of P or E
//
// P P P P P P P E.....................X
// | | | |
// | | | F3
// | | |
// | F2 |significant time between frames because
// F1 the peer has nothing to say to us
//
// Assume we're at the point were F2's header is in c.readNext.
// That means F2 has not yet been returned.
// But because it is empty (no payload), we're already done reading it.
// If we omitted this if statement, the following would happen:
// Readv below would read [][]byte{[len(0)], [len(8)]).
c.readNextValid = false
frame := Frame{
Header: c.readNext,
Buffer: Buffer{
bufpoolBuffer: buffer,
payloadLen: c.readNext.PayloadLen, // 0
},
}
return frame, nil
}
noNextHeader := false
if n, err := c.nc.ReadvFull([][]byte{bufferBytes, nextHdrBuf[:]}); err != nil {
noNextHeader = true
zeroPayloadAndPeerClosed := n == 0 && c.readNext.PayloadLen == 0 && err == io.EOF
zeroPayloadAndNextFrameHeaderThenPeerClosed := err == io.EOF && c.readNext.PayloadLen == 0 && n == int64(len(nextHdrBuf))
nonzeroPayloadRecvdButNextHeaderMissing := n > 0 && uint32(n) == c.readNext.PayloadLen
if zeroPayloadAndPeerClosed || zeroPayloadAndNextFrameHeaderThenPeerClosed || nonzeroPayloadRecvdButNextHeaderMissing {
// This is the last frame on the conn.
// Store the error to be returned on the next invocation of ReadFrame.
c.nextReadErr = err
// NORETURN, this frame is still valid
} else {
return Frame{}, err
}
}
frame := Frame{
Header: c.readNext,
Buffer: Buffer{
bufpoolBuffer: buffer,
payloadLen: c.readNext.PayloadLen,
},
}
if !noNextHeader {
c.readNext.Unmarshal(nextHdrBuf[:])
c.readNextValid = true
} else {
c.readNextValid = false
}
return frame, nil
}
func (c *Conn) WriteFrame(payload []byte, frameType uint32) error {
assertPublicFrameType(frameType)
if c.shutdown.IsShuttingDown() {
return ErrShutdown
}
c.writeMtx.Lock()
defer c.writeMtx.Unlock()
return c.writeFrame(payload, frameType)
}
func (c *Conn) writeFrame(payload []byte, frameType uint32) error {
var hdrBuf [8]byte
binary.BigEndian.PutUint32(hdrBuf[0:4], frameType)
binary.BigEndian.PutUint32(hdrBuf[4:8], uint32(len(payload)))
bufs := net.Buffers([][]byte{hdrBuf[:], payload})
if _, err := c.nc.WritevFull(bufs); err != nil {
return err
}
return nil
}
func (c *Conn) ResetWriteTimeout() error {
return c.nc.RenewWriteDeadline()
}
func (c *Conn) Shutdown(deadline time.Time) error {
// TCP connection teardown is a bit wonky if we are in a situation
// where there is still data in flight (DIF) to our side:
// If we just close the connection, our kernel will send RSTs
// in response to the DIF, and those RSTs may reach the client's
// kernel faster than the client app is able to pull the
// last bytes from its kernel TCP receive buffer.
//
// Therefore, we send a frame with type rstFrameType to indicate
// that the connection is to be closed immediately, and further
// use CloseWrite instead of Close.
// As per definition of the wire interface, CloseWrite guarantees
// delivery of the data in our kernel TCP send buffer.
// Therefore, the client always receives the RST frame.
//
// Now what are we going to do after that?
//
// 1. Naive Option: We just call Close() right after CloseWrite.
// This yields the same race condition as explained above (DIF, first
// paragraph): The situation just became a little more unlikely because
// our rstFrameType + CloseWrite dance gave the client a full RTT worth of
// time to read the data from its TCP recv buffer.
//
// 2. Correct Option: Drain the read side until io.EOF
// We can read from the unclosed read-side of the connection until we get
// the io.EOF caused by the (well behaved) client closing the connection
// in response to it reading the rstFrameType frame we sent.
// However, this wastes resources on our side (we don't care about the
// pending data anymore), and has potential for (D)DoS through CPU-time
// exhaustion if the client just keeps sending data.
// Then again, this option has the advantage with well-behaved clients
// that we do not waste precious kernel-memory on the stale receive buffer
// on our side (which is still full of data that we do not intend to read).
//
// 2.1 DoS Mitigation: Bound the number of bytes to drain, then close
// At the time of writing, this technique is practiced by the Go http server
// implementation, and actually SHOULDed in the HTTP 1.1 RFC. It is
// important to disable the idle timeout of the underlying timeoutconn in
// that case and set an absolute deadline by which the socket must have
// been fully drained. Not too hard, though ;)
//
// 2.2: Client sends RST, not FIN when it receives an rstFrameTyp frame.
// We can use wire.(*net.TCPConn).SetLinger(0) to force an RST to be sent
// on a subsequent close (instead of a FIN + wait for FIN+ACK).
// TODO put this into Wire interface as an abstract method.
//
// 2.3 Only start draining after N*RTT
// We have an RTT approximation from Wire.CloseWrite, which by definition
// must not return before all to-be-sent-data has been acknowledged by the
// client. Give the client a fair chance to react, and only start draining
// after a multiple of the RTT has elapsed.
// We waste the recv buffer memory a little longer than necessary, iff the
// client reacts faster than expected. But we don't wast CPU time.
// If we apply 2.2, we'll also have the benefit that our kernel will have
// dropped the recv buffer memory as soon as it receives the client's RST.
//
// 3. TCP-only: OOB-messaging
// We can use TCP's 'urgent' flag in the client to acknowledge the receipt
// of the rstFrameType to us.
// We can thus wait for that signal while leaving the kernel buffer as is.
// TODO: For now, we just drain the connection (Option 2),
// but we enforce deadlines so the _time_ we drain the connection
// is bounded, although we do _that_ at full speed
defer prometheus.NewTimer(prom.ShutdownSeconds).ObserveDuration()
closeWire := func(step string) error {
// TODO SetLinger(0) or similar (we want RST frames here, not FINS)
closeErr := c.nc.Close()
if closeErr == nil {
return nil
}
// TODO go1.13: https://github.com/zrepl/zrepl/issues/190
// https://github.com/golang/go/issues/8319
// (use errors.Is(closeErr, syscall.ECONNRESET))
if pe, ok := closeErr.(*net.OpError); ok && pe.Err == syscall.ECONNRESET {
// connection reset by peer on FreeBSD, see https://github.com/zrepl/zrepl/issues/190
// We know from kernel code reading that the FD behind c.nc is closed, so let's not consider this an error
return nil
}
prom.ShutdownCloseErrors.WithLabelValues("close").Inc()
return closeErr
}
hardclose := func(err error, step string) error {
prom.ShutdownHardCloses.WithLabelValues(step).Inc()
return closeWire(step)
}
c.shutdown.Begin()
// new calls to c.ReadFrame and c.WriteFrame will now return ErrShutdown
// Acquiring writeMtx and readMtx afterwards ensures that already-running calls exit successfully
// disable renewing timeouts now, enforce the requested deadline instead
// we need to do this before acquiring locks to enforce the timeout on slow
// clients / if something hangs (DoS mitigation)
if err := c.nc.DisableTimeouts(); err != nil {
return hardclose(err, "disable_timeouts")
}
if err := c.nc.SetDeadline(deadline); err != nil {
return hardclose(err, "set_deadline")
}
c.writeMtx.Lock()
defer c.writeMtx.Unlock()
if err := c.writeFrame([]byte{}, rstFrameType); err != nil {
return hardclose(err, "write_frame")
}
if err := c.nc.CloseWrite(); err != nil {
return hardclose(err, "close_write")
}
c.readMtx.Lock()
defer c.readMtx.Unlock()
// TODO DoS mitigation: wait for client acknowledgement that they initiated Shutdown,
// then perform abortive close on our side. As explained above, probably requires
// OOB signaling such as TCP's urgent flag => transport-specific?
// TODO DoS mitigation by reading limited number of bytes
// see discussion above why this is non-trivial
defer prometheus.NewTimer(prom.ShutdownDrainSeconds).ObserveDuration()
n, _ := io.Copy(io.Discard, c.nc)
prom.ShutdownDrainBytesRead.Observe(float64(n))
return closeWire("close")
}