Skip to content

Commit

Permalink
Additional keepalive diagnostics (#1692)
Browse files Browse the repository at this point in the history
Motivation:

It's useful to know the state of keepalive when debugging connection
issues. gRPC doesn't emit any logs around this at the moment which makes
debugging difficult.

Modifications:

Add additional logs to the idle handler when:
- the idle timeout task is scheduled, cancelled, and fires
- the keepalive timer is scheduled
- the scheduled close timer is fired
- the connection is closed
- GOAWAY frames are sent (already logs on receive)
- PING frames are sent and received

Result:

Better visibility into connection lifecycle
  • Loading branch information
glbrntt authored Nov 1, 2023
1 parent 02ff057 commit 663a852
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 15 deletions.
60 changes: 45 additions & 15 deletions Sources/GRPC/GRPCIdleHandler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -98,19 +98,6 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
)
}

private func sendGoAway(lastStreamID streamID: HTTP2StreamID) {
guard let context = self.context else {
return
}

let frame = HTTP2Frame(
streamID: .rootStream,
payload: .goAway(lastStreamID: streamID, errorCode: .noError, opaqueData: nil)
)

context.writeAndFlush(self.wrapOutboundOut(frame), promise: nil)
}

private func perform(operations: GRPCIdleHandlerStateMachine.Operations) {
// Prod the connection manager.
if let event = operations.connectionManagerEvent, let manager = self.mode.connectionManager {
Expand All @@ -137,11 +124,17 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
if let idleTask = operations.idleTask {
switch idleTask {
case let .cancel(task):
self.stateMachine.logger.debug("idle timeout task cancelled")
task.cancel()

case .schedule:
if self.idleTimeout != .nanoseconds(.max), let context = self.context {
self.stateMachine.logger.debug(
"scheduling idle timeout task",
metadata: [MetadataKey.delayMs: "\(self.idleTimeout.milliseconds)"]
)
let task = context.eventLoop.scheduleTask(in: self.idleTimeout) {
self.stateMachine.logger.debug("idle timeout task fired")
self.idleTimeoutFired()
}
self.perform(operations: self.stateMachine.scheduledIdleTimeoutTask(task))
Expand All @@ -151,6 +144,13 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {

// Send a GOAWAY frame.
if let streamID = operations.sendGoAwayWithLastPeerInitiatedStreamID {
self.stateMachine.logger.debug(
"sending GOAWAY frame",
metadata: [
MetadataKey.h2GoAwayLastStreamID: "\(Int(streamID))"
]
)

let goAwayFrame = HTTP2Frame(
streamID: .rootStream,
payload: .goAway(lastStreamID: streamID, errorCode: .noError, opaqueData: nil)
Expand All @@ -175,6 +175,7 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
// Close on the next event-loop tick so we don't drop any events which are
// currently being processed.
context.eventLoop.execute {
self.stateMachine.logger.debug("closing connection")
context.close(mode: .all, promise: nil)
}
}
Expand All @@ -186,8 +187,12 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
()

case .ack:
// NIO's HTTP2 handler acks for us so this is a no-op.
()
// NIO's HTTP2 handler acks for us so this is a no-op. Log so it doesn't appear that we are
// ignoring pings.
self.stateMachine.logger.debug(
"sending PING frame",
metadata: [MetadataKey.h2PingAck: "true"]
)

case .cancelScheduledTimeout:
self.scheduledClose?.cancel()
Expand All @@ -197,6 +202,15 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
self.schedulePing(in: delay, timeout: timeout)

case let .reply(framePayload):
switch framePayload {
case .ping(_, let ack):
self.stateMachine.logger.debug(
"sending PING frame",
metadata: [MetadataKey.h2PingAck: "\(ack)"]
)
default:
()
}
let frame = HTTP2Frame(streamID: .rootStream, payload: framePayload)
self.context?.writeAndFlush(self.wrapOutboundOut(frame), promise: nil)

Expand All @@ -210,6 +224,11 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
return
}

self.stateMachine.logger.debug(
"scheduled keepalive pings",
metadata: [MetadataKey.intervalMs: "\(delay.milliseconds)"]
)

self.scheduledPing = self.context?.eventLoop.scheduleRepeatedTask(
initialDelay: delay,
delay: delay
Expand All @@ -226,6 +245,7 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {

private func scheduleClose(in timeout: TimeAmount) {
self.scheduledClose = self.context?.eventLoop.scheduleTask(in: timeout) {
self.stateMachine.logger.debug("keepalive timer expired")
self.perform(operations: self.stateMachine.shutdownNow())
}
}
Expand Down Expand Up @@ -318,6 +338,10 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
case let .settings(.settings(settings)):
self.perform(operations: self.stateMachine.receiveSettings(settings))
case let .ping(data, ack):
self.stateMachine.logger.debug(
"received PING frame",
metadata: [MetadataKey.h2PingAck: "\(ack)"]
)
self.handlePingAction(self.pingHandler.read(pingData: data, ack: ack))
default:
// We're not interested in other events.
Expand Down Expand Up @@ -350,3 +374,9 @@ extension HTTP2SettingsParameter {
}
}
}

extension TimeAmount {
fileprivate var milliseconds: Int64 {
self.nanoseconds / 1_000_000
}
}
4 changes: 4 additions & 0 deletions Sources/GRPC/Logger.swift
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ enum MetadataKey {
static let h2DataBytes = "h2_data_bytes"
static let h2GoAwayError = "h2_goaway_error"
static let h2GoAwayLastStreamID = "h2_goaway_last_stream_id"
static let h2PingAck = "h2_ping_ack"

static let delayMs = "delay_ms"
static let intervalMs = "interval_ms"

static let error = "error"
}
Expand Down

0 comments on commit 663a852

Please sign in to comment.