Skip to content

Commit

Permalink
Make Agent Register request asynchronous
Browse files Browse the repository at this point in the history
The "Register" agent->controller method call could get stuck
when the IP address configured as ControllerHost does not exist.
The resulting timeout of about 2 seconds completely occupied the event
loop and prevented other calls on the Agent's D-Bus API from being handled.

By making the "Register" method call asynchronous, we keep the event
loop unoccupied so that it is able to accept SwitchController
requests.

Signed-off-by: Mark Kemel <[email protected]>
  • Loading branch information
mkemel committed Oct 15, 2024
1 parent 08745e4 commit 08c0daf
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 46 deletions.
131 changes: 87 additions & 44 deletions src/agent/agent.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,13 @@ static int agent_heartbeat_timer_callback(sd_event_source *event_source, UNUSED
Agent *agent = (Agent *) userdata;

int r = 0;
if (agent->connection_state == AGENT_CONNECTION_STATE_CONNECTING) {
bc_log_error("Agent connection attempt failed, retrying");
if (agent->register_call_slot != NULL) {
sd_bus_slot_unrefp(&agent->register_call_slot);
}
agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
}
if (agent->connection_state == AGENT_CONNECTION_STATE_CONNECTED &&
agent_check_controller_liveness(agent)) {
r = sd_bus_emit_signal(
Expand Down Expand Up @@ -532,6 +539,9 @@ void agent_unref(Agent *agent) {
sd_event_unrefp(&agent->event);
}

if (agent->register_call_slot != NULL) {
sd_bus_slot_unrefp(&agent->register_call_slot);
}
if (agent->metrics_slot != NULL) {
sd_bus_slot_unrefp(&agent->metrics_slot);
}
Expand Down Expand Up @@ -2684,53 +2694,19 @@ void agent_stop(Agent *agent) {
}
}

static bool agent_connect(Agent *agent) {
peer_bus_close(agent->peer_dbus);

bc_log_infof("Connecting to controller on %s", agent->orch_addr);

agent->peer_dbus = peer_bus_open(agent->event, "peer-bus-to-controller", agent->orch_addr);
if (agent->peer_dbus == NULL) {
bc_log_error("Failed to open peer dbus");
return false;
}

bus_socket_set_options(agent->peer_dbus, agent->peer_socket_options);

int r = sd_bus_add_object_vtable(
agent->peer_dbus,
NULL,
INTERNAL_AGENT_OBJECT_PATH,
INTERNAL_AGENT_INTERFACE,
internal_agent_vtable,
agent);
if (r < 0) {
bc_log_errorf("Failed to add agent vtable: %s", strerror(-r));
return false;
}
static int agent_process_register_callback(sd_bus_message *m, Agent *agent) {
int r = 0;

_cleanup_sd_bus_message_ sd_bus_message *bus_msg = NULL;
sd_bus_error error = SD_BUS_ERROR_NULL;
r = sd_bus_call_method(
agent->peer_dbus,
BC_DBUS_NAME,
INTERNAL_CONTROLLER_OBJECT_PATH,
INTERNAL_CONTROLLER_INTERFACE,
"Register",
&error,
&bus_msg,
"s",
agent->name);
if (r < 0) {
bc_log_errorf("Registering as '%s' failed: %s", agent->name, error.message);
sd_bus_error_free(&error);
return false;
if (sd_bus_message_is_method_error(m, NULL)) {
bc_log_errorf("Registering as '%s' failed: %s", agent->name, sd_bus_message_get_error(m)->message);
return -EPERM;
}

r = sd_bus_message_read(bus_msg, "");
bc_log_info("Register call response received");
r = sd_bus_message_read(m, "");
if (r < 0) {
bc_log_errorf("Failed to parse response message: %s", strerror(-r));
return false;
return r;
}

bc_log_infof("Connected to controller as '%s'", agent->name);
Expand Down Expand Up @@ -2759,7 +2735,7 @@ static bool agent_connect(Agent *agent) {
agent);
if (r < 0) {
bc_log_errorf("Failed to add heartbeat signal match: %s", strerror(-r));
return false;
return r;
}

r = sd_bus_match_signal_async(
Expand All @@ -2774,7 +2750,7 @@ static bool agent_connect(Agent *agent) {
agent);
if (r < 0) {
bc_log_errorf("Failed to request match for Disconnected signal: %s", strerror(-r));
return false;
return r;
}

/* re-emit ProxyNew signals */
Expand All @@ -2791,6 +2767,73 @@ static bool agent_connect(Agent *agent) {
}
}

return 0;
}

/*
 * Reply handler installed by agent_connect() for the asynchronous "Register"
 * method call.
 *
 * m:         reply (or method error) message for the Register call
 * userdata:  the Agent that issued the call
 * ret_error: unused
 *
 * Returns 0 when the reply was processed successfully. Returns -EPERM when
 * the agent is not in the CONNECTING state, in which case the reply is
 * dropped. Otherwise returns the negative value reported by
 * agent_process_register_callback() after switching the agent to the RETRY
 * state.
 */
static int agent_register_callback(sd_bus_message *m, void *userdata, UNUSED sd_bus_error *ret_error) {
        Agent *agent = (Agent *) userdata;

        if (agent->connection_state != AGENT_CONNECTION_STATE_CONNECTING) {
                bc_log_error("Agent is not in CONNECTING state, dropping Register callback");
                if (agent->register_call_slot != NULL) {
                        sd_bus_slot_unrefp(&agent->register_call_slot);
                        /*
                         * sd_bus_slot_unrefp() only drops the reference; it does not
                         * clear the pointer. Reset it so that later NULL checks on
                         * register_call_slot do not unref a stale slot again.
                         */
                        agent->register_call_slot = NULL;
                }
                return -EPERM;
        }

        int r = agent_process_register_callback(m, agent);
        if (r < 0) {
                /* Mark the attempt as failed so the connection is retried. */
                agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
                return r;
        }

        return 0;
}

/*
 * Starts a connection attempt to the controller.
 *
 * Closes the current peer bus, marks the agent as CONNECTING, opens a new
 * peer bus to agent->orch_addr, registers the internal agent vtable on it and
 * queues an asynchronous "Register" method call. The reply is delivered to
 * agent_register_callback(), so this function does not wait for the
 * controller to answer.
 *
 * Returns true when the Register request was queued, false when opening the
 * peer bus, adding the vtable, or issuing the call failed.
 */
static bool agent_connect(Agent *agent) {
        peer_bus_close(agent->peer_dbus);

        bc_log_infof("Connecting to controller on %s", agent->orch_addr);
        agent->connection_state = AGENT_CONNECTION_STATE_CONNECTING;

        agent->peer_dbus = peer_bus_open(agent->event, "peer-bus-to-controller", agent->orch_addr);
        if (agent->peer_dbus == NULL) {
                bc_log_error("Failed to open peer dbus");
                return false;
        }

        bus_socket_set_options(agent->peer_dbus, agent->peer_socket_options);

        int r = sd_bus_add_object_vtable(
                        agent->peer_dbus,
                        NULL,
                        INTERNAL_AGENT_OBJECT_PATH,
                        INTERNAL_AGENT_INTERFACE,
                        internal_agent_vtable,
                        agent);
        if (r < 0) {
                bc_log_errorf("Failed to add agent vtable: %s", strerror(-r));
                return false;
        }

        r = sd_bus_call_method_async(
                        agent->peer_dbus,
                        NULL, // No slot needed since peer_dbus is closed on reconnect
                        BC_DBUS_NAME,
                        INTERNAL_CONTROLLER_OBJECT_PATH,
                        INTERNAL_CONTROLLER_INTERFACE,
                        "Register",
                        agent_register_callback,
                        agent,
                        "s",
                        agent->name);
        if (r < 0) {
                /*
                 * The async call reports failure only through its return value;
                 * there is no sd_bus_error to read a message from, so log the
                 * errno-style code instead.
                 */
                bc_log_errorf("Registering as '%s' failed: %s", agent->name, strerror(-r));
                return false;
        }

        return true;
}

Expand Down
6 changes: 4 additions & 2 deletions src/agent/agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ typedef struct JobTracker JobTracker;
typedef enum {
AGENT_CONNECTION_STATE_DISCONNECTED,
AGENT_CONNECTION_STATE_CONNECTED,
AGENT_CONNECTION_STATE_RETRY
AGENT_CONNECTION_STATE_RETRY,
AGENT_CONNECTION_STATE_CONNECTING
} AgentConnectionState;

struct Agent {
Expand All @@ -61,7 +62,7 @@ struct Agent {
char *controller_address;
long heartbeat_interval_msec;
long controller_heartbeat_threshold_msec;

AgentConnectionState connection_state;
uint64_t connection_retry_count;
uint64_t controller_last_seen;
Expand All @@ -82,6 +83,7 @@ struct Agent {
sd_bus *systemd_dbus;
sd_bus *peer_dbus;

sd_bus_slot *register_call_slot;
sd_bus_slot *metrics_slot;

LIST_HEAD(SystemdRequest, outstanding_requests);
Expand Down
1 change: 1 addition & 0 deletions tests/bluechi_test/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def serialize(self) -> str:
ControllerAddress={self.controller_address}
HeartbeatInterval={self.heartbeat_interval}
ControllerHeartbeatThreshold={self.controller_heartbeat_threshold}
RegisterCallTimeout={self.register_call_timeout}
LogLevel={self.log_level}
LogTarget={self.log_target}
LogIsQuiet={self.log_is_quiet}
Expand Down

0 comments on commit 08c0daf

Please sign in to comment.