diff --git a/agent/acs/session/task_stop_verification_ack_responder_integ_test.go b/agent/acs/session/task_stop_verification_ack_responder_integ_test.go index 408c87aeedb..1d82f056bf2 100644 --- a/agent/acs/session/task_stop_verification_ack_responder_integ_test.go +++ b/agent/acs/session/task_stop_verification_ack_responder_integ_test.go @@ -82,16 +82,19 @@ func TestTaskStopVerificationACKResponder_StopsSpecificTasks(t *testing.T) { taskEngine, done, dockerClient, _ := engine.SetupIntegTestTaskEngine(engine.DefaultTestConfigIntegTest(), nil, t) defer done() + testEvents := engine.InitTestEventCollection(taskEngine) + var tasks []*apitask.Task for i := 0; i < 3; i++ { task := engine.CreateTestTask(fmt.Sprintf("test_task_%d", i)) createLongRunningContainers(task, 1) go taskEngine.AddTask(task) - engine.VerifyContainerManifestPulledStateChange(t, taskEngine) - engine.VerifyTaskManifestPulledStateChange(t, taskEngine) - engine.VerifyContainerRunningStateChange(t, taskEngine) - engine.VerifyTaskRunningStateChange(t, taskEngine) + containerName := task.Arn + ":" + task.Containers[0].Name + engine.VerifyContainerStatus(apicontainerstatus.ContainerManifestPulled, containerName, testEvents, t) + engine.VerifyTaskStatus(apitaskstatus.TaskManifestPulled, task.Arn, testEvents, t) + engine.VerifyContainerStatus(apicontainerstatus.ContainerRunning, containerName, testEvents, t) + engine.VerifyTaskStatus(apitaskstatus.TaskRunning, task.Arn, testEvents, t) tasks = append(tasks, task) } @@ -113,9 +116,10 @@ func TestTaskStopVerificationACKResponder_StopsSpecificTasks(t *testing.T) { }) // Wait for all state changes before verifying container and task statuses. - for i := 0; i < 2; i++ { - engine.VerifyContainerStoppedStateChange(t, taskEngine) - engine.VerifyTaskStoppedStateChange(t, taskEngine) + for _, task := range tasks[1:] { + containerName := task.Arn + ":" + task.Containers[0].Name + engine.VerifyContainerStatus(apicontainerstatus.ContainerStopped, containerName, testEvents, t) + engine.VerifyTaskStatus(apitaskstatus.TaskStopped, task.Arn, testEvents, t) } // Verify that the last 2 tasks and their containers have stopped. diff --git a/agent/dockerclient/dockerapi/docker_client.go b/agent/dockerclient/dockerapi/docker_client.go index 3e276a18652..d97c69208fb 100644 --- a/agent/dockerclient/dockerapi/docker_client.go +++ b/agent/dockerclient/dockerapi/docker_client.go @@ -84,6 +84,20 @@ const ( // output will be suppressed in debug mode pullStatusSuppressDelay = 2 * time.Second + // Retry settings for pulling manifests. + // + // First few retries are quick (starting with 10ms) but the backoff increases + // fast (with a multiplier of 3 capping at 5s). This is to help setups that depend on + // network pause container for communicating to image repositories which require the pause + // container to be initialized before it is ready to serve requests. + // A proper long term solution is for the pause container to have a health check and Agent to + // wait for it to become healthy but until then we are relying on this retry strategy. + maximumManifestPullRetries = 9 + minimumManifestPullRetryDelay = 10 * time.Millisecond + maximumManifestPullRetryDelay = 5 * time.Second + manifestPullRetryDelayMultiplier = 3 + manifestPullRetryJitterMultiplier = 0.2 + // retry settings for pulling images maximumPullRetries = 5 minimumPullRetryDelay = 1100 * time.Millisecond @@ -325,8 +339,8 @@ func NewDockerGoClient(sdkclientFactory sdkclientfactory.Factory, context: ctx, imagePullBackoff: retry.NewExponentialBackoff(minimumPullRetryDelay, maximumPullRetryDelay, pullRetryJitterMultiplier, pullRetryDelayMultiplier), - manifestPullBackoff: retry.NewExponentialBackoff(minimumPullRetryDelay, maximumPullRetryDelay, - pullRetryJitterMultiplier, pullRetryDelayMultiplier), + manifestPullBackoff: retry.NewExponentialBackoff(minimumManifestPullRetryDelay, + maximumManifestPullRetryDelay, manifestPullRetryJitterMultiplier, manifestPullRetryDelayMultiplier), imageTagBackoff: retry.NewConstantBackoff(tagImageRetryInterval), inactivityTimeoutHandler: handleInactivityTimeout, }, nil @@ -372,7 +386,7 @@ func (dg *dockerGoClient) PullImageManifest( // Call DistributionInspect API with retries startTime := time.Now() var distInspectPtr *registry.DistributionInspect - err = retry.RetryNWithBackoffCtx(ctx, dg.manifestPullBackoff, maximumPullRetries, func() error { + err = retry.RetryNWithBackoffCtx(ctx, dg.manifestPullBackoff, maximumManifestPullRetries, func() error { distInspect, err := client.DistributionInspect(ctx, imageRef, encodedAuth) if err != nil { return err diff --git a/agent/dockerclient/dockerapi/docker_client_test.go b/agent/dockerclient/dockerapi/docker_client_test.go index ad269abf553..a8b217495ac 100644 --- a/agent/dockerclient/dockerapi/docker_client_test.go +++ b/agent/dockerclient/dockerapi/docker_client_test.go @@ -351,7 +351,7 @@ func TestPullImageManifest(t *testing.T) { client.EXPECT(). DistributionInspect( gomock.Any(), "image", base64.URLEncoding.EncodeToString([]byte("{}"))). - Times(maximumPullRetries). + Times(maximumManifestPullRetries). Return( registry.DistributionInspect{}, errors.New("Some error for https://prod-us-east-1-starport-layer-bucket.s3.us-east-1.amazonaws.com")) diff --git a/agent/engine/common_integ_testutil.go b/agent/engine/common_integ_testutil.go index 86a6b57394f..949f96ec75f 100644 --- a/agent/engine/common_integ_testutil.go +++ b/agent/engine/common_integ_testutil.go @@ -355,10 +355,10 @@ func InitTestEventCollection(taskEngine TaskEngine) *TestEvents { // This method queries the TestEvents struct to check a Task Status. // This method will block if there are no more stateChangeEvents from the DockerTaskEngine but is expected -func VerifyTaskStatus(status apitaskstatus.TaskStatus, taskARN string, testEvents *TestEvents, t *testing.T) error { +func VerifyTaskStatus(status apitaskstatus.TaskStatus, taskARN string, testEvents *TestEvents, t *testing.T) { for { if _, found := testEvents.RecordedEvents[statechange.TaskEvent][status.String()][taskARN]; found { - return nil + return } event := <-testEvents.StateChangeEvents RecordTestEvent(testEvents, event) @@ -367,10 +367,10 @@ func VerifyTaskStatus(status apitaskstatus.TaskStatus, taskARN string, testEvent // This method queries the TestEvents struct to check a Task Status. // This method will block if there are no more stateChangeEvents from the DockerTaskEngine but is expected -func VerifyContainerStatus(status apicontainerstatus.ContainerStatus, ARNcontName string, testEvents *TestEvents, t *testing.T) error { +func VerifyContainerStatus(status apicontainerstatus.ContainerStatus, ARNcontName string, testEvents *TestEvents, t *testing.T) { for { if _, found := testEvents.RecordedEvents[statechange.ContainerEvent][status.String()][ARNcontName]; found { - return nil + return } event := <-testEvents.StateChangeEvents RecordTestEvent(testEvents, event) diff --git a/agent/engine/engine_integ_test.go b/agent/engine/engine_integ_test.go index cbc95956090..f291c2836a6 100644 --- a/agent/engine/engine_integ_test.go +++ b/agent/engine/engine_integ_test.go @@ -620,7 +620,7 @@ func TestManifestPulledDoesNotDependOnContainerOrdering(t *testing.T) { defer done() first := createTestContainerWithImageAndName(testRegistryImage, "first") - first.Command = []string{"sh", "-c", "sleep 60"} + first.Command = GetLongRunningCommand() second := createTestContainerWithImageAndName(testRegistryImage, "second") second.SetDependsOn([]apicontainer.DependsOn{ diff --git a/agent/engine/engine_sudo_linux_integ_test.go b/agent/engine/engine_sudo_linux_integ_test.go index 0128ec29f82..7f8456b9f41 100644 --- a/agent/engine/engine_sudo_linux_integ_test.go +++ b/agent/engine/engine_sudo_linux_integ_test.go @@ -228,28 +228,22 @@ func TestFirelensFluentbit(t *testing.T) { testEvents := InitTestEventCollection(taskEngine) //Verify logsender container is running - err = VerifyContainerStatus(apicontainerstatus.ContainerRunning, testTask.Arn+":logsender", testEvents, t) - assert.NoError(t, err, "Verify logsender container is running") + VerifyContainerStatus(apicontainerstatus.ContainerRunning, testTask.Arn+":logsender", testEvents, t) //Verify firelens container is running - err = VerifyContainerStatus(apicontainerstatus.ContainerRunning, testTask.Arn+":firelens", testEvents, t) - assert.NoError(t, err, "Verify firelens container is running") + VerifyContainerStatus(apicontainerstatus.ContainerRunning, testTask.Arn+":firelens", testEvents, t) //Verify task is in running state - err = VerifyTaskStatus(apitaskstatus.TaskRunning, testTask.Arn, testEvents, t) - assert.NoError(t, err, "Not verified task running") + VerifyTaskStatus(apitaskstatus.TaskRunning, testTask.Arn, testEvents, t) //Verify logsender container is stopped - err = VerifyContainerStatus(apicontainerstatus.ContainerStopped, testTask.Arn+":logsender", testEvents, t) - assert.NoError(t, err) + VerifyContainerStatus(apicontainerstatus.ContainerStopped, testTask.Arn+":logsender", testEvents, t) //Verify firelens container is stopped - err = VerifyContainerStatus(apicontainerstatus.ContainerStopped, testTask.Arn+":firelens", testEvents, t) - assert.NoError(t, err) + VerifyContainerStatus(apicontainerstatus.ContainerStopped, testTask.Arn+":firelens", testEvents, t) //Verify the task itself has stopped - err = VerifyTaskStatus(apitaskstatus.TaskStopped, testTask.Arn, testEvents, t) - assert.NoError(t, err) + VerifyTaskStatus(apitaskstatus.TaskStopped, testTask.Arn, testEvents, t) taskID := testTask.GetID()