From 7bb221389c6701e72397cc9bee3f0ab339945ca8 Mon Sep 17 00:00:00 2001 From: Siddhant Srivastava <32227121+alter-mage@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:14:48 -0700 Subject: [PATCH] fix: separate nrf due to nucleus exit (#1658) --- scripts/loader | 5 ++ .../KernelUpdateDeploymentTask.java | 24 ++++++- .../activator/KernelUpdateActivator.java | 14 +++++ .../KernelUpdateDeploymentTaskTest.java | 63 +++++++++++++++++++ .../activator/KernelUpdateActivatorTest.java | 23 ++++++- 5 files changed, 123 insertions(+), 6 deletions(-) diff --git a/scripts/loader b/scripts/loader index 86b27642d4..f4dcc6c855 100755 --- a/scripts/loader +++ b/scripts/loader @@ -14,6 +14,8 @@ echo "Greengrass root: "${GG_ROOT} LAUNCH_DIR="$GG_ROOT/alts/current" CONFIG_FILE="" +echo "Absolute launch dir: "$(readlink $LAUNCH_DIR) + is_directory_link() { [ -L "$1" ] && [ -d "$1" ] } @@ -100,4 +102,7 @@ if [ $sigterm_received -eq 0 ] && is_directory_link "${GG_ROOT}/alts/old" && is_ flip_link "${GG_ROOT}/alts/old" "${LAUNCH_DIR}" fi +## Touch an empty file to indicate rollback due to unexpected Nucleus exit +touch "${GG_ROOT}/work/aws.greengrass.Nucleus/restart_panic" + exit ${kernel_exit_code} diff --git a/src/main/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTask.java b/src/main/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTask.java index 89fa294f52..10bca7313a 100644 --- a/src/main/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTask.java +++ b/src/main/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTask.java @@ -23,6 +23,7 @@ import com.aws.greengrass.util.Utils; import java.io.IOException; +import java.nio.file.Files; import java.util.Collections; import java.util.List; import java.util.concurrent.CancellationException; @@ -32,12 +33,14 @@ import java.util.stream.Collectors; import static com.aws.greengrass.deployment.DeploymentConfigMerger.DEPLOYMENT_ID_LOG_KEY; +import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ACTIVATION; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.ROLLBACK_BOOTSTRAP; public class KernelUpdateDeploymentTask implements DeploymentTask { + public static final String RESTART_PANIC_FILE_NAME = "restart_panic"; private final Kernel kernel; private final Logger logger; private final Deployment deployment; @@ -149,10 +152,25 @@ private void saveDeploymentStatusDetails(Throwable failureCause) throws IOExcept private DeploymentException getDeploymentStatusDetails() { if (Utils.isEmpty(deployment.getStageDetails())) { - return new DeploymentException( - "Nucleus update workflow failed to restart Nucleus. See loader logs for more details", - DeploymentErrorCode.NUCLEUS_RESTART_FAILURE); + try { + if (Files.deleteIfExists( + kernel.getNucleusPaths().workPath(DEFAULT_NUCLEUS_COMPONENT_NAME) + .resolve(RESTART_PANIC_FILE_NAME).toAbsolutePath())) { + return new DeploymentException( + "Nucleus update workflow failed to restart Nucleus. See loader logs for more details", + DeploymentErrorCode.NUCLEUS_RESTART_FAILURE); + } else { + return new DeploymentException("Nucleus update workflow failed to restart Nucleus due to an " + + "unexpected device IO error", + DeploymentErrorCode.IO_WRITE_ERROR); + } + } catch (IOException e) { + return new DeploymentException("Nucleus update workflow failed to restart Nucleus due to an " + + "unexpected device IO error. See loader logs for more details", e, + DeploymentErrorCode.IO_WRITE_ERROR); + } } + List errorStack = deployment.getErrorStack() == null ? Collections.emptyList() : deployment.getErrorStack().stream().map(DeploymentErrorCode::valueOf).collect(Collectors.toList()); diff --git a/src/main/java/com/aws/greengrass/deployment/activator/KernelUpdateActivator.java b/src/main/java/com/aws/greengrass/deployment/activator/KernelUpdateActivator.java index 18affabefd..07cda7efd7 100644 --- a/src/main/java/com/aws/greengrass/deployment/activator/KernelUpdateActivator.java +++ b/src/main/java/com/aws/greengrass/deployment/activator/KernelUpdateActivator.java @@ -16,10 +16,12 @@ import com.aws.greengrass.lifecyclemanager.KernelAlternatives; import com.aws.greengrass.lifecyclemanager.KernelLifecycle; import com.aws.greengrass.lifecyclemanager.exceptions.DirectoryValidationException; +import com.aws.greengrass.util.NucleusPaths; import com.aws.greengrass.util.Pair; import com.aws.greengrass.util.Utils; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.util.List; import java.util.Map; @@ -28,6 +30,8 @@ import static com.aws.greengrass.deployment.DeploymentConfigMerger.DEPLOYMENT_ID_LOG_KEY; import static com.aws.greengrass.deployment.DeploymentConfigMerger.MERGE_CONFIG_EVENT_KEY; +import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME; +import static com.aws.greengrass.deployment.KernelUpdateDeploymentTask.RESTART_PANIC_FILE_NAME; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_REBOOT; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK; @@ -39,6 +43,7 @@ public class KernelUpdateActivator extends DeploymentActivator { private final BootstrapManager bootstrapManager; private final KernelAlternatives kernelAlternatives; + private final NucleusPaths nucleusPaths; /** * Constructor of KernelUpdateActivator. @@ -51,6 +56,7 @@ public KernelUpdateActivator(Kernel kernel, BootstrapManager bootstrapManager) { super(kernel); this.bootstrapManager = bootstrapManager; this.kernelAlternatives = kernel.getContext().get(KernelAlternatives.class); + this.nucleusPaths = kernel.getNucleusPaths(); } @Override @@ -81,6 +87,14 @@ public void activate(Map newConfig, Deployment deployment, updateConfiguration(deploymentDocument.getTimestamp(), newConfig); + // Try and delete restart panic file if it exists + try { + Files.deleteIfExists(nucleusPaths.workPath(DEFAULT_NUCLEUS_COMPONENT_NAME) + .resolve(RESTART_PANIC_FILE_NAME).toAbsolutePath()); + } catch (IOException e) { + logger.atWarn().log("Unable to delete an existing restart panic file", e); + } + Path bootstrapTaskFilePath; try { bootstrapTaskFilePath = deploymentDirectoryManager.getBootstrapTaskFilePath(); diff --git a/src/test/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTaskTest.java b/src/test/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTaskTest.java index 17505cfb72..b702fc9772 100644 --- a/src/test/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTaskTest.java +++ b/src/test/java/com/aws/greengrass/deployment/KernelUpdateDeploymentTaskTest.java @@ -19,6 +19,7 @@ import com.aws.greengrass.logging.api.Logger; import com.aws.greengrass.logging.impl.LogManager; import com.aws.greengrass.testcommons.testutilities.GGExtension; +import com.aws.greengrass.util.NucleusPaths; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -28,6 +29,9 @@ import org.mockito.junit.jupiter.MockitoExtension; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Arrays; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -37,7 +41,11 @@ import static com.aws.greengrass.dependency.State.FINISHED; import static com.aws.greengrass.dependency.State.RUNNING; import static com.aws.greengrass.dependency.State.STARTING; +import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME; +import static com.aws.greengrass.deployment.KernelUpdateDeploymentTask.RESTART_PANIC_FILE_NAME; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART; +import static com.aws.greengrass.deployment.errorcode.DeploymentErrorCode.IO_WRITE_ERROR; +import static com.aws.greengrass.deployment.errorcode.DeploymentErrorCode.NUCLEUS_RESTART_FAILURE; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ACTIVATION; import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK; import static com.aws.greengrass.testcommons.testutilities.ExceptionLogProtector.ignoreExceptionOfType; @@ -74,6 +82,8 @@ class KernelUpdateDeploymentTaskTest { GreengrassService mainService; @Mock ComponentManager componentManager; + @Mock + NucleusPaths nucleusPaths; ExecutorService executorService; KernelUpdateDeploymentTask task; @@ -88,6 +98,7 @@ void beforeEach() throws Exception { lenient().doReturn(true).when(greengrassService).shouldAutoStart(); lenient().doReturn(Arrays.asList(greengrassService)).when(kernel).orderedDependencies(); lenient().doNothing().when(componentManager).cleanupStaleVersions(); + lenient().doReturn(nucleusPaths).when(kernel).getNucleusPaths(); Topic topic = mock(Topic.class); lenient().doReturn(1L).when(topic).getModtime(); @@ -159,6 +170,58 @@ void GIVEN_deployment_rollback_WHEN_service_broken_THEN_rollback_fails(Extension assertEquals("mock activate error", result.getFailureCause().getMessage()); } + @Test + void Given_deployment_rollback_WHEN_stage_details_absent_THEN_rollback_succeeds_with_io_error() throws IOException { + doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage(); + doReturn(FINISHED).when(greengrassService).getState(); + doReturn(true).when(greengrassService).reachedDesiredState(); + doReturn(null).when(deployment).getStageDetails(); + doReturn(Paths.get("")).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); + + DeploymentResult result = task.call(); + assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE, result.getDeploymentStatus()); + assertThat(result.getFailureCause(), isA(DeploymentException.class)); + assertEquals("Nucleus update workflow failed to restart Nucleus due to an unexpected device IO error", + result.getFailureCause().getMessage()); + assertEquals(IO_WRITE_ERROR, ((DeploymentException) result.getFailureCause()).getErrorCodes().get(0)); + } + + @Test + void Given_deployment_rollback_WHEN_panic_file_detected_THEN_rollback_succeeds_with_nucleus_restart_failure() throws IOException { + doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage(); + doReturn(FINISHED).when(greengrassService).getState(); + doReturn(true).when(greengrassService).reachedDesiredState(); + doReturn(null).when(deployment).getStageDetails(); + Path panicScriptPath = Paths.get("").resolve(RESTART_PANIC_FILE_NAME); + Files.createFile(panicScriptPath.toAbsolutePath()); + doReturn(Paths.get("")).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); + + DeploymentResult result = task.call(); + assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE, result.getDeploymentStatus()); + assertThat(result.getFailureCause(), isA(DeploymentException.class)); + assertEquals("Nucleus update workflow failed to restart Nucleus. See loader logs for more details", + result.getFailureCause().getMessage()); + assertEquals(NUCLEUS_RESTART_FAILURE, ((DeploymentException) result.getFailureCause()).getErrorCodes().get(0)); + Files.deleteIfExists(panicScriptPath); + } + + @Test + void Given_deployment_rollback_WHEN_io_exception_when_resolving_path_THEN_rollback_succeeds_with_io_error() throws IOException { + doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage(); + doReturn(FINISHED).when(greengrassService).getState(); + doReturn(true).when(greengrassService).reachedDesiredState(); + doReturn(null).when(deployment).getStageDetails(); + doThrow(new IOException("mock io exception")).when(nucleusPaths).workPath(DEFAULT_NUCLEUS_COMPONENT_NAME); + + DeploymentResult result = task.call(); + assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE, result.getDeploymentStatus()); + assertThat(result.getFailureCause(), isA(DeploymentException.class)); + assertEquals("Nucleus update workflow failed to restart Nucleus due to an unexpected device IO error. See loader logs for more details", + result.getFailureCause().getMessage()); + assertEquals(IO_WRITE_ERROR, ((DeploymentException) result.getFailureCause()).getErrorCodes().get(0)); + assertEquals("mock io exception", result.getFailureCause().getCause().getMessage()); + } + @Test void GIVEN_deployment_rollback_WHEN_service_healthy_THEN_rollback_succeeds() throws Exception { doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage(); diff --git a/src/test/java/com/aws/greengrass/deployment/activator/KernelUpdateActivatorTest.java b/src/test/java/com/aws/greengrass/deployment/activator/KernelUpdateActivatorTest.java index d340fd8b69..3922d5e79f 100644 --- a/src/test/java/com/aws/greengrass/deployment/activator/KernelUpdateActivatorTest.java +++ b/src/test/java/com/aws/greengrass/deployment/activator/KernelUpdateActivatorTest.java @@ -23,6 +23,7 @@ import com.aws.greengrass.lifecyclemanager.exceptions.DirectoryValidationException; import com.aws.greengrass.testcommons.testutilities.GGExtension; import com.aws.greengrass.testcommons.testutilities.TestUtils; +import com.aws.greengrass.util.NucleusPaths; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -39,6 +40,7 @@ import java.util.Map; import java.util.concurrent.CompletableFuture; +import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.NO_OP; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_REBOOT; import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART; @@ -72,6 +74,8 @@ class KernelUpdateActivatorTest { @Mock KernelAlternatives kernelAlternatives; @Mock + NucleusPaths nucleusPaths; + @Mock CompletableFuture totallyCompleteFuture; @Mock Deployment deployment; @@ -85,9 +89,10 @@ class KernelUpdateActivatorTest { KernelUpdateActivator kernelUpdateActivator; @BeforeEach - void beforeEach() { + void beforeEach() throws IOException { doReturn(deploymentDirectoryManager).when(context).get(eq(DeploymentDirectoryManager.class)); doReturn(kernelAlternatives).when(context).get(eq(KernelAlternatives.class)); + doReturn(nucleusPaths).when(kernel).getNucleusPaths(); doReturn(context).when(kernel).getContext(); lenient().doReturn(config).when(kernel).getConfig(); kernelUpdateActivator = new KernelUpdateActivator(kernel, bootstrapManager); @@ -120,6 +125,8 @@ void GIVEN_deployment_activate_WHEN_prepareBootstrap_fails_THEN_deployment_rollb doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath(); Path targetConfigFilePath = mock(Path.class); doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath(); + IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE"); + doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); IOException mockIOE = new IOException("mock error"); doThrow(mockIOE).when(kernelAlternatives).prepareBootstrap(eq("testId")); doThrow(new IOException()).when(deploymentDirectoryManager).writeDeploymentMetadata(eq(deployment)); @@ -146,6 +153,8 @@ void GIVEN_deployment_activate_WHEN_bootstrap_task_fails_THEN_deployment_rollbac doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath(); ServiceUpdateException mockSUE = new ServiceUpdateException("mock error", DeploymentErrorCode.COMPONENT_BOOTSTRAP_ERROR, DeploymentErrorType.USER_COMPONENT_ERROR); + IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE"); + doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); doThrow(mockSUE).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath)); doThrow(new IOException()).when(kernelAlternatives).prepareRollback(); @@ -163,10 +172,14 @@ void GIVEN_deployment_activate_WHEN_bootstrap_task_fails_THEN_deployment_rollbac } @Test - void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart() throws Exception { + void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart(ExtensionContext context) throws Exception { + ignoreExceptionOfType(context, IOException.class); + Path bootstrapFilePath = mock(Path.class); doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath(); Path targetConfigFilePath = mock(Path.class); + IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE"); + doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath(); doReturn(NO_OP).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath)); doReturn(false).when(bootstrapManager).hasNext(); @@ -179,11 +192,15 @@ void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart() th } @Test - void GIVEN_deployment_activate_WHEN_bootstrap_requires_reboot_THEN_request_reboot() throws Exception { + void GIVEN_deployment_activate_WHEN_bootstrap_requires_reboot_THEN_request_reboot(ExtensionContext context) throws Exception { + ignoreExceptionOfType(context, IOException.class); + Path bootstrapFilePath = mock(Path.class); doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath(); Path targetConfigFilePath = mock(Path.class); doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath(); + IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE"); + doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME)); doReturn(REQUEST_REBOOT).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath)); doReturn(true).when(bootstrapManager).hasNext();