diff --git a/charts/gha-runner-scale-set-experimental/templates/_mode_kubernetes.tpl b/charts/gha-runner-scale-set-experimental/templates/_mode_kubernetes.tpl index 6589d01d..4801a63f 100644 --- a/charts/gha-runner-scale-set-experimental/templates/_mode_kubernetes.tpl +++ b/charts/gha-runner-scale-set-experimental/templates/_mode_kubernetes.tpl @@ -1,6 +1,8 @@ {{- define "runner-mode-kubernetes.runner-container" -}} {{- $runner := (.Values.runner | default dict) -}} {{- $kubeMode := (index $runner "kubernetesMode" | default dict) -}} +{{- $runnerContainer := (index $runner "container" | default dict) -}} +{{- $runnerContainerVolumeMounts := (index $runnerContainer "volumeMounts" | default list) -}} {{- $hookPath := (index $kubeMode "hookPath" | default "/home/runner/k8s/index.js") -}} {{- $extensionRef := (index $kubeMode "extensionRef" | default "") -}} {{- $extension := (index $kubeMode "extension" | default dict) -}} @@ -32,6 +34,9 @@ {{- if not (kindIs "string" $hookPath) -}} {{- fail "runner.kubernetesMode.hookPath must be a string" -}} {{- end -}} +{{- if not (kindIs "slice" $runnerContainerVolumeMounts) -}} + {{- fail "runner.container.volumeMounts must be a list" -}} +{{- end -}} {{- if not (kindIs "string" $extensionRef) -}} {{- fail "runner.kubernetesMode.extensionRef must be a string" -}} {{- end -}} @@ -82,6 +87,9 @@ volumeMounts: subPath: extension readOnly: true {{- end }} + {{- with $runnerContainerVolumeMounts }} + {{- toYaml . | nindent 2 }} + {{- end }} {{ include "githubServerTLS.volumeMountItem" (dict "root" $ "existingVolumeMounts" (list)) | nindent 2 }} {{- end }} diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 616c0e04..5cc47fcc 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -320,7 +320,17 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ // If the runner pod did not have chance to start, terminated state may not be set. // Therefore, we should try to restart it. if cs == nil || cs.State.Terminated == nil { - log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted") + missingDetails := "unknown" + if cs == nil { + missingDetails = "no container status available" + } else if cs.State.Terminated == nil { + missingDetails = "container termination details not yet available" + } + log.Info("Runner container termination details incomplete, proceeding with cleanup based on pod-level status", + "missingDetails", missingDetails, + "podReason", pod.Status.Reason, + "podMessage", pod.Status.Message, + ) return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) } @@ -386,34 +396,226 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { + // Extract terminal failure reason from pod for observability + podReason := pod.Status.Reason + if podReason == "" && pod.Status.Phase == corev1.PodFailed { + podReason = "PodFailed" + } + + cs := runnerContainerStatus(pod) + var exitCode int32 = -1 + if cs != nil && cs.State.Terminated != nil { + exitCode = cs.State.Terminated.ExitCode + } + + isOOMKilled := exitCode == 137 || podReason == "OOMKilled" + if isOOMKilled { + log.Info("Handling OOMKilled as terminal failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + ) + + if ephemeralRunner.Status.RunnerID != 0 { + log.Info("Attempting to remove registered runner from service for terminal OOMKilled failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + ) + if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil { + log.Error(err, "Failed to remove runner from service for terminal OOMKilled failure; will retry on next reconciliation", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + "retryable", true, + ) + return err + } + log.Info("Successfully removed runner from service for terminal OOMKilled failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + "cleanupOutcome", "success", + ) + } else { + log.Info("Skipping runner removal from service for terminal OOMKilled failure because runner is not registered", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + "cleanupOutcome", "skipped_unregistered", + ) + } + + log.Info("Deleting ephemeral runner after terminal OOMKilled failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + ) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete ephemeral runner after terminal OOMKilled failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + ) + return err + } + log.Info("Deleted ephemeral runner after terminal OOMKilled failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + "oomKilled", true, + "cleanupOutcome", "cr_deleted", + ) + return nil + } + if ephemeralRunner.HasJob() { log.Error( errors.New("ephemeral runner has a job assigned, but the pod has failed"), "Ephemeral runner either has faulty entrypoint or something external killing the runner", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, ) - log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed") - if err := r.Delete(ctx, ephemeralRunner); err != nil { - log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed") - return err + + if ephemeralRunner.Status.RunnerID != 0 { + log.Info("Attempting to remove registered runner from service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + ) + if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil { + log.Error(err, "Failed to remove runner from service; will retry on next reconciliation", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + "retryable", true, + ) + return err + } + log.Info("Successfully removed runner from service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + "cleanupOutcome", "success", + ) + } else { + log.Info("Skipping runner removal from service because runner is not registered", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + "cleanupOutcome", "skipped_unregistered", + ) } - log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed") - log.Info("Trying to remove the runner from the service") - actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) - if err != nil { - log.Error(err, "Failed to get actions client for removing the runner from the service") - return nil + log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + ) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + ) + return err } - if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID)); err != nil { - log.Error(err, "Failed to remove the runner from the service") - return nil - } - log.Info("Removed the runner from the service") + log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", true, + "reason", podReason, + "exitCode", exitCode, + "cleanupOutcome", "cr_deleted", + ) return nil } + // HasJob=false path: still need to clean up if runner is registered + if ephemeralRunner.Status.RunnerID != 0 { + log.Info("Attempting to remove registered runner from service (HasJob=false)", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", false, + "reason", podReason, + "exitCode", exitCode, + ) + if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil { + log.Error(err, "Failed to remove runner from service; will retry on next reconciliation", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", false, + "reason", podReason, + "exitCode", exitCode, + "retryable", true, + ) + return err + } + log.Info("Successfully removed runner from service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", false, + "reason", podReason, + "exitCode", exitCode, + "cleanupOutcome", "success", + ) + } else { + log.Info("Skipping runner removal from service because runner is not registered", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", false, + "reason", podReason, + "exitCode", exitCode, + "cleanupOutcome", "skipped_unregistered", + ) + } + if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { - log.Error(err, "Failed to delete runner pod on failure") + log.Error(err, "Failed to delete runner pod on failure", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "hasJob", ephemeralRunner.HasJob(), + "reason", podReason, + "exitCode", exitCode, + ) return err } @@ -561,21 +763,46 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte } func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error { - log.Info("Updating ephemeral runner status to Failed") + log.Info("Updating ephemeral runner status to Failed", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "reason", reason, + "message", errMessage, + ) if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { obj.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed obj.Status.Reason = reason obj.Status.Message = errMessage }); err != nil { + log.Error(err, "Failed to update ephemeral runner status Phase/Message", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "reason", reason, + ) return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %w", err) } - log.Info("Removing the runner from the service") + log.Info("Removing the runner from the service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "reason", reason, + ) if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil { + log.Error(err, "Failed to remove the runner from service; will retry on next reconciliation", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "reason", reason, + "retryable", true, + ) return fmt.Errorf("failed to remove the runner from service: %w", err) } - log.Info("EphemeralRunner is marked as Failed and deleted from the service") + log.Info("EphemeralRunner is marked as Failed and deleted from the service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "reason", reason, + "cleanupOutcome", "success", + ) return nil } @@ -616,8 +843,16 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem } obj.Status.Failures[string(pod.UID)] = metav1.Now() obj.Status.Ready = false + obj.Status.Reason = pod.Status.Reason + if obj.Status.Reason == "" { + obj.Status.Reason = string(corev1.PodFailed) + } + obj.Status.Message = pod.Status.Message + if obj.Status.Message == "" { + obj.Status.Message = "Pod failed without detailed termination information" + } }); err != nil { return fmt.Errorf("failed to update ephemeral runner status: failed attempts: %w", err) } @@ -829,16 +1064,32 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context, func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { client, err := r.GetActionsService(ctx, ephemeralRunner) if err != nil { + log.Error(err, "Failed to get actions client for runner", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + ) return fmt.Errorf("failed to get actions client for runner: %w", err) } - log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerID) + log.Info("Removing runner from the service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + ) err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID)) if err != nil { + log.Error(err, "Failed to remove runner from the service; cleanup will be retried", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "retryable", true, + ) return fmt.Errorf("failed to remove runner from the service: %w", err) } - log.Info("Removed runner from the service", "runnerId", ephemeralRunner.Status.RunnerID) + log.Info("Removed runner from the service", + "runnerName", ephemeralRunner.Status.RunnerName, + "runnerId", ephemeralRunner.Status.RunnerID, + "cleanupOutcome", "success", + ) return nil } diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 559d9469..0880b526 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -4,12 +4,14 @@ import ( "context" "crypto/tls" "encoding/base64" + "errors" "fmt" "net/http" "net/http/httptest" "os" "path/filepath" "strings" + "sync" "time" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -1086,6 +1088,719 @@ var _ = Describe("EphemeralRunner", func() { ephemeralRunnerTimeout, ).Should(BeEquivalentTo(v1alpha1.EphemeralRunnerPhaseRunning)) }) + + // ============================================================================== + // Terminal Lifecycle Invariants Matrix for EphemeralRunner Pod Failures + // ============================================================================== + // + // This matrix codifies expected outcomes for all terminal runner failure scenarios. + // These tests drive the controller's reconciliation branching logic. + // + // Dimensions: + // ----------- + // 1. HasJob (JobID set): true | false + // 2. RunnerID: 0 | non-zero + // 3. Container Status: nil/no-terminated | terminated + // 4. Exit Code (if term): 0 | 7 | 137 (OOMKilled) | other non-zero + // 5. Pod.Status.Reason: Evicted | OutOf* | OOMKilled | + // + // Expected Outcomes: + // ------------------ + // HasJob | RunnerID | ContainerStatus | ExitCode | PodReason | Expected Outcome + // -------|----------|-----------------|----------|------------|--------------------------------------------------- + // false | 0 | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable) + // false | 0 | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable) + // false | 0 | nil/no-term | N/A | | DELETE pod, track failure, RECREATE (retryable) + // false | non-zero | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable) + // false | non-zero | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable) + // false | non-zero | nil/no-term | N/A | | DELETE pod, track failure, RECREATE (retryable) + // false | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal) + // false | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal) + // false | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL) + // false | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL) + // false | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL) + // false | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL) + // false | 0 | terminated | other | | DELETE pod, track failure, RECREATE (retryable) + // false | non-zero | terminated | other | | DELETE pod, track failure, RECREATE (retryable) + // true | 0 | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL) + // true | 0 | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL) + // true | 0 | nil/no-term | N/A | | DELETE runner + remove from service (TERMINAL) + // true | non-zero | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL) + // true | non-zero | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL) + // true | non-zero | nil/no-term | N/A | | DELETE runner + remove from service (TERMINAL) + // true | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal) + // true | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal) + // true | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL) + // true | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL) + // true | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL) + // true | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL) + // true | 0 | terminated | other | | DELETE runner + remove from service (TERMINAL) + // true | non-zero | terminated | other | | DELETE runner + remove from service (TERMINAL) + // + // Key Insights: + // ------------- + // 1. HasJob=true always results in TERMINAL deletion (runner + service cleanup) + // 2. HasJob=false with retryable failures (Evicted, OutOf*, no container status) -> RECREATE + // 3. OOMKilled (ExitCode 137, Reason=OOMKilled) is TERMINAL regardless of HasJob status + // 4. ExitCode 0 and 7 are always terminal regardless of HasJob + // 5. Other non-zero exit codes follow the HasJob branching logic + // + // Implementation Notes: + // --------------------- + // - Controller logic: ephemeralrunner_controller.go:314-352 (main PodFailed branch) + // - deleteEphemeralRunnerOrPod: ephemeralrunner_controller.go:388-420 (HasJob fork) + // - deletePodAsFailed: ephemeralrunner_controller.go:604-627 (failure tracking) + // + // ============================================================================== + + Context("Terminal Lifecycle Invariants Matrix", func() { + It("Matrix: HasJob=false, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 137, + Reason: "OOMKilled", + }, + }, + }) + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)") + }) + + It("Matrix: HasJob=true, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "12345" + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("12345")) + + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 137, + Reason: "OOMKilled", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)") + }) + + It("Matrix: HasJob=false, RunnerID=0, ExitCode=137 (OOMKilled) should delete runner (terminal)", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 137, + Reason: "OOMKilled", + }, + }, + }) + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)") + }) + + It("Matrix: HasJob=false, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = nil + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)") + }) + + It("Matrix: HasJob=true, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "67890" + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("67890")) + + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = nil + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)") + }) + + It("Matrix: HasJob=false, no ContainerStatus and empty Pod.Reason should use fallback reason", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "" + pod.Status.Message = "" + pod.Status.ContainerStatuses = nil + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + updated := new(v1alpha1.EphemeralRunner) + Eventually(func() (int, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current) + if err != nil { + return "", err + } + return current.Status.Reason, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Failed"), "reason should use fallback value") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current) + if err != nil { + return "", err + } + return current.Status.Message, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Pod failed without detailed termination information"), "message should use fallback value") + + Eventually(func() error { + pod := new(corev1.Pod) + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout*2, ephemeralRunnerInterval).Should(Succeed(), "pod should be recreated") + }) + }) + + Context("OOMKilled with Transient API Failures", func() { + var ctx context.Context + var autoscalingNS *corev1.Namespace + var configSecret *corev1.Secret + var controller *EphemeralRunnerReconciler + var ephemeralRunner *v1alpha1.EphemeralRunner + var mgr ctrl.Manager + + BeforeEach(func() { + ctx = context.Background() + autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) + configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) + + // Create controller with fake client that fails RemoveRunner + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + ResourceBuilder: ResourceBuilder{ + SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient( + scalefake.WithClient( + scalefake.NewClient( + scalefake.WithGenerateJitRunnerConfig( + &scaleset.RunnerScaleSetJitRunnerConfig{ + Runner: &scaleset.RunnerReference{ID: 2, Name: "test-runner-transient"}, + EncodedJITConfig: "fake-jit-config", + }, + nil, + ), + scalefake.WithRemoveRunner( + errors.New("transient API failure"), + ), + ), + ), + )), + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("test-runner-transient", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("Matrix: HasJob=true, OOMKilled with transient RemoveRunner API failure should keep runner for retry and not set deletion timestamp", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "99999" + er.Status.RunnerID = 2 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("99999")) + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OOMKilled" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 137, + Reason: "OOMKilled", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Consistently(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + if err != nil { + return false + } + return check.DeletionTimestamp == nil && len(check.Finalizers) > 0 + }, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for retry until RemoveRunner succeeds") + }) + + It("Matrix: HasJob=true, non-OOM pod failure with transient RemoveRunner API failure should preserve runner for retry", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "88888" + er.Status.RunnerID = 2 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "CrashLoopBackOff" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Consistently(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + if err != nil { + return false + } + return check.DeletionTimestamp == nil + }, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should remain undeleted so RemoveRunner can be retried") + }) + + It("Matrix: HasJob=false, RunnerID=non-zero, pod failure with transient RemoveRunner API failure should preserve runner for retry", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.RunnerID = 3 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status with runner ID") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "Error" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + updated := new(v1alpha1.EphemeralRunner) + Eventually(func() (int, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked") + + Consistently(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + if err != nil { + return false + } + return check.DeletionTimestamp == nil && len(check.Status.Failures) == 1 + }, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for RemoveRunner retry despite HasJob=false") + }) + }) + + // ============================================================================== + // Convergence Tests: Verify retry-safe cleanup converges correctly + // ============================================================================== + // These tests ensure the retry mechanism: + // 1. Eventually converges when external service recovers + // 2. Respects bounded behavior (no infinite loops) + // 3. Maintains observability (no silent success on persistent failure) + // + // Implementation Reference: + // - Backoff logic: ephemeralrunner_controller.go:229-254 + // - Cleanup retry: ephemeralrunner_controller.go:430-440 + // - MaxFailures boundary: ephemeralrunner_controller.go:229-236 + // ============================================================================== + Context("Retry Convergence and Bounded Behavior", func() { + var callCount int + var mu sync.Mutex + + BeforeEach(func() { + mu.Lock() + callCount = 0 + mu.Unlock() + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + ResourceBuilder: ResourceBuilder{ + SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient( + scalefake.WithClient( + scalefake.NewClient( + scalefake.WithGenerateJitRunnerConfig( + &scaleset.RunnerScaleSetJitRunnerConfig{ + Runner: &scaleset.RunnerReference{ID: 99, Name: "test-convergence"}, + EncodedJITConfig: "fake-jit-config", + }, + nil, + ), + scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error { + mu.Lock() + defer mu.Unlock() + callCount++ + if callCount <= 2 { + return errors.New("transient API failure") + } + return nil + }), + ), + ), + )), + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("test-convergence", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("Convergence: Repeated transient RemoveRunner failures eventually converge when service recovers", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "conv-job-1" + er.Status.RunnerID = 99 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("conv-job-1")) + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "Error" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() int { + mu.Lock() + defer mu.Unlock() + return callCount + }, "30s", ephemeralRunnerInterval).Should(BeNumerically(">=", 3), "RemoveRunner should be retried until success") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, "40s", ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted after cleanup converges") + }) + }) + + Context("Bounded Retry Behavior: MaxFailures Enforcement", func() { + var persistentFailCount int + var muPersistent sync.Mutex + + BeforeEach(func() { + muPersistent.Lock() + persistentFailCount = 0 + muPersistent.Unlock() + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + ResourceBuilder: ResourceBuilder{ + SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient( + scalefake.WithClient( + scalefake.NewClient( + scalefake.WithGenerateJitRunnerConfig( + &scaleset.RunnerScaleSetJitRunnerConfig{ + Runner: &scaleset.RunnerReference{ID: 777, Name: "test-persistent"}, + EncodedJITConfig: "fake-jit-config", + }, + nil, + ), + scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error { + muPersistent.Lock() + defer muPersistent.Unlock() + persistentFailCount++ + return errors.New("persistent API failure") + }), + ), + ), + )), + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("test-persistent", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("Bounded: Persistent RemoveRunner failures do not cause infinite retry (maxFailures enforced)", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "persist-job" + er.Status.RunnerID = 777 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "Error" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Consistently(func() int { + muPersistent.Lock() + defer muPersistent.Unlock() + return persistentFailCount + }, "15s", "1s").Should(BeNumerically("<=", 20), "RemoveRunner should not be called excessively (no infinite loop)") + + check := new(v1alpha1.EphemeralRunner) + err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + if err == nil { + Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner should have deletion timestamp showing cleanup intent") + } else { + Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible error state") + } + }) + + It("Observability: Failed cleanup attempts are visible and not silently succeeded", func() { + pod := new(corev1.Pod) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod") + + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "observe-job" + er.Status.RunnerID = 777 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "Error" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "Error", + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status") + + Eventually(func() int { + muPersistent.Lock() + defer muPersistent.Unlock() + return persistentFailCount + }, "10s", ephemeralRunnerInterval).Should(BeNumerically(">=", 1), "RemoveRunner must be attempted (not silently skipped)") + + time.Sleep(5 * time.Second) + check := new(v1alpha1.EphemeralRunner) + err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + + if err == nil { + Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner must have deletion timestamp showing cleanup attempt") + } else { + Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible deletion intent") + } + + muPersistent.Lock() + finalCount := persistentFailCount + muPersistent.Unlock() + Expect(finalCount).To(BeNumerically(">=", 1), "cleanup must be attempted, not silently skipped") + }) + }) }) Describe("Checking the API", func() { diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go index 4a0a41b3..6116f4bd 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go @@ -1092,6 +1092,150 @@ var _ = Describe("Test EphemeralRunnerSet controller", func() { ephemeralRunnerSetTestInterval, ).Should(BeEquivalentTo(desiredStatus), "Status is not eventually updated to the desired one") }) + + It("Should handle repeated OOMKilled failures without unbounded stuck state growth", func() { + // This test validates that when child EphemeralRunners encounter repeated OOMKilled + // failures, the set-level aggregation remains correct and does not cause unbounded + // stuck state growth. This is a regression test for aggregated stuck-runner behavior. + // + // Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:239 (status update aggregation) + // Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:505 (deleteEphemeralRunnerWithActionsClient cleanup) + + created := new(v1alpha1.EphemeralRunnerSet) + Eventually( + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, created) + }, + ephemeralRunnerSetTestTimeout, + ephemeralRunnerSetTestInterval, + ).Should(Succeed(), "EphemeralRunnerSet should be created") + + // Scale up to 3 runners + updated := created.DeepCopy() + updated.Spec.Replicas = 3 + err := k8sClient.Update(ctx, updated) + Expect(err).NotTo(HaveOccurred(), "failed to update EphemeralRunnerSet replica count") + + runnerList := new(v1alpha1.EphemeralRunnerList) + Eventually( + func() (int, error) { + err := listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace) + if err != nil { + return -1, err + } + return len(runnerList.Items), nil + }, + ephemeralRunnerSetTestTimeout, + ephemeralRunnerSetTestInterval, + ).Should(BeEquivalentTo(3), "3 EphemeralRunners should be created") + + // Simulate repeated OOMKilled failures on all 3 runners + // Each runner will transition: Pending -> Failed (OOMKilled) -> Pending -> Failed (OOMKilled) ... + var runners []*v1alpha1.EphemeralRunner + for i := range runnerList.Items { + runners = append(runners, runnerList.Items[i].DeepCopy()) + } + + // First cycle: transition all runners to Failed with OOMKilled + for i, runner := range runners { + runner.Status.RunnerID = 201 + i + runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed + runner.Status.Reason = "OOMKilled" + err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i])) + Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state") + } + + // Verify aggregated status shows 3 failed runners + desiredStatus := v1alpha1.EphemeralRunnerSetStatus{ + Phase: v1alpha1.EphemeralRunnerSetPhaseRunning, + CurrentReplicas: 3, + PendingEphemeralRunners: 0, + RunningEphemeralRunners: 0, + FailedEphemeralRunners: 3, + } + Eventually( + func() (v1alpha1.EphemeralRunnerSetStatus, error) { + updated := new(v1alpha1.EphemeralRunnerSet) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated) + if err != nil { + return v1alpha1.EphemeralRunnerSetStatus{}, err + } + return updated.Status, nil + }, + ephemeralRunnerSetTestTimeout, + ephemeralRunnerSetTestInterval, + ).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners after first OOMKilled cycle") + + // Second cycle: simulate cleanup and recreation - runners transition back to Pending + // This validates that the set doesn't get stuck with unbounded failed count + runnerList = new(v1alpha1.EphemeralRunnerList) + err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace) + Expect(err).NotTo(HaveOccurred(), "failed to list runners after first OOMKilled cycle") + + for i, runner := range runnerList.Items { + runner := runner.DeepCopy() + runner.Status.Phase = v1alpha1.EphemeralRunnerPhasePending + runner.Status.Reason = "" + err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i])) + Expect(err).NotTo(HaveOccurred(), "failed to patch runner back to pending state") + } + + // Verify aggregated status shows 3 pending runners (not stuck with failed count) + desiredStatus = v1alpha1.EphemeralRunnerSetStatus{ + Phase: v1alpha1.EphemeralRunnerSetPhaseRunning, + CurrentReplicas: 3, + PendingEphemeralRunners: 3, + RunningEphemeralRunners: 0, + FailedEphemeralRunners: 0, + } + Eventually( + func() (v1alpha1.EphemeralRunnerSetStatus, error) { + updated := new(v1alpha1.EphemeralRunnerSet) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated) + if err != nil { + return v1alpha1.EphemeralRunnerSetStatus{}, err + } + return updated.Status, nil + }, + ephemeralRunnerSetTestTimeout, + ephemeralRunnerSetTestInterval, + ).Should(BeEquivalentTo(desiredStatus), "Status should show 3 pending runners after recovery cycle (no unbounded stuck growth)") + + // Third cycle: simulate another OOMKilled failure to verify repeated cycles don't cause stuck state + runnerList = new(v1alpha1.EphemeralRunnerList) + err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace) + Expect(err).NotTo(HaveOccurred(), "failed to list runners for third cycle") + + for i, runner := range runnerList.Items { + runner := runner.DeepCopy() + runner.Status.RunnerID = 301 + i + runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed + runner.Status.Reason = "OOMKilled" + err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i])) + Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state in third cycle") + } + + // Verify aggregated status shows 3 failed runners again (bounded, not growing) + desiredStatus = v1alpha1.EphemeralRunnerSetStatus{ + Phase: v1alpha1.EphemeralRunnerSetPhaseRunning, + CurrentReplicas: 3, + PendingEphemeralRunners: 0, + RunningEphemeralRunners: 0, + FailedEphemeralRunners: 3, + } + Eventually( + func() (v1alpha1.EphemeralRunnerSetStatus, error) { + updated := new(v1alpha1.EphemeralRunnerSet) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated) + if err != nil { + return v1alpha1.EphemeralRunnerSetStatus{}, err + } + return updated.Status, nil + }, + ephemeralRunnerSetTestTimeout, + ephemeralRunnerSetTestInterval, + ).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners in third cycle (bounded, not growing unbounded)") + }) }) }) diff --git a/controllers/actions.github.com/multiclient/fake/client.go b/controllers/actions.github.com/multiclient/fake/client.go index d8f62f59..57fc45e2 100644 --- a/controllers/actions.github.com/multiclient/fake/client.go +++ b/controllers/actions.github.com/multiclient/fake/client.go @@ -71,6 +71,13 @@ func WithRemoveRunner(err error) ClientOption { } } +// WithRemoveRunnerFunc configures a function to handle RemoveRunner calls dynamically +func WithRemoveRunnerFunc(fn func(context.Context, int64) error) ClientOption { + return func(c *Client) { + c.removeRunnerFunc = fn + } +} + // WithGenerateJitRunnerConfig configures the result of GenerateJitRunnerConfig func WithGenerateJitRunnerConfig(result *scaleset.RunnerScaleSetJitRunnerConfig, err error) ClientOption { return func(c *Client) { @@ -113,6 +120,7 @@ func WithUpdateRunnerScaleSetFunc(fn func(context.Context, int, *scaleset.Runner type Client struct { systemInfo scaleset.SystemInfo updateRunnerScaleSetFunc func(context.Context, int, *scaleset.RunnerScaleSet) (*scaleset.RunnerScaleSet, error) + removeRunnerFunc func(context.Context, int64) error getRunnerScaleSetResult struct { *scaleset.RunnerScaleSet @@ -196,6 +204,9 @@ func (c *Client) GetRunnerByName(ctx context.Context, runnerName string) (*scale } func (c *Client) RemoveRunner(ctx context.Context, runnerID int64) error { + if c.removeRunnerFunc != nil { + return c.removeRunnerFunc(ctx, runnerID) + } return c.removeRunnerResult.err }