OOM detection

This commit is contained in:
Nikola Jokic 2026-05-06 19:21:47 +02:00
parent 8c84ab2f42
commit a7cbd0dae3
No known key found for this signature in database
GPG Key ID: 419BB425B0E501B0
5 changed files with 1151 additions and 22 deletions

View File

@ -1,6 +1,8 @@
{{- define "runner-mode-kubernetes.runner-container" -}}
{{- $runner := (.Values.runner | default dict) -}}
{{- $kubeMode := (index $runner "kubernetesMode" | default dict) -}}
{{- $runnerContainer := (index $runner "container" | default dict) -}}
{{- $runnerContainerVolumeMounts := (index $runnerContainer "volumeMounts" | default list) -}}
{{- $hookPath := (index $kubeMode "hookPath" | default "/home/runner/k8s/index.js") -}}
{{- $extensionRef := (index $kubeMode "extensionRef" | default "") -}}
{{- $extension := (index $kubeMode "extension" | default dict) -}}
@ -32,6 +34,9 @@
{{- if not (kindIs "string" $hookPath) -}}
{{- fail "runner.kubernetesMode.hookPath must be a string" -}}
{{- end -}}
{{- if not (kindIs "slice" $runnerContainerVolumeMounts) -}}
{{- fail "runner.container.volumeMounts must be a list" -}}
{{- end -}}
{{- if not (kindIs "string" $extensionRef) -}}
{{- fail "runner.kubernetesMode.extensionRef must be a string" -}}
{{- end -}}
@ -82,6 +87,9 @@ volumeMounts:
subPath: extension
readOnly: true
{{- end }}
{{- with $runnerContainerVolumeMounts }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{ include "githubServerTLS.volumeMountItem" (dict "root" $ "existingVolumeMounts" (list)) | nindent 2 }}
{{- end }}

View File

@ -320,7 +320,17 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
// If the runner pod did not have chance to start, terminated state may not be set.
// Therefore, we should try to restart it.
if cs == nil || cs.State.Terminated == nil {
log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
missingDetails := "unknown"
if cs == nil {
missingDetails = "no container status available"
} else if cs.State.Terminated == nil {
missingDetails = "container termination details not yet available"
}
log.Info("Runner container termination details incomplete, proceeding with cleanup based on pod-level status",
"missingDetails", missingDetails,
"podReason", pod.Status.Reason,
"podMessage", pod.Status.Message,
)
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
}
@ -386,34 +396,226 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}
func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
// Extract terminal failure reason from pod for observability
podReason := pod.Status.Reason
if podReason == "" && pod.Status.Phase == corev1.PodFailed {
podReason = "PodFailed"
}
cs := runnerContainerStatus(pod)
var exitCode int32 = -1
if cs != nil && cs.State.Terminated != nil {
exitCode = cs.State.Terminated.ExitCode
}
isOOMKilled := exitCode == 137 || podReason == "OOMKilled"
if isOOMKilled {
log.Info("Handling OOMKilled as terminal failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
)
if ephemeralRunner.Status.RunnerID != 0 {
log.Info("Attempting to remove registered runner from service for terminal OOMKilled failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
)
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to remove runner from service for terminal OOMKilled failure; will retry on next reconciliation",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
"retryable", true,
)
return err
}
log.Info("Successfully removed runner from service for terminal OOMKilled failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
"cleanupOutcome", "success",
)
} else {
log.Info("Skipping runner removal from service for terminal OOMKilled failure because runner is not registered",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
"cleanupOutcome", "skipped_unregistered",
)
}
log.Info("Deleting ephemeral runner after terminal OOMKilled failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
)
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete ephemeral runner after terminal OOMKilled failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
)
return err
}
log.Info("Deleted ephemeral runner after terminal OOMKilled failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
"oomKilled", true,
"cleanupOutcome", "cr_deleted",
)
return nil
}
if ephemeralRunner.HasJob() {
log.Error(
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
)
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
return err
if ephemeralRunner.Status.RunnerID != 0 {
log.Info("Attempting to remove registered runner from service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
)
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to remove runner from service; will retry on next reconciliation",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
"retryable", true,
)
return err
}
log.Info("Successfully removed runner from service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
"cleanupOutcome", "success",
)
} else {
log.Info("Skipping runner removal from service because runner is not registered",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
"cleanupOutcome", "skipped_unregistered",
)
}
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
log.Info("Trying to remove the runner from the service")
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for removing the runner from the service")
return nil
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
)
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
)
return err
}
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID)); err != nil {
log.Error(err, "Failed to remove the runner from the service")
return nil
}
log.Info("Removed the runner from the service")
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", true,
"reason", podReason,
"exitCode", exitCode,
"cleanupOutcome", "cr_deleted",
)
return nil
}
// HasJob=false path: still need to clean up if runner is registered
if ephemeralRunner.Status.RunnerID != 0 {
log.Info("Attempting to remove registered runner from service (HasJob=false)",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", false,
"reason", podReason,
"exitCode", exitCode,
)
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to remove runner from service; will retry on next reconciliation",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", false,
"reason", podReason,
"exitCode", exitCode,
"retryable", true,
)
return err
}
log.Info("Successfully removed runner from service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", false,
"reason", podReason,
"exitCode", exitCode,
"cleanupOutcome", "success",
)
} else {
log.Info("Skipping runner removal from service because runner is not registered",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", false,
"reason", podReason,
"exitCode", exitCode,
"cleanupOutcome", "skipped_unregistered",
)
}
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
log.Error(err, "Failed to delete runner pod on failure")
log.Error(err, "Failed to delete runner pod on failure",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"hasJob", ephemeralRunner.HasJob(),
"reason", podReason,
"exitCode", exitCode,
)
return err
}
@ -561,21 +763,46 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte
}
func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error {
log.Info("Updating ephemeral runner status to Failed")
log.Info("Updating ephemeral runner status to Failed",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"reason", reason,
"message", errMessage,
)
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
obj.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
obj.Status.Reason = reason
obj.Status.Message = errMessage
}); err != nil {
log.Error(err, "Failed to update ephemeral runner status Phase/Message",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"reason", reason,
)
return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %w", err)
}
log.Info("Removing the runner from the service")
log.Info("Removing the runner from the service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"reason", reason,
)
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to remove the runner from service; will retry on next reconciliation",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"reason", reason,
"retryable", true,
)
return fmt.Errorf("failed to remove the runner from service: %w", err)
}
log.Info("EphemeralRunner is marked as Failed and deleted from the service")
log.Info("EphemeralRunner is marked as Failed and deleted from the service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"reason", reason,
"cleanupOutcome", "success",
)
return nil
}
@ -616,8 +843,16 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
}
obj.Status.Failures[string(pod.UID)] = metav1.Now()
obj.Status.Ready = false
obj.Status.Reason = pod.Status.Reason
if obj.Status.Reason == "" {
obj.Status.Reason = string(corev1.PodFailed)
}
obj.Status.Message = pod.Status.Message
if obj.Status.Message == "" {
obj.Status.Message = "Pod failed without detailed termination information"
}
}); err != nil {
return fmt.Errorf("failed to update ephemeral runner status: failed attempts: %w", err)
}
@ -829,16 +1064,32 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
client, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for runner",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
)
return fmt.Errorf("failed to get actions client for runner: %w", err)
}
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerID)
log.Info("Removing runner from the service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
)
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID))
if err != nil {
log.Error(err, "Failed to remove runner from the service; cleanup will be retried",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"retryable", true,
)
return fmt.Errorf("failed to remove runner from the service: %w", err)
}
log.Info("Removed runner from the service", "runnerId", ephemeralRunner.Status.RunnerID)
log.Info("Removed runner from the service",
"runnerName", ephemeralRunner.Status.RunnerName,
"runnerId", ephemeralRunner.Status.RunnerID,
"cleanupOutcome", "success",
)
return nil
}

View File

@ -4,12 +4,14 @@ import (
"context"
"crypto/tls"
"encoding/base64"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
@ -1086,6 +1088,719 @@ var _ = Describe("EphemeralRunner", func() {
ephemeralRunnerTimeout,
).Should(BeEquivalentTo(v1alpha1.EphemeralRunnerPhaseRunning))
})
// ==============================================================================
// Terminal Lifecycle Invariants Matrix for EphemeralRunner Pod Failures
// ==============================================================================
//
// This matrix codifies expected outcomes for all terminal runner failure scenarios.
// These tests drive the controller's reconciliation branching logic.
//
// Dimensions:
// -----------
// 1. HasJob (JobID set): true | false
// 2. RunnerID: 0 | non-zero
// 3. Container Status: nil/no-terminated | terminated
// 4. Exit Code (if term): 0 | 7 | 137 (OOMKilled) | other non-zero
// 5. Pod.Status.Reason: Evicted | OutOf* | OOMKilled | <empty>
//
// Expected Outcomes:
// ------------------
// HasJob | RunnerID | ContainerStatus | ExitCode | PodReason | Expected Outcome
// -------|----------|-----------------|----------|------------|---------------------------------------------------
// false | 0 | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable)
// false | 0 | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable)
// false | 0 | nil/no-term | N/A | <empty> | DELETE pod, track failure, RECREATE (retryable)
// false | non-zero | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable)
// false | non-zero | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable)
// false | non-zero | nil/no-term | N/A | <empty> | DELETE pod, track failure, RECREATE (retryable)
// false | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
// false | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
// false | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL)
// false | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL)
// false | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
// false | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
// false | 0 | terminated | other | <empty> | DELETE pod, track failure, RECREATE (retryable)
// false | non-zero | terminated | other | <empty> | DELETE pod, track failure, RECREATE (retryable)
// true | 0 | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL)
// true | 0 | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL)
// true | 0 | nil/no-term | N/A | <empty> | DELETE runner + remove from service (TERMINAL)
// true | non-zero | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL)
// true | non-zero | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL)
// true | non-zero | nil/no-term | N/A | <empty> | DELETE runner + remove from service (TERMINAL)
// true | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
// true | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
// true | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL)
// true | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL)
// true | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
// true | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
// true | 0 | terminated | other | <empty> | DELETE runner + remove from service (TERMINAL)
// true | non-zero | terminated | other | <empty> | DELETE runner + remove from service (TERMINAL)
//
// Key Insights:
// -------------
// 1. HasJob=true always results in TERMINAL deletion (runner + service cleanup)
// 2. HasJob=false with retryable failures (Evicted, OutOf*, no container status) -> RECREATE
// 3. OOMKilled (ExitCode 137, Reason=OOMKilled) is TERMINAL regardless of HasJob status
// 4. ExitCode 0 and 7 are always terminal regardless of HasJob
// 5. Other non-zero exit codes follow the HasJob branching logic
//
// Implementation Notes:
// ---------------------
// - Controller logic: ephemeralrunner_controller.go:314-352 (main PodFailed branch)
// - deleteEphemeralRunnerOrPod: ephemeralrunner_controller.go:388-420 (HasJob fork)
// - deletePodAsFailed: ephemeralrunner_controller.go:604-627 (failure tracking)
//
// ==============================================================================
Context("Terminal Lifecycle Invariants Matrix", func() {
It("Matrix: HasJob=false, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 137,
Reason: "OOMKilled",
},
},
})
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
})
It("Matrix: HasJob=true, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() {
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "12345"
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
return "", err
}
return current.Status.JobID, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("12345"))
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 137,
Reason: "OOMKilled",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
})
It("Matrix: HasJob=false, RunnerID=0, ExitCode=137 (OOMKilled) should delete runner (terminal)", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 137,
Reason: "OOMKilled",
},
},
})
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
})
It("Matrix: HasJob=false, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = nil
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
})
It("Matrix: HasJob=true, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() {
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "67890"
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
return "", err
}
return current.Status.JobID, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("67890"))
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = nil
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
})
It("Matrix: HasJob=false, no ContainerStatus and empty Pod.Reason should use fallback reason", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = ""
pod.Status.Message = ""
pod.Status.ContainerStatuses = nil
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (int, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return len(updated.Status.Failures), nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current)
if err != nil {
return "", err
}
return current.Status.Reason, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Failed"), "reason should use fallback value")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current)
if err != nil {
return "", err
}
return current.Status.Message, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Pod failed without detailed termination information"), "message should use fallback value")
Eventually(func() error {
pod := new(corev1.Pod)
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout*2, ephemeralRunnerInterval).Should(Succeed(), "pod should be recreated")
})
})
Context("OOMKilled with Transient API Failures", func() {
var ctx context.Context
var autoscalingNS *corev1.Namespace
var configSecret *corev1.Secret
var controller *EphemeralRunnerReconciler
var ephemeralRunner *v1alpha1.EphemeralRunner
var mgr ctrl.Manager
BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
// Create controller with fake client that fails RemoveRunner
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
scalefake.WithClient(
scalefake.NewClient(
scalefake.WithGenerateJitRunnerConfig(
&scaleset.RunnerScaleSetJitRunnerConfig{
Runner: &scaleset.RunnerReference{ID: 2, Name: "test-runner-transient"},
EncodedJITConfig: "fake-jit-config",
},
nil,
),
scalefake.WithRemoveRunner(
errors.New("transient API failure"),
),
),
),
)),
},
}
err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")
ephemeralRunner = newExampleRunner("test-runner-transient", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")
startManagers(GinkgoT(), mgr)
})
It("Matrix: HasJob=true, OOMKilled with transient RemoveRunner API failure should keep runner for retry and not set deletion timestamp", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "99999"
er.Status.RunnerID = 2
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
return "", err
}
return current.Status.JobID, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("99999"))
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "OOMKilled"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 137,
Reason: "OOMKilled",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Consistently(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
if err != nil {
return false
}
return check.DeletionTimestamp == nil && len(check.Finalizers) > 0
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for retry until RemoveRunner succeeds")
})
It("Matrix: HasJob=true, non-OOM pod failure with transient RemoveRunner API failure should preserve runner for retry", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "88888"
er.Status.RunnerID = 2
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "CrashLoopBackOff"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Consistently(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
if err != nil {
return false
}
return check.DeletionTimestamp == nil
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should remain undeleted so RemoveRunner can be retried")
})
It("Matrix: HasJob=false, RunnerID=non-zero, pod failure with transient RemoveRunner API failure should preserve runner for retry", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.RunnerID = 3
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status with runner ID")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "Error"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (int, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return len(updated.Status.Failures), nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked")
Consistently(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
if err != nil {
return false
}
return check.DeletionTimestamp == nil && len(check.Status.Failures) == 1
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for RemoveRunner retry despite HasJob=false")
})
})
// ==============================================================================
// Convergence Tests: Verify retry-safe cleanup converges correctly
// ==============================================================================
// These tests ensure the retry mechanism:
// 1. Eventually converges when external service recovers
// 2. Respects bounded behavior (no infinite loops)
// 3. Maintains observability (no silent success on persistent failure)
//
// Implementation Reference:
// - Backoff logic: ephemeralrunner_controller.go:229-254
// - Cleanup retry: ephemeralrunner_controller.go:430-440
// - MaxFailures boundary: ephemeralrunner_controller.go:229-236
// ==============================================================================
Context("Retry Convergence and Bounded Behavior", func() {
var callCount int
var mu sync.Mutex
BeforeEach(func() {
mu.Lock()
callCount = 0
mu.Unlock()
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
scalefake.WithClient(
scalefake.NewClient(
scalefake.WithGenerateJitRunnerConfig(
&scaleset.RunnerScaleSetJitRunnerConfig{
Runner: &scaleset.RunnerReference{ID: 99, Name: "test-convergence"},
EncodedJITConfig: "fake-jit-config",
},
nil,
),
scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error {
mu.Lock()
defer mu.Unlock()
callCount++
if callCount <= 2 {
return errors.New("transient API failure")
}
return nil
}),
),
),
)),
},
}
err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")
ephemeralRunner = newExampleRunner("test-convergence", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")
startManagers(GinkgoT(), mgr)
})
It("Convergence: Repeated transient RemoveRunner failures eventually converge when service recovers", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "conv-job-1"
er.Status.RunnerID = 99
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
return "", err
}
return current.Status.JobID, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("conv-job-1"))
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "Error"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() int {
mu.Lock()
defer mu.Unlock()
return callCount
}, "30s", ephemeralRunnerInterval).Should(BeNumerically(">=", 3), "RemoveRunner should be retried until success")
Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, "40s", ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted after cleanup converges")
})
})
Context("Bounded Retry Behavior: MaxFailures Enforcement", func() {
var persistentFailCount int
var muPersistent sync.Mutex
BeforeEach(func() {
muPersistent.Lock()
persistentFailCount = 0
muPersistent.Unlock()
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
scalefake.WithClient(
scalefake.NewClient(
scalefake.WithGenerateJitRunnerConfig(
&scaleset.RunnerScaleSetJitRunnerConfig{
Runner: &scaleset.RunnerReference{ID: 777, Name: "test-persistent"},
EncodedJITConfig: "fake-jit-config",
},
nil,
),
scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error {
muPersistent.Lock()
defer muPersistent.Unlock()
persistentFailCount++
return errors.New("persistent API failure")
}),
),
),
)),
},
}
err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")
ephemeralRunner = newExampleRunner("test-persistent", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")
startManagers(GinkgoT(), mgr)
})
It("Bounded: Persistent RemoveRunner failures do not cause infinite retry (maxFailures enforced)", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "persist-job"
er.Status.RunnerID = 777
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "Error"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Consistently(func() int {
muPersistent.Lock()
defer muPersistent.Unlock()
return persistentFailCount
}, "15s", "1s").Should(BeNumerically("<=", 20), "RemoveRunner should not be called excessively (no infinite loop)")
check := new(v1alpha1.EphemeralRunner)
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
if err == nil {
Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner should have deletion timestamp showing cleanup intent")
} else {
Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible error state")
}
})
It("Observability: Failed cleanup attempts are visible and not silently succeeded", func() {
pod := new(corev1.Pod)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
er.Status.JobID = "observe-job"
er.Status.RunnerID = 777
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
pod.Status.Phase = corev1.PodFailed
pod.Status.Reason = "Error"
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "Error",
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status")
Eventually(func() int {
muPersistent.Lock()
defer muPersistent.Unlock()
return persistentFailCount
}, "10s", ephemeralRunnerInterval).Should(BeNumerically(">=", 1), "RemoveRunner must be attempted (not silently skipped)")
time.Sleep(5 * time.Second)
check := new(v1alpha1.EphemeralRunner)
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
if err == nil {
Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner must have deletion timestamp showing cleanup attempt")
} else {
Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible deletion intent")
}
muPersistent.Lock()
finalCount := persistentFailCount
muPersistent.Unlock()
Expect(finalCount).To(BeNumerically(">=", 1), "cleanup must be attempted, not silently skipped")
})
})
})
Describe("Checking the API", func() {

View File

@ -1092,6 +1092,150 @@ var _ = Describe("Test EphemeralRunnerSet controller", func() {
ephemeralRunnerSetTestInterval,
).Should(BeEquivalentTo(desiredStatus), "Status is not eventually updated to the desired one")
})
It("Should handle repeated OOMKilled failures without unbounded stuck state growth", func() {
// This test validates that when child EphemeralRunners encounter repeated OOMKilled
// failures, the set-level aggregation remains correct and does not cause unbounded
// stuck state growth. This is a regression test for aggregated stuck-runner behavior.
//
// Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:239 (status update aggregation)
// Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:505 (deleteEphemeralRunnerWithActionsClient cleanup)
created := new(v1alpha1.EphemeralRunnerSet)
Eventually(
func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, created)
},
ephemeralRunnerSetTestTimeout,
ephemeralRunnerSetTestInterval,
).Should(Succeed(), "EphemeralRunnerSet should be created")
// Scale up to 3 runners
updated := created.DeepCopy()
updated.Spec.Replicas = 3
err := k8sClient.Update(ctx, updated)
Expect(err).NotTo(HaveOccurred(), "failed to update EphemeralRunnerSet replica count")
runnerList := new(v1alpha1.EphemeralRunnerList)
Eventually(
func() (int, error) {
err := listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
if err != nil {
return -1, err
}
return len(runnerList.Items), nil
},
ephemeralRunnerSetTestTimeout,
ephemeralRunnerSetTestInterval,
).Should(BeEquivalentTo(3), "3 EphemeralRunners should be created")
// Simulate repeated OOMKilled failures on all 3 runners
// Each runner will transition: Pending -> Failed (OOMKilled) -> Pending -> Failed (OOMKilled) ...
var runners []*v1alpha1.EphemeralRunner
for i := range runnerList.Items {
runners = append(runners, runnerList.Items[i].DeepCopy())
}
// First cycle: transition all runners to Failed with OOMKilled
for i, runner := range runners {
runner.Status.RunnerID = 201 + i
runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
runner.Status.Reason = "OOMKilled"
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state")
}
// Verify aggregated status shows 3 failed runners
desiredStatus := v1alpha1.EphemeralRunnerSetStatus{
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
CurrentReplicas: 3,
PendingEphemeralRunners: 0,
RunningEphemeralRunners: 0,
FailedEphemeralRunners: 3,
}
Eventually(
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
updated := new(v1alpha1.EphemeralRunnerSet)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
if err != nil {
return v1alpha1.EphemeralRunnerSetStatus{}, err
}
return updated.Status, nil
},
ephemeralRunnerSetTestTimeout,
ephemeralRunnerSetTestInterval,
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners after first OOMKilled cycle")
// Second cycle: simulate cleanup and recreation - runners transition back to Pending
// This validates that the set doesn't get stuck with unbounded failed count
runnerList = new(v1alpha1.EphemeralRunnerList)
err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
Expect(err).NotTo(HaveOccurred(), "failed to list runners after first OOMKilled cycle")
for i, runner := range runnerList.Items {
runner := runner.DeepCopy()
runner.Status.Phase = v1alpha1.EphemeralRunnerPhasePending
runner.Status.Reason = ""
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
Expect(err).NotTo(HaveOccurred(), "failed to patch runner back to pending state")
}
// Verify aggregated status shows 3 pending runners (not stuck with failed count)
desiredStatus = v1alpha1.EphemeralRunnerSetStatus{
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
CurrentReplicas: 3,
PendingEphemeralRunners: 3,
RunningEphemeralRunners: 0,
FailedEphemeralRunners: 0,
}
Eventually(
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
updated := new(v1alpha1.EphemeralRunnerSet)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
if err != nil {
return v1alpha1.EphemeralRunnerSetStatus{}, err
}
return updated.Status, nil
},
ephemeralRunnerSetTestTimeout,
ephemeralRunnerSetTestInterval,
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 pending runners after recovery cycle (no unbounded stuck growth)")
// Third cycle: simulate another OOMKilled failure to verify repeated cycles don't cause stuck state
runnerList = new(v1alpha1.EphemeralRunnerList)
err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
Expect(err).NotTo(HaveOccurred(), "failed to list runners for third cycle")
for i, runner := range runnerList.Items {
runner := runner.DeepCopy()
runner.Status.RunnerID = 301 + i
runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
runner.Status.Reason = "OOMKilled"
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state in third cycle")
}
// Verify aggregated status shows 3 failed runners again (bounded, not growing)
desiredStatus = v1alpha1.EphemeralRunnerSetStatus{
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
CurrentReplicas: 3,
PendingEphemeralRunners: 0,
RunningEphemeralRunners: 0,
FailedEphemeralRunners: 3,
}
Eventually(
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
updated := new(v1alpha1.EphemeralRunnerSet)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
if err != nil {
return v1alpha1.EphemeralRunnerSetStatus{}, err
}
return updated.Status, nil
},
ephemeralRunnerSetTestTimeout,
ephemeralRunnerSetTestInterval,
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners in third cycle (bounded, not growing unbounded)")
})
})
})

View File

@ -71,6 +71,13 @@ func WithRemoveRunner(err error) ClientOption {
}
}
// WithRemoveRunnerFunc configures a function to handle RemoveRunner calls dynamically
func WithRemoveRunnerFunc(fn func(context.Context, int64) error) ClientOption {
return func(c *Client) {
c.removeRunnerFunc = fn
}
}
// WithGenerateJitRunnerConfig configures the result of GenerateJitRunnerConfig
func WithGenerateJitRunnerConfig(result *scaleset.RunnerScaleSetJitRunnerConfig, err error) ClientOption {
return func(c *Client) {
@ -113,6 +120,7 @@ func WithUpdateRunnerScaleSetFunc(fn func(context.Context, int, *scaleset.Runner
type Client struct {
systemInfo scaleset.SystemInfo
updateRunnerScaleSetFunc func(context.Context, int, *scaleset.RunnerScaleSet) (*scaleset.RunnerScaleSet, error)
removeRunnerFunc func(context.Context, int64) error
getRunnerScaleSetResult struct {
*scaleset.RunnerScaleSet
@ -196,6 +204,9 @@ func (c *Client) GetRunnerByName(ctx context.Context, runnerName string) (*scale
}
func (c *Client) RemoveRunner(ctx context.Context, runnerID int64) error {
if c.removeRunnerFunc != nil {
return c.removeRunnerFunc(ctx, runnerID)
}
return c.removeRunnerResult.err
}