OOM detection
This commit is contained in:
parent
8c84ab2f42
commit
a7cbd0dae3
|
|
@ -1,6 +1,8 @@
|
|||
{{- define "runner-mode-kubernetes.runner-container" -}}
|
||||
{{- $runner := (.Values.runner | default dict) -}}
|
||||
{{- $kubeMode := (index $runner "kubernetesMode" | default dict) -}}
|
||||
{{- $runnerContainer := (index $runner "container" | default dict) -}}
|
||||
{{- $runnerContainerVolumeMounts := (index $runnerContainer "volumeMounts" | default list) -}}
|
||||
{{- $hookPath := (index $kubeMode "hookPath" | default "/home/runner/k8s/index.js") -}}
|
||||
{{- $extensionRef := (index $kubeMode "extensionRef" | default "") -}}
|
||||
{{- $extension := (index $kubeMode "extension" | default dict) -}}
|
||||
|
|
@ -32,6 +34,9 @@
|
|||
{{- if not (kindIs "string" $hookPath) -}}
|
||||
{{- fail "runner.kubernetesMode.hookPath must be a string" -}}
|
||||
{{- end -}}
|
||||
{{- if not (kindIs "slice" $runnerContainerVolumeMounts) -}}
|
||||
{{- fail "runner.container.volumeMounts must be a list" -}}
|
||||
{{- end -}}
|
||||
{{- if not (kindIs "string" $extensionRef) -}}
|
||||
{{- fail "runner.kubernetesMode.extensionRef must be a string" -}}
|
||||
{{- end -}}
|
||||
|
|
@ -82,6 +87,9 @@ volumeMounts:
|
|||
subPath: extension
|
||||
readOnly: true
|
||||
{{- end }}
|
||||
{{- with $runnerContainerVolumeMounts }}
|
||||
{{- toYaml . | nindent 2 }}
|
||||
{{- end }}
|
||||
{{ include "githubServerTLS.volumeMountItem" (dict "root" $ "existingVolumeMounts" (list)) | nindent 2 }}
|
||||
{{- end }}
|
||||
|
||||
|
|
|
|||
|
|
@ -320,7 +320,17 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
// If the runner pod did not have chance to start, terminated state may not be set.
|
||||
// Therefore, we should try to restart it.
|
||||
if cs == nil || cs.State.Terminated == nil {
|
||||
log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
|
||||
missingDetails := "unknown"
|
||||
if cs == nil {
|
||||
missingDetails = "no container status available"
|
||||
} else if cs.State.Terminated == nil {
|
||||
missingDetails = "container termination details not yet available"
|
||||
}
|
||||
log.Info("Runner container termination details incomplete, proceeding with cleanup based on pod-level status",
|
||||
"missingDetails", missingDetails,
|
||||
"podReason", pod.Status.Reason,
|
||||
"podMessage", pod.Status.Message,
|
||||
)
|
||||
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
|
||||
}
|
||||
|
||||
|
|
@ -386,34 +396,226 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
}
|
||||
|
||||
func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
|
||||
// Extract terminal failure reason from pod for observability
|
||||
podReason := pod.Status.Reason
|
||||
if podReason == "" && pod.Status.Phase == corev1.PodFailed {
|
||||
podReason = "PodFailed"
|
||||
}
|
||||
|
||||
cs := runnerContainerStatus(pod)
|
||||
var exitCode int32 = -1
|
||||
if cs != nil && cs.State.Terminated != nil {
|
||||
exitCode = cs.State.Terminated.ExitCode
|
||||
}
|
||||
|
||||
isOOMKilled := exitCode == 137 || podReason == "OOMKilled"
|
||||
if isOOMKilled {
|
||||
log.Info("Handling OOMKilled as terminal failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
)
|
||||
|
||||
if ephemeralRunner.Status.RunnerID != 0 {
|
||||
log.Info("Attempting to remove registered runner from service for terminal OOMKilled failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
)
|
||||
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
||||
log.Error(err, "Failed to remove runner from service for terminal OOMKilled failure; will retry on next reconciliation",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
"retryable", true,
|
||||
)
|
||||
return err
|
||||
}
|
||||
log.Info("Successfully removed runner from service for terminal OOMKilled failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
"cleanupOutcome", "success",
|
||||
)
|
||||
} else {
|
||||
log.Info("Skipping runner removal from service for terminal OOMKilled failure because runner is not registered",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
"cleanupOutcome", "skipped_unregistered",
|
||||
)
|
||||
}
|
||||
|
||||
log.Info("Deleting ephemeral runner after terminal OOMKilled failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
)
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(err, "Failed to delete ephemeral runner after terminal OOMKilled failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
)
|
||||
return err
|
||||
}
|
||||
log.Info("Deleted ephemeral runner after terminal OOMKilled failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"oomKilled", true,
|
||||
"cleanupOutcome", "cr_deleted",
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
if ephemeralRunner.HasJob() {
|
||||
log.Error(
|
||||
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
|
||||
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
|
||||
return err
|
||||
|
||||
if ephemeralRunner.Status.RunnerID != 0 {
|
||||
log.Info("Attempting to remove registered runner from service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
||||
log.Error(err, "Failed to remove runner from service; will retry on next reconciliation",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"retryable", true,
|
||||
)
|
||||
return err
|
||||
}
|
||||
log.Info("Successfully removed runner from service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"cleanupOutcome", "success",
|
||||
)
|
||||
} else {
|
||||
log.Info("Skipping runner removal from service because runner is not registered",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"cleanupOutcome", "skipped_unregistered",
|
||||
)
|
||||
}
|
||||
|
||||
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
|
||||
log.Info("Trying to remove the runner from the service")
|
||||
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to get actions client for removing the runner from the service")
|
||||
return nil
|
||||
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
return err
|
||||
}
|
||||
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID)); err != nil {
|
||||
log.Error(err, "Failed to remove the runner from the service")
|
||||
return nil
|
||||
}
|
||||
log.Info("Removed the runner from the service")
|
||||
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", true,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"cleanupOutcome", "cr_deleted",
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
// HasJob=false path: still need to clean up if runner is registered
|
||||
if ephemeralRunner.Status.RunnerID != 0 {
|
||||
log.Info("Attempting to remove registered runner from service (HasJob=false)",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", false,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
||||
log.Error(err, "Failed to remove runner from service; will retry on next reconciliation",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", false,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"retryable", true,
|
||||
)
|
||||
return err
|
||||
}
|
||||
log.Info("Successfully removed runner from service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", false,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"cleanupOutcome", "success",
|
||||
)
|
||||
} else {
|
||||
log.Info("Skipping runner removal from service because runner is not registered",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", false,
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
"cleanupOutcome", "skipped_unregistered",
|
||||
)
|
||||
}
|
||||
|
||||
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
||||
log.Error(err, "Failed to delete runner pod on failure")
|
||||
log.Error(err, "Failed to delete runner pod on failure",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"hasJob", ephemeralRunner.HasJob(),
|
||||
"reason", podReason,
|
||||
"exitCode", exitCode,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
@ -561,21 +763,46 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte
|
|||
}
|
||||
|
||||
func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error {
|
||||
log.Info("Updating ephemeral runner status to Failed")
|
||||
log.Info("Updating ephemeral runner status to Failed",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"reason", reason,
|
||||
"message", errMessage,
|
||||
)
|
||||
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
||||
obj.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
|
||||
obj.Status.Reason = reason
|
||||
obj.Status.Message = errMessage
|
||||
}); err != nil {
|
||||
log.Error(err, "Failed to update ephemeral runner status Phase/Message",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"reason", reason,
|
||||
)
|
||||
return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %w", err)
|
||||
}
|
||||
|
||||
log.Info("Removing the runner from the service")
|
||||
log.Info("Removing the runner from the service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"reason", reason,
|
||||
)
|
||||
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
||||
log.Error(err, "Failed to remove the runner from service; will retry on next reconciliation",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"reason", reason,
|
||||
"retryable", true,
|
||||
)
|
||||
return fmt.Errorf("failed to remove the runner from service: %w", err)
|
||||
}
|
||||
|
||||
log.Info("EphemeralRunner is marked as Failed and deleted from the service")
|
||||
log.Info("EphemeralRunner is marked as Failed and deleted from the service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"reason", reason,
|
||||
"cleanupOutcome", "success",
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
@ -616,8 +843,16 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
|
|||
}
|
||||
obj.Status.Failures[string(pod.UID)] = metav1.Now()
|
||||
obj.Status.Ready = false
|
||||
|
||||
obj.Status.Reason = pod.Status.Reason
|
||||
if obj.Status.Reason == "" {
|
||||
obj.Status.Reason = string(corev1.PodFailed)
|
||||
}
|
||||
|
||||
obj.Status.Message = pod.Status.Message
|
||||
if obj.Status.Message == "" {
|
||||
obj.Status.Message = "Pod failed without detailed termination information"
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to update ephemeral runner status: failed attempts: %w", err)
|
||||
}
|
||||
|
|
@ -829,16 +1064,32 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
|
|||
func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
||||
client, err := r.GetActionsService(ctx, ephemeralRunner)
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to get actions client for runner",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
)
|
||||
return fmt.Errorf("failed to get actions client for runner: %w", err)
|
||||
}
|
||||
|
||||
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerID)
|
||||
log.Info("Removing runner from the service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
)
|
||||
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerID))
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to remove runner from the service; cleanup will be retried",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"retryable", true,
|
||||
)
|
||||
return fmt.Errorf("failed to remove runner from the service: %w", err)
|
||||
}
|
||||
|
||||
log.Info("Removed runner from the service", "runnerId", ephemeralRunner.Status.RunnerID)
|
||||
log.Info("Removed runner from the service",
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
"runnerId", ephemeralRunner.Status.RunnerID,
|
||||
"cleanupOutcome", "success",
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4,12 +4,14 @@ import (
|
|||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
|
||||
|
|
@ -1086,6 +1088,719 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
ephemeralRunnerTimeout,
|
||||
).Should(BeEquivalentTo(v1alpha1.EphemeralRunnerPhaseRunning))
|
||||
})
|
||||
|
||||
// ==============================================================================
|
||||
// Terminal Lifecycle Invariants Matrix for EphemeralRunner Pod Failures
|
||||
// ==============================================================================
|
||||
//
|
||||
// This matrix codifies expected outcomes for all terminal runner failure scenarios.
|
||||
// These tests drive the controller's reconciliation branching logic.
|
||||
//
|
||||
// Dimensions:
|
||||
// -----------
|
||||
// 1. HasJob (JobID set): true | false
|
||||
// 2. RunnerID: 0 | non-zero
|
||||
// 3. Container Status: nil/no-terminated | terminated
|
||||
// 4. Exit Code (if term): 0 | 7 | 137 (OOMKilled) | other non-zero
|
||||
// 5. Pod.Status.Reason: Evicted | OutOf* | OOMKilled | <empty>
|
||||
//
|
||||
// Expected Outcomes:
|
||||
// ------------------
|
||||
// HasJob | RunnerID | ContainerStatus | ExitCode | PodReason | Expected Outcome
|
||||
// -------|----------|-----------------|----------|------------|---------------------------------------------------
|
||||
// false | 0 | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | 0 | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | 0 | nil/no-term | N/A | <empty> | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | non-zero | nil/no-term | N/A | Evicted | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | non-zero | nil/no-term | N/A | OutOf* | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | non-zero | nil/no-term | N/A | <empty> | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
|
||||
// false | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
|
||||
// false | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL)
|
||||
// false | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL)
|
||||
// false | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
|
||||
// false | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
|
||||
// false | 0 | terminated | other | <empty> | DELETE pod, track failure, RECREATE (retryable)
|
||||
// false | non-zero | terminated | other | <empty> | DELETE pod, track failure, RECREATE (retryable)
|
||||
// true | 0 | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL)
|
||||
// true | 0 | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL)
|
||||
// true | 0 | nil/no-term | N/A | <empty> | DELETE runner + remove from service (TERMINAL)
|
||||
// true | non-zero | nil/no-term | N/A | Evicted | DELETE runner + remove from service (TERMINAL)
|
||||
// true | non-zero | nil/no-term | N/A | OutOf* | DELETE runner + remove from service (TERMINAL)
|
||||
// true | non-zero | nil/no-term | N/A | <empty> | DELETE runner + remove from service (TERMINAL)
|
||||
// true | 0 | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
|
||||
// true | non-zero | terminated | 0 | N/A | DELETE runner (SUCCESS - terminal)
|
||||
// true | 0 | terminated | 7 | N/A | Mark Outdated (TERMINAL)
|
||||
// true | non-zero | terminated | 7 | N/A | Mark Outdated (TERMINAL)
|
||||
// true | 0 | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
|
||||
// true | non-zero | terminated | 137 | OOMKilled | DELETE runner + remove from service (TERMINAL)
|
||||
// true | 0 | terminated | other | <empty> | DELETE runner + remove from service (TERMINAL)
|
||||
// true | non-zero | terminated | other | <empty> | DELETE runner + remove from service (TERMINAL)
|
||||
//
|
||||
// Key Insights:
|
||||
// -------------
|
||||
// 1. HasJob=true always results in TERMINAL deletion (runner + service cleanup)
|
||||
// 2. HasJob=false with retryable failures (Evicted, OutOf*, no container status) -> RECREATE
|
||||
// 3. OOMKilled (ExitCode 137, Reason=OOMKilled) is TERMINAL regardless of HasJob status
|
||||
// 4. ExitCode 0 and 7 are always terminal regardless of HasJob
|
||||
// 5. Other non-zero exit codes follow the HasJob branching logic
|
||||
//
|
||||
// Implementation Notes:
|
||||
// ---------------------
|
||||
// - Controller logic: ephemeralrunner_controller.go:314-352 (main PodFailed branch)
|
||||
// - deleteEphemeralRunnerOrPod: ephemeralrunner_controller.go:388-420 (HasJob fork)
|
||||
// - deletePodAsFailed: ephemeralrunner_controller.go:604-627 (failure tracking)
|
||||
//
|
||||
// ==============================================================================
|
||||
|
||||
Context("Terminal Lifecycle Invariants Matrix", func() {
|
||||
It("Matrix: HasJob=false, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 137,
|
||||
Reason: "OOMKilled",
|
||||
},
|
||||
},
|
||||
})
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=true, RunnerID=non-zero, ExitCode=137 (OOMKilled) should delete runner and remove from service", func() {
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "12345"
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.JobID, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("12345"))
|
||||
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 137,
|
||||
Reason: "OOMKilled",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=false, RunnerID=0, ExitCode=137 (OOMKilled) should delete runner (terminal)", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 137,
|
||||
Reason: "OOMKilled",
|
||||
},
|
||||
},
|
||||
})
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=false, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = nil
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=true, no ContainerStatus.Terminated, Pod.Reason=OOMKilled should delete runner (terminal)", func() {
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "67890"
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.JobID, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("67890"))
|
||||
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = nil
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted (terminal)")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=false, no ContainerStatus and empty Pod.Reason should use fallback reason", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = ""
|
||||
pod.Status.Message = ""
|
||||
pod.Status.ContainerStatuses = nil
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() (int, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return len(updated.Status.Failures), nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.Reason, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Failed"), "reason should use fallback value")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.Message, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("Pod failed without detailed termination information"), "message should use fallback value")
|
||||
|
||||
Eventually(func() error {
|
||||
pod := new(corev1.Pod)
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout*2, ephemeralRunnerInterval).Should(Succeed(), "pod should be recreated")
|
||||
})
|
||||
})
|
||||
|
||||
Context("OOMKilled with Transient API Failures", func() {
|
||||
var ctx context.Context
|
||||
var autoscalingNS *corev1.Namespace
|
||||
var configSecret *corev1.Secret
|
||||
var controller *EphemeralRunnerReconciler
|
||||
var ephemeralRunner *v1alpha1.EphemeralRunner
|
||||
var mgr ctrl.Manager
|
||||
|
||||
BeforeEach(func() {
|
||||
ctx = context.Background()
|
||||
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
|
||||
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
|
||||
|
||||
// Create controller with fake client that fails RemoveRunner
|
||||
controller = &EphemeralRunnerReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Log: logf.Log,
|
||||
ResourceBuilder: ResourceBuilder{
|
||||
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
|
||||
scalefake.WithClient(
|
||||
scalefake.NewClient(
|
||||
scalefake.WithGenerateJitRunnerConfig(
|
||||
&scaleset.RunnerScaleSetJitRunnerConfig{
|
||||
Runner: &scaleset.RunnerReference{ID: 2, Name: "test-runner-transient"},
|
||||
EncodedJITConfig: "fake-jit-config",
|
||||
},
|
||||
nil,
|
||||
),
|
||||
scalefake.WithRemoveRunner(
|
||||
errors.New("transient API failure"),
|
||||
),
|
||||
),
|
||||
),
|
||||
)),
|
||||
},
|
||||
}
|
||||
|
||||
err := controller.SetupWithManager(mgr)
|
||||
Expect(err).To(BeNil(), "failed to setup controller")
|
||||
|
||||
ephemeralRunner = newExampleRunner("test-runner-transient", autoscalingNS.Name, configSecret.Name)
|
||||
err = k8sClient.Create(ctx, ephemeralRunner)
|
||||
Expect(err).To(BeNil(), "failed to create ephemeral runner")
|
||||
|
||||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=true, OOMKilled with transient RemoveRunner API failure should keep runner for retry and not set deletion timestamp", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "99999"
|
||||
er.Status.RunnerID = 2
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.JobID, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("99999"))
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "OOMKilled"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 137,
|
||||
Reason: "OOMKilled",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Consistently(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return check.DeletionTimestamp == nil && len(check.Finalizers) > 0
|
||||
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for retry until RemoveRunner succeeds")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=true, non-OOM pod failure with transient RemoveRunner API failure should preserve runner for retry", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "88888"
|
||||
er.Status.RunnerID = 2
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status with job ID and runner ID")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "CrashLoopBackOff"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
Reason: "Error",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Consistently(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return check.DeletionTimestamp == nil
|
||||
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should remain undeleted so RemoveRunner can be retried")
|
||||
})
|
||||
|
||||
It("Matrix: HasJob=false, RunnerID=non-zero, pod failure with transient RemoveRunner API failure should preserve runner for retry", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.RunnerID = 3
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status with runner ID")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "Error"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
Reason: "Error",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() (int, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return len(updated.Status.Failures), nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal(1), "failure should be tracked")
|
||||
|
||||
Consistently(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return check.DeletionTimestamp == nil && len(check.Status.Failures) == 1
|
||||
}, "5s", ephemeralRunnerInterval).Should(BeTrue(), "runner should be preserved for RemoveRunner retry despite HasJob=false")
|
||||
})
|
||||
})
|
||||
|
||||
// ==============================================================================
|
||||
// Convergence Tests: Verify retry-safe cleanup converges correctly
|
||||
// ==============================================================================
|
||||
// These tests ensure the retry mechanism:
|
||||
// 1. Eventually converges when external service recovers
|
||||
// 2. Respects bounded behavior (no infinite loops)
|
||||
// 3. Maintains observability (no silent success on persistent failure)
|
||||
//
|
||||
// Implementation Reference:
|
||||
// - Backoff logic: ephemeralrunner_controller.go:229-254
|
||||
// - Cleanup retry: ephemeralrunner_controller.go:430-440
|
||||
// - MaxFailures boundary: ephemeralrunner_controller.go:229-236
|
||||
// ==============================================================================
|
||||
Context("Retry Convergence and Bounded Behavior", func() {
|
||||
var callCount int
|
||||
var mu sync.Mutex
|
||||
|
||||
BeforeEach(func() {
|
||||
mu.Lock()
|
||||
callCount = 0
|
||||
mu.Unlock()
|
||||
|
||||
controller = &EphemeralRunnerReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Log: logf.Log,
|
||||
ResourceBuilder: ResourceBuilder{
|
||||
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
|
||||
scalefake.WithClient(
|
||||
scalefake.NewClient(
|
||||
scalefake.WithGenerateJitRunnerConfig(
|
||||
&scaleset.RunnerScaleSetJitRunnerConfig{
|
||||
Runner: &scaleset.RunnerReference{ID: 99, Name: "test-convergence"},
|
||||
EncodedJITConfig: "fake-jit-config",
|
||||
},
|
||||
nil,
|
||||
),
|
||||
scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callCount++
|
||||
if callCount <= 2 {
|
||||
return errors.New("transient API failure")
|
||||
}
|
||||
return nil
|
||||
}),
|
||||
),
|
||||
),
|
||||
)),
|
||||
},
|
||||
}
|
||||
|
||||
err := controller.SetupWithManager(mgr)
|
||||
Expect(err).To(BeNil(), "failed to setup controller")
|
||||
|
||||
ephemeralRunner = newExampleRunner("test-convergence", autoscalingNS.Name, configSecret.Name)
|
||||
err = k8sClient.Create(ctx, ephemeralRunner)
|
||||
Expect(err).To(BeNil(), "failed to create ephemeral runner")
|
||||
|
||||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("Convergence: Repeated transient RemoveRunner failures eventually converge when service recovers", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "conv-job-1"
|
||||
er.Status.RunnerID = 99
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.JobID, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Equal("conv-job-1"))
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "Error"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
Reason: "Error",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() int {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
return callCount
|
||||
}, "30s", ephemeralRunnerInterval).Should(BeNumerically(">=", 3), "RemoveRunner should be retried until success")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, "40s", ephemeralRunnerInterval).Should(BeTrue(), "ephemeral runner should be deleted after cleanup converges")
|
||||
})
|
||||
})
|
||||
|
||||
Context("Bounded Retry Behavior: MaxFailures Enforcement", func() {
|
||||
var persistentFailCount int
|
||||
var muPersistent sync.Mutex
|
||||
|
||||
BeforeEach(func() {
|
||||
muPersistent.Lock()
|
||||
persistentFailCount = 0
|
||||
muPersistent.Unlock()
|
||||
|
||||
controller = &EphemeralRunnerReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Log: logf.Log,
|
||||
ResourceBuilder: ResourceBuilder{
|
||||
SecretResolver: secretresolver.New(mgr.GetClient(), scalefake.NewMultiClient(
|
||||
scalefake.WithClient(
|
||||
scalefake.NewClient(
|
||||
scalefake.WithGenerateJitRunnerConfig(
|
||||
&scaleset.RunnerScaleSetJitRunnerConfig{
|
||||
Runner: &scaleset.RunnerReference{ID: 777, Name: "test-persistent"},
|
||||
EncodedJITConfig: "fake-jit-config",
|
||||
},
|
||||
nil,
|
||||
),
|
||||
scalefake.WithRemoveRunnerFunc(func(ctx context.Context, runnerID int64) error {
|
||||
muPersistent.Lock()
|
||||
defer muPersistent.Unlock()
|
||||
persistentFailCount++
|
||||
return errors.New("persistent API failure")
|
||||
}),
|
||||
),
|
||||
),
|
||||
)),
|
||||
},
|
||||
}
|
||||
|
||||
err := controller.SetupWithManager(mgr)
|
||||
Expect(err).To(BeNil(), "failed to setup controller")
|
||||
|
||||
ephemeralRunner = newExampleRunner("test-persistent", autoscalingNS.Name, configSecret.Name)
|
||||
err = k8sClient.Create(ctx, ephemeralRunner)
|
||||
Expect(err).To(BeNil(), "failed to create ephemeral runner")
|
||||
|
||||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("Bounded: Persistent RemoveRunner failures do not cause infinite retry (maxFailures enforced)", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "persist-job"
|
||||
er.Status.RunnerID = 777
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "Error"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
Reason: "Error",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Consistently(func() int {
|
||||
muPersistent.Lock()
|
||||
defer muPersistent.Unlock()
|
||||
return persistentFailCount
|
||||
}, "15s", "1s").Should(BeNumerically("<=", 20), "RemoveRunner should not be called excessively (no infinite loop)")
|
||||
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
if err == nil {
|
||||
Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner should have deletion timestamp showing cleanup intent")
|
||||
} else {
|
||||
Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible error state")
|
||||
}
|
||||
})
|
||||
|
||||
It("Observability: Failed cleanup attempts are visible and not silently succeeded", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")
|
||||
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "observe-job"
|
||||
er.Status.RunnerID = 777
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.Reason = "Error"
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
Reason: "Error",
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
Eventually(func() int {
|
||||
muPersistent.Lock()
|
||||
defer muPersistent.Unlock()
|
||||
return persistentFailCount
|
||||
}, "10s", ephemeralRunnerInterval).Should(BeNumerically(">=", 1), "RemoveRunner must be attempted (not silently skipped)")
|
||||
|
||||
time.Sleep(5 * time.Second)
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
|
||||
if err == nil {
|
||||
Expect(check.DeletionTimestamp).NotTo(BeNil(), "runner must have deletion timestamp showing cleanup attempt")
|
||||
} else {
|
||||
Expect(kerrors.IsNotFound(err)).To(BeTrue(), "runner either deleted or has visible deletion intent")
|
||||
}
|
||||
|
||||
muPersistent.Lock()
|
||||
finalCount := persistentFailCount
|
||||
muPersistent.Unlock()
|
||||
Expect(finalCount).To(BeNumerically(">=", 1), "cleanup must be attempted, not silently skipped")
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Describe("Checking the API", func() {
|
||||
|
|
|
|||
|
|
@ -1092,6 +1092,150 @@ var _ = Describe("Test EphemeralRunnerSet controller", func() {
|
|||
ephemeralRunnerSetTestInterval,
|
||||
).Should(BeEquivalentTo(desiredStatus), "Status is not eventually updated to the desired one")
|
||||
})
|
||||
|
||||
It("Should handle repeated OOMKilled failures without unbounded stuck state growth", func() {
|
||||
// This test validates that when child EphemeralRunners encounter repeated OOMKilled
|
||||
// failures, the set-level aggregation remains correct and does not cause unbounded
|
||||
// stuck state growth. This is a regression test for aggregated stuck-runner behavior.
|
||||
//
|
||||
// Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:239 (status update aggregation)
|
||||
// Pattern: controllers/actions.github.com/ephemeralrunnerset_controller.go:505 (deleteEphemeralRunnerWithActionsClient cleanup)
|
||||
|
||||
created := new(v1alpha1.EphemeralRunnerSet)
|
||||
Eventually(
|
||||
func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, created)
|
||||
},
|
||||
ephemeralRunnerSetTestTimeout,
|
||||
ephemeralRunnerSetTestInterval,
|
||||
).Should(Succeed(), "EphemeralRunnerSet should be created")
|
||||
|
||||
// Scale up to 3 runners
|
||||
updated := created.DeepCopy()
|
||||
updated.Spec.Replicas = 3
|
||||
err := k8sClient.Update(ctx, updated)
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to update EphemeralRunnerSet replica count")
|
||||
|
||||
runnerList := new(v1alpha1.EphemeralRunnerList)
|
||||
Eventually(
|
||||
func() (int, error) {
|
||||
err := listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
return len(runnerList.Items), nil
|
||||
},
|
||||
ephemeralRunnerSetTestTimeout,
|
||||
ephemeralRunnerSetTestInterval,
|
||||
).Should(BeEquivalentTo(3), "3 EphemeralRunners should be created")
|
||||
|
||||
// Simulate repeated OOMKilled failures on all 3 runners
|
||||
// Each runner will transition: Pending -> Failed (OOMKilled) -> Pending -> Failed (OOMKilled) ...
|
||||
var runners []*v1alpha1.EphemeralRunner
|
||||
for i := range runnerList.Items {
|
||||
runners = append(runners, runnerList.Items[i].DeepCopy())
|
||||
}
|
||||
|
||||
// First cycle: transition all runners to Failed with OOMKilled
|
||||
for i, runner := range runners {
|
||||
runner.Status.RunnerID = 201 + i
|
||||
runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
|
||||
runner.Status.Reason = "OOMKilled"
|
||||
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state")
|
||||
}
|
||||
|
||||
// Verify aggregated status shows 3 failed runners
|
||||
desiredStatus := v1alpha1.EphemeralRunnerSetStatus{
|
||||
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
|
||||
CurrentReplicas: 3,
|
||||
PendingEphemeralRunners: 0,
|
||||
RunningEphemeralRunners: 0,
|
||||
FailedEphemeralRunners: 3,
|
||||
}
|
||||
Eventually(
|
||||
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
|
||||
updated := new(v1alpha1.EphemeralRunnerSet)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
|
||||
if err != nil {
|
||||
return v1alpha1.EphemeralRunnerSetStatus{}, err
|
||||
}
|
||||
return updated.Status, nil
|
||||
},
|
||||
ephemeralRunnerSetTestTimeout,
|
||||
ephemeralRunnerSetTestInterval,
|
||||
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners after first OOMKilled cycle")
|
||||
|
||||
// Second cycle: simulate cleanup and recreation - runners transition back to Pending
|
||||
// This validates that the set doesn't get stuck with unbounded failed count
|
||||
runnerList = new(v1alpha1.EphemeralRunnerList)
|
||||
err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to list runners after first OOMKilled cycle")
|
||||
|
||||
for i, runner := range runnerList.Items {
|
||||
runner := runner.DeepCopy()
|
||||
runner.Status.Phase = v1alpha1.EphemeralRunnerPhasePending
|
||||
runner.Status.Reason = ""
|
||||
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to patch runner back to pending state")
|
||||
}
|
||||
|
||||
// Verify aggregated status shows 3 pending runners (not stuck with failed count)
|
||||
desiredStatus = v1alpha1.EphemeralRunnerSetStatus{
|
||||
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
|
||||
CurrentReplicas: 3,
|
||||
PendingEphemeralRunners: 3,
|
||||
RunningEphemeralRunners: 0,
|
||||
FailedEphemeralRunners: 0,
|
||||
}
|
||||
Eventually(
|
||||
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
|
||||
updated := new(v1alpha1.EphemeralRunnerSet)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
|
||||
if err != nil {
|
||||
return v1alpha1.EphemeralRunnerSetStatus{}, err
|
||||
}
|
||||
return updated.Status, nil
|
||||
},
|
||||
ephemeralRunnerSetTestTimeout,
|
||||
ephemeralRunnerSetTestInterval,
|
||||
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 pending runners after recovery cycle (no unbounded stuck growth)")
|
||||
|
||||
// Third cycle: simulate another OOMKilled failure to verify repeated cycles don't cause stuck state
|
||||
runnerList = new(v1alpha1.EphemeralRunnerList)
|
||||
err = listEphemeralRunnersAndRemoveFinalizers(ctx, k8sClient, runnerList, ephemeralRunnerSet.Namespace)
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to list runners for third cycle")
|
||||
|
||||
for i, runner := range runnerList.Items {
|
||||
runner := runner.DeepCopy()
|
||||
runner.Status.RunnerID = 301 + i
|
||||
runner.Status.Phase = v1alpha1.EphemeralRunnerPhaseFailed
|
||||
runner.Status.Reason = "OOMKilled"
|
||||
err = k8sClient.Status().Patch(ctx, runner, client.MergeFrom(&runnerList.Items[i]))
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to patch runner to failed state in third cycle")
|
||||
}
|
||||
|
||||
// Verify aggregated status shows 3 failed runners again (bounded, not growing)
|
||||
desiredStatus = v1alpha1.EphemeralRunnerSetStatus{
|
||||
Phase: v1alpha1.EphemeralRunnerSetPhaseRunning,
|
||||
CurrentReplicas: 3,
|
||||
PendingEphemeralRunners: 0,
|
||||
RunningEphemeralRunners: 0,
|
||||
FailedEphemeralRunners: 3,
|
||||
}
|
||||
Eventually(
|
||||
func() (v1alpha1.EphemeralRunnerSetStatus, error) {
|
||||
updated := new(v1alpha1.EphemeralRunnerSet)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunnerSet.Name, Namespace: ephemeralRunnerSet.Namespace}, updated)
|
||||
if err != nil {
|
||||
return v1alpha1.EphemeralRunnerSetStatus{}, err
|
||||
}
|
||||
return updated.Status, nil
|
||||
},
|
||||
ephemeralRunnerSetTestTimeout,
|
||||
ephemeralRunnerSetTestInterval,
|
||||
).Should(BeEquivalentTo(desiredStatus), "Status should show 3 failed runners in third cycle (bounded, not growing unbounded)")
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -71,6 +71,13 @@ func WithRemoveRunner(err error) ClientOption {
|
|||
}
|
||||
}
|
||||
|
||||
// WithRemoveRunnerFunc configures a function to handle RemoveRunner calls dynamically
|
||||
func WithRemoveRunnerFunc(fn func(context.Context, int64) error) ClientOption {
|
||||
return func(c *Client) {
|
||||
c.removeRunnerFunc = fn
|
||||
}
|
||||
}
|
||||
|
||||
// WithGenerateJitRunnerConfig configures the result of GenerateJitRunnerConfig
|
||||
func WithGenerateJitRunnerConfig(result *scaleset.RunnerScaleSetJitRunnerConfig, err error) ClientOption {
|
||||
return func(c *Client) {
|
||||
|
|
@ -113,6 +120,7 @@ func WithUpdateRunnerScaleSetFunc(fn func(context.Context, int, *scaleset.Runner
|
|||
type Client struct {
|
||||
systemInfo scaleset.SystemInfo
|
||||
updateRunnerScaleSetFunc func(context.Context, int, *scaleset.RunnerScaleSet) (*scaleset.RunnerScaleSet, error)
|
||||
removeRunnerFunc func(context.Context, int64) error
|
||||
|
||||
getRunnerScaleSetResult struct {
|
||||
*scaleset.RunnerScaleSet
|
||||
|
|
@ -196,6 +204,9 @@ func (c *Client) GetRunnerByName(ctx context.Context, runnerName string) (*scale
|
|||
}
|
||||
|
||||
func (c *Client) RemoveRunner(ctx context.Context, runnerID int64) error {
|
||||
if c.removeRunnerFunc != nil {
|
||||
return c.removeRunnerFunc(ctx, runnerID)
|
||||
}
|
||||
return c.removeRunnerResult.err
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue