From 033647e3fa71637ba0199098e30b62b102a97cc0 Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Thu, 11 Dec 2025 14:26:55 +0100 Subject: [PATCH 1/4] add out of retry --- .../ephemeralrunner_controller.go | 103 +++++++++++------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 80a012be..e841861a 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -312,26 +312,44 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ cs := runnerContainerStatus(pod) switch { - case cs == nil: - // starting, no container state yet - log.Info("Waiting for runner container status to be available") - return ctrl.Result{}, nil - case cs.State.Terminated == nil: // still running or evicted - if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" { - log.Info("Pod set the termination phase, but container state is not terminated. Deleting pod", + case pod.Status.Phase == corev1.PodFailed: // All containers are stopped + switch { + case pod.Status.Reason == "Evicted": + log.Info("Pod evicted; Deleting ephemeral runner or pod", "PodPhase", pod.Status.Phase, "PodReason", pod.Status.Reason, "PodMessage", pod.Status.Message, ) - if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { - log.Error(err, "failed to delete pod as failed on pod.Status.Phase: Failed") + return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) + + case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue. + log.Info("Pod set the termination phase due to resource constraints, but container state is not terminated. Deleting ephemeral runner or pod", + "PodPhase", pod.Status.Phase, + "PodReason", pod.Status.Reason, + "PodMessage", pod.Status.Message, + ) + return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) + + default: + log.Info("Pod is in failed phase; updating ephemeral runner status", + "PodPhase", pod.Status.Phase, + "PodReason", pod.Status.Reason, + "PodMessage", pod.Status.Message, + ) + if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil { + log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") return ctrl.Result{}, err } return ctrl.Result{}, nil } - log.Info("Ephemeral runner container is still running") + case cs == nil: + // starting, no container state yet + log.Info("Waiting for runner container status to be available") + return ctrl.Result{}, nil + + case cs.State.Terminated == nil: // container is not terminated and pod phase is not faile< so runner is still running if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil { log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") return ctrl.Result{}, err @@ -340,36 +358,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ case cs.State.Terminated.ExitCode != 0: // failed log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode) - if ephemeralRunner.HasJob() { - log.Error( - errors.New("ephemeral runner has a job assigned, but the pod has failed"), - "Ephemeral runner either has faulty entrypoint or something external killing the runner", - ) - log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed") - if err := r.Delete(ctx, ephemeralRunner); err != nil { - log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed") - return ctrl.Result{}, err - } - - log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed") - log.Info("Trying to remove the runner from the service") - actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) - if err != nil { - log.Error(err, "Failed to get actions client for removing the runner from the service") - return ctrl.Result{}, nil - } - if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil { - log.Error(err, "Failed to remove the runner from the service") - return ctrl.Result{}, nil - } - log.Info("Removed the runner from the service") - return ctrl.Result{}, nil - } - if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { - log.Error(err, "Failed to delete runner pod on failure") - return ctrl.Result{}, err - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) default: // succeeded log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode) @@ -381,6 +370,40 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } +func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { + if ephemeralRunner.HasJob() { + log.Error( + errors.New("ephemeral runner has a job assigned, but the pod has failed"), + "Ephemeral runner either has faulty entrypoint or something external killing the runner", + ) + log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed") + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed") + return err + } + + log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed") + log.Info("Trying to remove the runner from the service") + actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) + if err != nil { + log.Error(err, "Failed to get actions client for removing the runner from the service") + return nil + } + if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil { + log.Error(err, "Failed to remove the runner from the service") + return nil + } + log.Info("Removed the runner from the service") + return nil + } + if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { + log.Error(err, "Failed to delete runner pod on failure") + return err + } + + return nil +} + func (r *EphemeralRunnerReconciler) cleanupRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ok bool, err error) { if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil { actionsError := &actions.ActionsError{} From 71e9f54fda595f6bc3ca5985afead260421cf6fb Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Thu, 11 Dec 2025 14:37:49 +0100 Subject: [PATCH 2/4] add out of test --- .../ephemeralrunner_controller_test.go | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index a19f272f..5a8e9b7f 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -745,6 +745,52 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo(true)) }) + It("It should re-create pod on OutOfsomething", func() { + pod := new(corev1.Pod) + Eventually( + func() (bool, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + if err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true)) + + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "OutOfpods" + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{}, + }) + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to patch pod status") + + updated := new(v1alpha1.EphemeralRunner) + Eventually(func() (bool, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return false, err + } + return len(updated.Status.Failures) == 1, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + + // should re-create after failure + Eventually( + func() (bool, error) { + pod := new(corev1.Pod) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true)) + }) + It("It should not set the phase to succeeded without pod termination status", func() { pod := new(corev1.Pod) Eventually( From 68c04fac00f776a9a84f48448a491d49fa970986 Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Thu, 11 Dec 2025 16:24:33 +0100 Subject: [PATCH 3/4] updates --- .../ephemeralrunner_controller.go | 21 ++++++++++--------- .../ephemeralrunner_controller_test.go | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index e841861a..a608f6d1 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -316,26 +316,26 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ switch { case pod.Status.Reason == "Evicted": log.Info("Pod evicted; Deleting ephemeral runner or pod", - "PodPhase", pod.Status.Phase, - "PodReason", pod.Status.Reason, - "PodMessage", pod.Status.Message, + "podPhase", pod.Status.Phase, + "podReason", pod.Status.Reason, + "podMessage", pod.Status.Message, ) return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue. log.Info("Pod set the termination phase due to resource constraints, but container state is not terminated. Deleting ephemeral runner or pod", - "PodPhase", pod.Status.Phase, - "PodReason", pod.Status.Reason, - "PodMessage", pod.Status.Message, + "podPhase", pod.Status.Phase, + "podReason", pod.Status.Reason, + "podMessage", pod.Status.Message, ) return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) default: log.Info("Pod is in failed phase; updating ephemeral runner status", - "PodPhase", pod.Status.Phase, - "PodReason", pod.Status.Reason, - "PodMessage", pod.Status.Message, + "podPhase", pod.Status.Phase, + "podReason", pod.Status.Reason, + "podMessage", pod.Status.Message, ) if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil { log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") @@ -349,7 +349,8 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ log.Info("Waiting for runner container status to be available") return ctrl.Result{}, nil - case cs.State.Terminated == nil: // container is not terminated and pod phase is not faile< so runner is still running + case cs.State.Terminated == nil: // container is not terminated and pod phase is not failed, so runner is still running + log.Info("Runner container is still running; updating ephemeral runner status") if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil { log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") return ctrl.Result{}, err diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 5a8e9b7f..129dcd17 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -745,7 +745,7 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo(true)) }) - It("It should re-create pod on OutOfsomething", func() { + It("It should re-create pod on reason starting with OutOf", func() { pod := new(corev1.Pod) Eventually( func() (bool, error) { From 7d3f8e0856070bd7287261ed26b28292dd37ab23 Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Fri, 12 Dec 2025 12:49:12 +0100 Subject: [PATCH 4/4] fix log message --- controllers/actions.github.com/ephemeralrunner_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index a608f6d1..40d25ad3 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -324,7 +324,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue. - log.Info("Pod set the termination phase due to resource constraints, but container state is not terminated. Deleting ephemeral runner or pod", + log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod", "podPhase", pod.Status.Phase, "podReason", pod.Status.Reason, "podMessage", pod.Status.Message,