From 425c977a64072425bfee72a275bf95d9cda90892 Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Fri, 5 Sep 2025 16:09:00 +0200 Subject: [PATCH] Remove ephemeral runner when exit code != 0 and is patched with the job --- .../v1alpha1/ephemeralrunner_types.go | 4 + .../ephemeralrunner_controller.go | 30 ++--- .../ephemeralrunner_controller_test.go | 107 +++++++++++++++--- .../ephemeralrunnerset_controller.go | 2 +- 4 files changed, 111 insertions(+), 32 deletions(-) diff --git a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go index 838996d7..114dbaa2 100644 --- a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go +++ b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go @@ -50,6 +50,10 @@ func (er *EphemeralRunner) IsDone() bool { return er.Status.Phase == corev1.PodSucceeded || er.Status.Phase == corev1.PodFailed } +func (er *EphemeralRunner) HasJob() bool { + return er.Status.JobRequestId != 0 +} + func (er *EphemeralRunner) HasContainerHookConfigured() bool { for i := range er.Spec.Spec.Containers { if er.Spec.Spec.Containers[i].Name != EphemeralRunnerContainerName { diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 39f3a16e..4da5173c 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -321,6 +321,18 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ case cs.State.Terminated.ExitCode != 0: // failed log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode) + if ephemeralRunner.HasJob() { + log.Error( + errors.New("ephemeral runner has a job assigned, but the pod has failed"), + "Ephemeral runner either has faulty entrypoint or something external killing the runner", + ) + log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed") + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed") + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { log.Error(err, "Failed to delete runner pod on failure") return ctrl.Result{}, err @@ -328,9 +340,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil default: // succeeded - log.Info("Ephemeral runner has finished successfully") - if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil { - log.Error(err, "Failed to mark ephemeral runner as finished") + log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete ephemeral runner after successful completion") return ctrl.Result{}, err } return ctrl.Result{}, nil @@ -500,18 +512,6 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR return nil } -func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { - log.Info("Updating ephemeral runner status to Finished") - if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { - obj.Status.Phase = corev1.PodSucceeded - }); err != nil { - return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err) - } - - log.Info("EphemeralRunner status is marked as Finished") - return nil -} - // deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count. // It should not be responsible for setting the status to Failed. func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 3aa6a8ea..563608c5 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -176,7 +176,7 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo(ephemeralRunner.Name)) }) - It("It should re-create pod on failure", func() { + It("It should re-create pod on failure and no job assigned", func() { pod := new(corev1.Pod) Eventually(func() (bool, error) { if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { @@ -200,6 +200,67 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo(true)) }) + It("It should delete ephemeral runner on failure and job assigned", func() { + er := new(v1alpha1.EphemeralRunner) + // Check if finalizer is added + Eventually( + func() error { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + return err + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get ephemeral runner") + + // update job id to simulate job assigned + er.Status.JobRequestId = 1 + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + er = new(v1alpha1.EphemeralRunner) + Eventually( + func() (int64, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + if err != nil { + return 0, err + } + return er.Status.JobRequestId, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(int64(1))) + + pod := new(corev1.Pod) + Eventually(func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }).Should(BeEquivalentTo(true)) + + // delete pod to simulate failure + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + er = new(v1alpha1.EphemeralRunner) + Eventually( + func() bool { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + return kerrors.IsNotFound(err) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") + }) + It("It should failed if a pod template is invalid", func() { invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name) invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist" @@ -208,13 +269,22 @@ var _ = Describe("EphemeralRunner", func() { Expect(err).To(BeNil()) updated := new(v1alpha1.EphemeralRunner) - Eventually(func() (corev1.PodPhase, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated) - if err != nil { - return "", nil - } - return updated.Status.Phase, nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodFailed)) + Eventually( + func() (corev1.PodPhase, error) { + err := k8sClient.Get( + ctx, + client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, + updated, + ) + if err != nil { + return "", nil + } + return updated.Status.Phase, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(corev1.PodFailed)) + Expect(updated.Status.Reason).Should(Equal("InvalidPod")) Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found")) }) @@ -775,7 +845,7 @@ var _ = Describe("EphemeralRunner", func() { startManagers(GinkgoT(), mgr) }) - It("It should set the Phase to Succeeded", func() { + It("It should delete EphemeralRunner when pod exits successfully", func() { ephemeralRunner := newExampleRunner("test-runner", autoscalingNS.Name, configSecret.Name) err := k8sClient.Create(ctx, ephemeralRunner) @@ -801,13 +871,18 @@ var _ = Describe("EphemeralRunner", func() { Expect(err).To(BeNil(), "failed to update pod status") updated := new(v1alpha1.EphemeralRunner) - Eventually(func() (corev1.PodPhase, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) - if err != nil { - return "", nil - } - return updated.Status.Phase, nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodSucceeded)) + Eventually( + func() bool { + err := k8sClient.Get( + ctx, + client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, + updated, + ) + return kerrors.IsNotFound(err) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue()) }) }) diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller.go b/controllers/actions.github.com/ephemeralrunnerset_controller.go index 2a09a1c5..ee766ef1 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller.go @@ -453,7 +453,7 @@ func (r *EphemeralRunnerSetReconciler) deleteIdleEphemeralRunners(ctx context.Co continue } - if !isDone && ephemeralRunner.Status.JobRequestId > 0 { + if !isDone && ephemeralRunner.HasJob() { log.Info("Skipping ephemeral runner since it is running a job", "name", ephemeralRunner.Name, "jobRequestId", ephemeralRunner.Status.JobRequestId) continue }