From 0f2a659878884cd9af613193e6c89defdb55f21a Mon Sep 17 00:00:00 2001 From: Junya Okabe <86868255+Okabe-Junya@users.noreply.github.com> Date: Thu, 7 May 2026 20:26:46 +0900 Subject: [PATCH] Fix: Detect init container failure in EphemeralRunner controller (#4457) --- .../ephemeralrunner_controller.go | 16 +++ .../ephemeralrunner_controller_test.go | 109 ++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 616c0e04..ee5e2c87 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -351,6 +351,12 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ ) return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) + case initContainerFailed(pod): + log.Info("Pod has a failed init container, deleting pod as failed so it can be restarted", + "initContainerStatuses", pod.Status.InitContainerStatuses, + ) + return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) + case cs == nil: // starting, no container state yet log.Info("Waiting for runner container status to be available") @@ -862,3 +868,13 @@ func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus { } return nil } + +func initContainerFailed(pod *corev1.Pod) bool { + for i := range pod.Status.InitContainerStatuses { + cs := &pod.Status.InitContainerStatuses[i] + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + return true + } + } + return false +} diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 559d9469..6b92a0ee 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -355,6 +355,115 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeTrue(), "Pod should be re-created") }) + It("It should re-create pod when init container fails before pod phase transitions to Failed", func() { + pod := new(corev1.Pod) + Eventually(func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + + oldPodUID := pod.UID + + // Simulate init container failure without PodFailed phase. + // This can happen when the kubelet has not yet transitioned the pod phase. + pod.Status.Phase = corev1.PodPending + pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ + { + Name: "setup", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "StartError", + Message: "failed to create containerd task: context canceled", + }, + }, + }, + } + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get( + ctx, + client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, + updated, + ) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(1)) + + Eventually( + func() (bool, error) { + newPod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod) + if err != nil { + return false, err + } + return newPod.UID != oldPodUID, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Pod should be re-created after init container failure") + }) + + It("It should delete ephemeral runner when init container fails and job is assigned", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "1" + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1")) + + pod := new(corev1.Pod) + Eventually(func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + + // Simulate init container failure with job assigned + pod.Status.Phase = corev1.PodPending + pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ + { + Name: "setup", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "StartError", + }, + }, + }, + } + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually(func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted when init container fails with job assigned") + }) + It("It should treat pod failed with runner container exit 0 as success with job id", func() { er := new(v1alpha1.EphemeralRunner) Eventually(func() error {