Remove ephemeral runner when exit code != 0 and is patched with the job

This commit is contained in:
Nikola Jokic 2025-09-05 16:09:00 +02:00
parent 0a0be027fd
commit 425c977a64
No known key found for this signature in database
GPG Key ID: E4104494F9B8DDF6
4 changed files with 111 additions and 32 deletions

View File

@ -50,6 +50,10 @@ func (er *EphemeralRunner) IsDone() bool {
return er.Status.Phase == corev1.PodSucceeded || er.Status.Phase == corev1.PodFailed
}
func (er *EphemeralRunner) HasJob() bool {
return er.Status.JobRequestId != 0
}
func (er *EphemeralRunner) HasContainerHookConfigured() bool {
for i := range er.Spec.Spec.Containers {
if er.Spec.Spec.Containers[i].Name != EphemeralRunnerContainerName {

View File

@ -321,6 +321,18 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
case cs.State.Terminated.ExitCode != 0: // failed
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
if ephemeralRunner.HasJob() {
log.Error(
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
)
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
log.Error(err, "Failed to delete runner pod on failure")
return ctrl.Result{}, err
@ -328,9 +340,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, nil
default: // succeeded
log.Info("Ephemeral runner has finished successfully")
if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to mark ephemeral runner as finished")
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete ephemeral runner after successful completion")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
@ -500,18 +512,6 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR
return nil
}
func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
log.Info("Updating ephemeral runner status to Finished")
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
obj.Status.Phase = corev1.PodSucceeded
}); err != nil {
return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err)
}
log.Info("EphemeralRunner status is marked as Finished")
return nil
}
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
// It should not be responsible for setting the status to Failed.
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {

View File

@ -176,7 +176,7 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo(ephemeralRunner.Name))
})
It("It should re-create pod on failure", func() {
It("It should re-create pod on failure and no job assigned", func() {
pod := new(corev1.Pod)
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
@ -200,6 +200,67 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo(true))
})
It("It should delete ephemeral runner on failure and job assigned", func() {
er := new(v1alpha1.EphemeralRunner)
// Check if finalizer is added
Eventually(
func() error {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
return err
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Succeed(), "failed to get ephemeral runner")
// update job id to simulate job assigned
er.Status.JobRequestId = 1
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
er = new(v1alpha1.EphemeralRunner)
Eventually(
func() (int64, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
if err != nil {
return 0, err
}
return er.Status.JobRequestId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(int64(1)))
pod := new(corev1.Pod)
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
}).Should(BeEquivalentTo(true))
// delete pod to simulate failure
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status")
er = new(v1alpha1.EphemeralRunner)
Eventually(
func() bool {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
return kerrors.IsNotFound(err)
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
It("It should failed if a pod template is invalid", func() {
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"
@ -208,13 +269,22 @@ var _ = Describe("EphemeralRunner", func() {
Expect(err).To(BeNil())
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (corev1.PodPhase, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodFailed))
Eventually(
func() (corev1.PodPhase, error) {
err := k8sClient.Get(
ctx,
client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace},
updated,
)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(corev1.PodFailed))
Expect(updated.Status.Reason).Should(Equal("InvalidPod"))
Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found"))
})
@ -775,7 +845,7 @@ var _ = Describe("EphemeralRunner", func() {
startManagers(GinkgoT(), mgr)
})
It("It should set the Phase to Succeeded", func() {
It("It should delete EphemeralRunner when pod exits successfully", func() {
ephemeralRunner := newExampleRunner("test-runner", autoscalingNS.Name, configSecret.Name)
err := k8sClient.Create(ctx, ephemeralRunner)
@ -801,13 +871,18 @@ var _ = Describe("EphemeralRunner", func() {
Expect(err).To(BeNil(), "failed to update pod status")
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (corev1.PodPhase, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodSucceeded))
Eventually(
func() bool {
err := k8sClient.Get(
ctx,
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
updated,
)
return kerrors.IsNotFound(err)
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue())
})
})

View File

@ -453,7 +453,7 @@ func (r *EphemeralRunnerSetReconciler) deleteIdleEphemeralRunners(ctx context.Co
continue
}
if !isDone && ephemeralRunner.Status.JobRequestId > 0 {
if !isDone && ephemeralRunner.HasJob() {
log.Info("Skipping ephemeral runner since it is running a job", "name", ephemeralRunner.Name, "jobRequestId", ephemeralRunner.Status.JobRequestId)
continue
}