Remove ephemeral runner when exit code != 0 and is patched with the job
This commit is contained in:
parent
0a0be027fd
commit
425c977a64
|
|
@ -50,6 +50,10 @@ func (er *EphemeralRunner) IsDone() bool {
|
|||
return er.Status.Phase == corev1.PodSucceeded || er.Status.Phase == corev1.PodFailed
|
||||
}
|
||||
|
||||
func (er *EphemeralRunner) HasJob() bool {
|
||||
return er.Status.JobRequestId != 0
|
||||
}
|
||||
|
||||
func (er *EphemeralRunner) HasContainerHookConfigured() bool {
|
||||
for i := range er.Spec.Spec.Containers {
|
||||
if er.Spec.Spec.Containers[i].Name != EphemeralRunnerContainerName {
|
||||
|
|
|
|||
|
|
@ -321,6 +321,18 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
|
||||
case cs.State.Terminated.ExitCode != 0: // failed
|
||||
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
|
||||
if ephemeralRunner.HasJob() {
|
||||
log.Error(
|
||||
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
|
||||
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
|
||||
)
|
||||
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
||||
log.Error(err, "Failed to delete runner pod on failure")
|
||||
return ctrl.Result{}, err
|
||||
|
|
@ -328,9 +340,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
return ctrl.Result{}, nil
|
||||
|
||||
default: // succeeded
|
||||
log.Info("Ephemeral runner has finished successfully")
|
||||
if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil {
|
||||
log.Error(err, "Failed to mark ephemeral runner as finished")
|
||||
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(err, "Failed to delete ephemeral runner after successful completion")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
|
|
@ -500,18 +512,6 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR
|
|||
return nil
|
||||
}
|
||||
|
||||
func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
||||
log.Info("Updating ephemeral runner status to Finished")
|
||||
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
||||
obj.Status.Phase = corev1.PodSucceeded
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err)
|
||||
}
|
||||
|
||||
log.Info("EphemeralRunner status is marked as Finished")
|
||||
return nil
|
||||
}
|
||||
|
||||
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
|
||||
// It should not be responsible for setting the status to Failed.
|
||||
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
|
||||
|
|
|
|||
|
|
@ -176,7 +176,7 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
).Should(BeEquivalentTo(ephemeralRunner.Name))
|
||||
})
|
||||
|
||||
It("It should re-create pod on failure", func() {
|
||||
It("It should re-create pod on failure and no job assigned", func() {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() (bool, error) {
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
|
||||
|
|
@ -200,6 +200,67 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
).Should(BeEquivalentTo(true))
|
||||
})
|
||||
|
||||
It("It should delete ephemeral runner on failure and job assigned", func() {
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
// Check if finalizer is added
|
||||
Eventually(
|
||||
func() error {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
return err
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
// update job id to simulate job assigned
|
||||
er.Status.JobRequestId = 1
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
er = new(v1alpha1.EphemeralRunner)
|
||||
Eventually(
|
||||
func() (int64, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return er.Status.JobRequestId, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(int64(1)))
|
||||
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() (bool, error) {
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}).Should(BeEquivalentTo(true))
|
||||
|
||||
// delete pod to simulate failure
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "Failed to update pod status")
|
||||
|
||||
er = new(v1alpha1.EphemeralRunner)
|
||||
Eventually(
|
||||
func() bool {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
return kerrors.IsNotFound(err)
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
|
||||
})
|
||||
|
||||
It("It should failed if a pod template is invalid", func() {
|
||||
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
|
||||
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"
|
||||
|
|
@ -208,13 +269,22 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
Expect(err).To(BeNil())
|
||||
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() (corev1.PodPhase, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
return updated.Status.Phase, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodFailed))
|
||||
Eventually(
|
||||
func() (corev1.PodPhase, error) {
|
||||
err := k8sClient.Get(
|
||||
ctx,
|
||||
client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace},
|
||||
updated,
|
||||
)
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
return updated.Status.Phase, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(corev1.PodFailed))
|
||||
|
||||
Expect(updated.Status.Reason).Should(Equal("InvalidPod"))
|
||||
Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found"))
|
||||
})
|
||||
|
|
@ -775,7 +845,7 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("It should set the Phase to Succeeded", func() {
|
||||
It("It should delete EphemeralRunner when pod exits successfully", func() {
|
||||
ephemeralRunner := newExampleRunner("test-runner", autoscalingNS.Name, configSecret.Name)
|
||||
|
||||
err := k8sClient.Create(ctx, ephemeralRunner)
|
||||
|
|
@ -801,13 +871,18 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
Expect(err).To(BeNil(), "failed to update pod status")
|
||||
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() (corev1.PodPhase, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
return updated.Status.Phase, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodSucceeded))
|
||||
Eventually(
|
||||
func() bool {
|
||||
err := k8sClient.Get(
|
||||
ctx,
|
||||
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
|
||||
updated,
|
||||
)
|
||||
return kerrors.IsNotFound(err)
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -453,7 +453,7 @@ func (r *EphemeralRunnerSetReconciler) deleteIdleEphemeralRunners(ctx context.Co
|
|||
continue
|
||||
}
|
||||
|
||||
if !isDone && ephemeralRunner.Status.JobRequestId > 0 {
|
||||
if !isDone && ephemeralRunner.HasJob() {
|
||||
log.Info("Skipping ephemeral runner since it is running a job", "name", ephemeralRunner.Name, "jobRequestId", ephemeralRunner.Status.JobRequestId)
|
||||
continue
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue