Remove ephemeral runner when exit code != 0 and is patched with the job
This commit is contained in:
		
							parent
							
								
									0a0be027fd
								
							
						
					
					
						commit
						425c977a64
					
				|  | @ -50,6 +50,10 @@ func (er *EphemeralRunner) IsDone() bool { | |||
| 	return er.Status.Phase == corev1.PodSucceeded || er.Status.Phase == corev1.PodFailed | ||||
| } | ||||
| 
 | ||||
| func (er *EphemeralRunner) HasJob() bool { | ||||
| 	return er.Status.JobRequestId != 0 | ||||
| } | ||||
| 
 | ||||
| func (er *EphemeralRunner) HasContainerHookConfigured() bool { | ||||
| 	for i := range er.Spec.Spec.Containers { | ||||
| 		if er.Spec.Spec.Containers[i].Name != EphemeralRunnerContainerName { | ||||
|  |  | |||
|  | @ -321,6 +321,18 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ | |||
| 
 | ||||
| 	case cs.State.Terminated.ExitCode != 0: // failed
 | ||||
| 		log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode) | ||||
| 		if ephemeralRunner.HasJob() { | ||||
| 			log.Error( | ||||
| 				errors.New("ephemeral runner has a job assigned, but the pod has failed"), | ||||
| 				"Ephemeral runner either has faulty entrypoint or something external killing the runner", | ||||
| 			) | ||||
| 			log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed") | ||||
| 			if err := r.Delete(ctx, ephemeralRunner); err != nil { | ||||
| 				log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed") | ||||
| 				return ctrl.Result{}, err | ||||
| 			} | ||||
| 			return ctrl.Result{}, nil | ||||
| 		} | ||||
| 		if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { | ||||
| 			log.Error(err, "Failed to delete runner pod on failure") | ||||
| 			return ctrl.Result{}, err | ||||
|  | @ -328,9 +340,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ | |||
| 		return ctrl.Result{}, nil | ||||
| 
 | ||||
| 	default: // succeeded
 | ||||
| 		log.Info("Ephemeral runner has finished successfully") | ||||
| 		if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil { | ||||
| 			log.Error(err, "Failed to mark ephemeral runner as finished") | ||||
| 		log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode) | ||||
| 		if err := r.Delete(ctx, ephemeralRunner); err != nil { | ||||
| 			log.Error(err, "Failed to delete ephemeral runner after successful completion") | ||||
| 			return ctrl.Result{}, err | ||||
| 		} | ||||
| 		return ctrl.Result{}, nil | ||||
|  | @ -500,18 +512,6 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR | |||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { | ||||
| 	log.Info("Updating ephemeral runner status to Finished") | ||||
| 	if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { | ||||
| 		obj.Status.Phase = corev1.PodSucceeded | ||||
| 	}); err != nil { | ||||
| 		return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err) | ||||
| 	} | ||||
| 
 | ||||
| 	log.Info("EphemeralRunner status is marked as Finished") | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| // deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
 | ||||
| // It should not be responsible for setting the status to Failed.
 | ||||
| func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { | ||||
|  |  | |||
|  | @ -176,7 +176,7 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			).Should(BeEquivalentTo(ephemeralRunner.Name)) | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should re-create pod on failure", func() { | ||||
| 		It("It should re-create pod on failure and no job assigned", func() { | ||||
| 			pod := new(corev1.Pod) | ||||
| 			Eventually(func() (bool, error) { | ||||
| 				if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { | ||||
|  | @ -200,6 +200,67 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			).Should(BeEquivalentTo(true)) | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should delete ephemeral runner on failure and job assigned", func() { | ||||
| 			er := new(v1alpha1.EphemeralRunner) | ||||
| 			// Check if finalizer is added
 | ||||
| 			Eventually( | ||||
| 				func() error { | ||||
| 					err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) | ||||
| 					return err | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(Succeed(), "failed to get ephemeral runner") | ||||
| 
 | ||||
| 			// update job id to simulate job assigned
 | ||||
| 			er.Status.JobRequestId = 1 | ||||
| 			err := k8sClient.Status().Update(ctx, er) | ||||
| 			Expect(err).To(BeNil(), "failed to update ephemeral runner status") | ||||
| 
 | ||||
| 			er = new(v1alpha1.EphemeralRunner) | ||||
| 			Eventually( | ||||
| 				func() (int64, error) { | ||||
| 					err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) | ||||
| 					if err != nil { | ||||
| 						return 0, err | ||||
| 					} | ||||
| 					return er.Status.JobRequestId, nil | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeEquivalentTo(int64(1))) | ||||
| 
 | ||||
| 			pod := new(corev1.Pod) | ||||
| 			Eventually(func() (bool, error) { | ||||
| 				if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { | ||||
| 					return false, err | ||||
| 				} | ||||
| 				return true, nil | ||||
| 			}).Should(BeEquivalentTo(true)) | ||||
| 
 | ||||
| 			// delete pod to simulate failure
 | ||||
| 			pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ | ||||
| 				Name: v1alpha1.EphemeralRunnerContainerName, | ||||
| 				State: corev1.ContainerState{ | ||||
| 					Terminated: &corev1.ContainerStateTerminated{ | ||||
| 						ExitCode: 1, | ||||
| 					}, | ||||
| 				}, | ||||
| 			}) | ||||
| 			err = k8sClient.Status().Update(ctx, pod) | ||||
| 			Expect(err).To(BeNil(), "Failed to update pod status") | ||||
| 
 | ||||
| 			er = new(v1alpha1.EphemeralRunner) | ||||
| 			Eventually( | ||||
| 				func() bool { | ||||
| 					err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) | ||||
| 					return kerrors.IsNotFound(err) | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeTrue(), "Ephemeral runner should eventually be deleted") | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should failed if a pod template is invalid", func() { | ||||
| 			invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name) | ||||
| 			invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist" | ||||
|  | @ -208,13 +269,22 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			Expect(err).To(BeNil()) | ||||
| 
 | ||||
| 			updated := new(v1alpha1.EphemeralRunner) | ||||
| 			Eventually(func() (corev1.PodPhase, error) { | ||||
| 				err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated) | ||||
| 				if err != nil { | ||||
| 					return "", nil | ||||
| 				} | ||||
| 				return updated.Status.Phase, nil | ||||
| 			}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodFailed)) | ||||
| 			Eventually( | ||||
| 				func() (corev1.PodPhase, error) { | ||||
| 					err := k8sClient.Get( | ||||
| 						ctx, | ||||
| 						client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, | ||||
| 						updated, | ||||
| 					) | ||||
| 					if err != nil { | ||||
| 						return "", nil | ||||
| 					} | ||||
| 					return updated.Status.Phase, nil | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeEquivalentTo(corev1.PodFailed)) | ||||
| 
 | ||||
| 			Expect(updated.Status.Reason).Should(Equal("InvalidPod")) | ||||
| 			Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found")) | ||||
| 		}) | ||||
|  | @ -775,7 +845,7 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			startManagers(GinkgoT(), mgr) | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should set the Phase to Succeeded", func() { | ||||
| 		It("It should delete EphemeralRunner when pod exits successfully", func() { | ||||
| 			ephemeralRunner := newExampleRunner("test-runner", autoscalingNS.Name, configSecret.Name) | ||||
| 
 | ||||
| 			err := k8sClient.Create(ctx, ephemeralRunner) | ||||
|  | @ -801,13 +871,18 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			Expect(err).To(BeNil(), "failed to update pod status") | ||||
| 
 | ||||
| 			updated := new(v1alpha1.EphemeralRunner) | ||||
| 			Eventually(func() (corev1.PodPhase, error) { | ||||
| 				err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 				if err != nil { | ||||
| 					return "", nil | ||||
| 				} | ||||
| 				return updated.Status.Phase, nil | ||||
| 			}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodSucceeded)) | ||||
| 			Eventually( | ||||
| 				func() bool { | ||||
| 					err := k8sClient.Get( | ||||
| 						ctx, | ||||
| 						client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, | ||||
| 						updated, | ||||
| 					) | ||||
| 					return kerrors.IsNotFound(err) | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeTrue()) | ||||
| 		}) | ||||
| 	}) | ||||
| 
 | ||||
|  |  | |||
|  | @ -453,7 +453,7 @@ func (r *EphemeralRunnerSetReconciler) deleteIdleEphemeralRunners(ctx context.Co | |||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		if !isDone && ephemeralRunner.Status.JobRequestId > 0 { | ||||
| 		if !isDone && ephemeralRunner.HasJob() { | ||||
| 			log.Info("Skipping ephemeral runner since it is running a job", "name", ephemeralRunner.Name, "jobRequestId", ephemeralRunner.Status.JobRequestId) | ||||
| 			continue | ||||
| 		} | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue