From b08d533105b7ad1d55fe6fa611d84dce9eb5748f Mon Sep 17 00:00:00 2001 From: Toru Komatsu Date: Fri, 8 Dec 2023 05:11:52 +0900 Subject: [PATCH] Record the error when the creation pod fails (#3112) Signed-off-by: utam0k --- controllers/actions.github.com/constants.go | 6 +++++ .../ephemeralrunner_controller.go | 23 +++++++++++++++---- .../ephemeralrunner_controller_test.go | 19 +++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/controllers/actions.github.com/constants.go b/controllers/actions.github.com/constants.go index 6484f9e3..a9fcb727 100644 --- a/controllers/actions.github.com/constants.go +++ b/controllers/actions.github.com/constants.go @@ -66,3 +66,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText) // ownerKey is field selector matching the owner name of a particular resource const resourceOwnerKey = ".metadata.controller" + +// EphemeralRunner pod creation failure reasons +const ( + ReasonTooManyPodFailures = "TooManyPodFailures" + ReasonInvalidPodFailure = "InvalidPod" +) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index cbc8a372..f8ce3635 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -192,7 +192,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ case len(ephemeralRunner.Status.Failures) > 5: log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed") errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message) - if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, log); err != nil { + if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil { log.Error(err, "Failed to set ephemeral runner to phase Failed") return ctrl.Result{}, err } @@ -201,7 +201,22 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ default: // Pod was not found. Create if the pod has never been created log.Info("Creating new EphemeralRunner pod.") - return r.createPod(ctx, ephemeralRunner, secret, log) + result, err := r.createPod(ctx, ephemeralRunner, secret, log) + switch { + case err == nil: + return result, nil + case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): + log.Error(err, "Failed to create a pod due to unrecoverable failure") + errMessage := fmt.Sprintf("Failed to create the pod: %v", err) + if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { + log.Error(err, "Failed to set ephemeral runner to phase Failed") + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + default: + log.Error(err, "Failed to create the pod") + return ctrl.Result{}, err + } } } @@ -424,11 +439,11 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte return false, multierr.Combine(errs...) } -func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, log logr.Logger) error { +func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error { log.Info("Updating ephemeral runner status to Failed") if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { obj.Status.Phase = corev1.PodFailed - obj.Status.Reason = "TooManyPodFailures" + obj.Status.Reason = reason obj.Status.Message = errMessage }); err != nil { return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %v", err) diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 32704d7b..4cc6ffde 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -189,6 +189,25 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo(true)) }) + It("It should failed if a pod template is invalid", func() { + invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name) + invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist" + + err := k8sClient.Create(ctx, invalideEphemeralRunner) + Expect(err).To(BeNil()) + + updated := new(v1alpha1.EphemeralRunner) + Eventually(func() (corev1.PodPhase, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated) + if err != nil { + return "", nil + } + return updated.Status.Phase, nil + }, timeout, interval).Should(BeEquivalentTo(corev1.PodFailed)) + Expect(updated.Status.Reason).Should(Equal("InvalidPod")) + Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found")) + }) + It("It should clean up resources when deleted", func() { // wait for pod to be created pod := new(corev1.Pod)