Record the error when the creation pod fails (#3112)

Signed-off-by: utam0k <k0ma@utam0k.jp>
This commit is contained in:
Toru Komatsu 2023-12-08 05:11:52 +09:00 committed by GitHub
parent 0bfa57ac50
commit b08d533105
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 44 additions and 4 deletions

View File

@ -66,3 +66,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText)
// ownerKey is field selector matching the owner name of a particular resource
const resourceOwnerKey = ".metadata.controller"
// EphemeralRunner pod creation failure reasons
const (
ReasonTooManyPodFailures = "TooManyPodFailures"
ReasonInvalidPodFailure = "InvalidPod"
)

View File

@ -192,7 +192,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
case len(ephemeralRunner.Status.Failures) > 5:
log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed")
errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, log); err != nil {
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err
}
@ -201,7 +201,22 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
default:
// Pod was not found. Create if the pod has never been created
log.Info("Creating new EphemeralRunner pod.")
return r.createPod(ctx, ephemeralRunner, secret, log)
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
switch {
case err == nil:
return result, nil
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
log.Error(err, "Failed to create a pod due to unrecoverable failure")
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
default:
log.Error(err, "Failed to create the pod")
return ctrl.Result{}, err
}
}
}
@ -424,11 +439,11 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte
return false, multierr.Combine(errs...)
}
func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, log logr.Logger) error {
func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error {
log.Info("Updating ephemeral runner status to Failed")
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
obj.Status.Phase = corev1.PodFailed
obj.Status.Reason = "TooManyPodFailures"
obj.Status.Reason = reason
obj.Status.Message = errMessage
}); err != nil {
return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %v", err)

View File

@ -189,6 +189,25 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo(true))
})
It("It should failed if a pod template is invalid", func() {
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"
err := k8sClient.Create(ctx, invalideEphemeralRunner)
Expect(err).To(BeNil())
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (corev1.PodPhase, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
}, timeout, interval).Should(BeEquivalentTo(corev1.PodFailed))
Expect(updated.Status.Reason).Should(Equal("InvalidPod"))
Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found"))
})
It("It should clean up resources when deleted", func() {
// wait for pod to be created
pod := new(corev1.Pod)