From 4551309e30a1fb1ba43b9031b7f7f6467e9003fd Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 13:09:46 +0000 Subject: [PATCH] Fix runners to not terminate before unregistration when scaling down #1179 was not working particularly for scale down of static (and perhaps long-running ephemeral) runners, which resulted in some runner pods are terminated before the requested unregistration processes complete, that triggered some in-progress workflow jobs to hang forever. This fixes an edge-case that resulted in a decreased desired replicas to trigger the failure, so that every runner is unregistered then terminated, as originally designed. --- controllers/runner_pod_owner.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go index c505db78..abc5020a 100644 --- a/controllers/runner_pod_owner.go +++ b/controllers/runner_pod_owner.go @@ -429,24 +429,26 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, // or actions/runner may potentially misbehave on SIGTERM immediately sent by K8s. // We'd better unregister first and then start a pod deletion process. // The annotation works as a mark to start the pod unregistration and deletion process of ours. + + if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); ok { + log.V(2).Info("Still waiting for runner pod(s) unregistration to complete") + + continue + } + for _, po := range ss.pods { if _, err := annotatePodOnce(ctx, c, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil { return nil, err } } - if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok { - updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) - - if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { - log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) - return nil, err - } - - log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") - } else { - log.V(2).Info("BUG: Redundant object was already annotated") + updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) + if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) + return nil, err } + + log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") } } else if retained > newDesiredReplicas { log.V(2).Info("Waiting sync before scale down", "retained", retained, "newDesiredReplicas", newDesiredReplicas) @@ -555,9 +557,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, } else { log.V(2).Info("BUG: Redundant owner was already annotated to start the deletion") } - } - continue + continue + } } if annotations := res.owner.GetAnnotations(); annotations != nil {