From cbbc383a8049d134dbba67bee667acf7df7bf814 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Mon, 7 Mar 2022 09:35:13 +0900 Subject: [PATCH] Auto-correct replicas number on missing webhook_job completion event (#1180) While testing #1179, I discovered that ARC sometimes stop resyncing RunnerReplicaSet when the desired replicas is greater than the actual number of runner pods. This seems to happen when ARC missed receiving a workflow_job completion event but it has no way to decide if it is either (1) something went wrong on ARC or (2) a loadbalancer in the middle or GitHub or anything not ARC went wrong. It needs a standard to decide it, or if it's not impossible, how to deal with it. In this change, I added a hard-coded 10 minutes timeout(can be made customizable later) to prevent runner pod recreation. Now, a RunnerReplicaSet/RunnerSet to restart runner pod recreation 10 minutes after the last scale-up. If the workflow completion event arrived after the timeout, it will decrease the desired replicas number that results in the removal of a runner pod. The removed runner pod might be deleted without ever being used, but I think that's better than leaving the desired replicas and the actual number of replicas diverged forever. --- controllers/constants.go | 12 ++++++++++++ controllers/runner_pod_owner.go | 32 +++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/controllers/constants.go b/controllers/constants.go index 9d967fb4..68e1db37 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -45,4 +45,16 @@ const ( registrationTimeout = 10 * time.Minute defaultRegistrationCheckInterval = time.Minute + + // DefaultRunnerPodRecreationDelayAfterWebhookScale is the delay until syncing the runners with the desired replicas + // after a webhook-based scale up. + // This is used to prevent ARC from recreating completed runner pods that are deleted soon without being used at all. + // In other words, this is used as a timer to wait for the completed runner to emit the next `workflow_job` webhook event to decrease the desired replicas. + // So if we set 30 seconds for this, you are basically saying that you would assume GitHub and your installation of ARC to + // emit and propagate a workflow_job completion event down to the RunnerSet or RunnerReplicaSet, vha ARC's github webhook server and HRA, in approximately 30 seconds. + // In case it actually took more than DefaultRunnerPodRecreationDelayAfterWebhookScale for the workflow_job completion event to arrive, + // ARC will recreate the completed runner(s), assuming something went wrong in either GitHub, your K8s cluster, or ARC, so ARC needs to resync anyway. + // + // See https://github.com/actions-runner-controller/actions-runner-controller/pull/1180 + DefaultRunnerPodRecreationDelayAfterWebhookScale = 10 * time.Minute ) diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go index 160fa0d8..7bc826c9 100644 --- a/controllers/runner_pod_owner.go +++ b/controllers/runner_pod_owner.go @@ -328,9 +328,31 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, maybeRunning := pending + running - if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { - log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running) - } else if newDesiredReplicas > maybeRunning { + wantMoreRunners := newDesiredReplicas > maybeRunning + alreadySyncedAfterEffectiveTime := lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) + runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale)) + + log = log.WithValues( + "lastSyncTime", lastSyncTime, + "effectiveTime", effectiveTime, + "templateHashDesired", desiredTemplateHash, + "replicasDesired", newDesiredReplicas, + "replicasPending", pending, + "replicasRunning", running, + "replicasMaybeRunning", maybeRunning, + "templateHashObserved", hashes, + ) + + if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale { + log.V(2).Info( + "Detected that some ephemeral runners have disappeared. " + + "Usually this is due to that ephemeral runner completions " + + "so ARC does not create new runners until EffectiveTime is updated, or DefaultRunnerPodRecreationDelayAfterWebhookScale is elapsed.") + } else if wantMoreRunners { + if alreadySyncedAfterEffectiveTime && !runnerPodRecreationDelayAfterWebhookScale { + log.V(2).Info("Adding more replicas because DefaultRunnerPodRecreationDelayAfterWebhookScale has been passed") + } + num := newDesiredReplicas - maybeRunning for i := 0; i < num; i++ { @@ -342,10 +364,6 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, log.V(1).Info("Created replica(s)", "created", num, - "templateHashDesired", desiredTemplateHash, - "replicasDesired", newDesiredReplicas, - "replicasMaybeRunning", maybeRunning, - "templateHashObserved", hashes, ) return nil, nil