Auto-correct replicas number on missing webhook_job completion event (#1180)
While testing #1179, I discovered that ARC sometimes stop resyncing RunnerReplicaSet when the desired replicas is greater than the actual number of runner pods. This seems to happen when ARC missed receiving a workflow_job completion event but it has no way to decide if it is either (1) something went wrong on ARC or (2) a loadbalancer in the middle or GitHub or anything not ARC went wrong. It needs a standard to decide it, or if it's not impossible, how to deal with it. In this change, I added a hard-coded 10 minutes timeout(can be made customizable later) to prevent runner pod recreation. Now, a RunnerReplicaSet/RunnerSet to restart runner pod recreation 10 minutes after the last scale-up. If the workflow completion event arrived after the timeout, it will decrease the desired replicas number that results in the removal of a runner pod. The removed runner pod might be deleted without ever being used, but I think that's better than leaving the desired replicas and the actual number of replicas diverged forever.
This commit is contained in:
		
							parent
							
								
									b57e885a73
								
							
						
					
					
						commit
						cbbc383a80
					
				|  | @ -45,4 +45,16 @@ const ( | |||
| 	registrationTimeout = 10 * time.Minute | ||||
| 
 | ||||
| 	defaultRegistrationCheckInterval = time.Minute | ||||
| 
 | ||||
| 	// DefaultRunnerPodRecreationDelayAfterWebhookScale is the delay until syncing the runners with the desired replicas
 | ||||
| 	// after a webhook-based scale up.
 | ||||
| 	// This is used to prevent ARC from recreating completed runner pods that are deleted soon without being used at all.
 | ||||
| 	// In other words, this is used as a timer to wait for the completed runner to emit the next `workflow_job` webhook event to decrease the desired replicas.
 | ||||
| 	// So if we set 30 seconds for this, you are basically saying that you would assume GitHub and your installation of ARC to
 | ||||
| 	// emit and propagate a workflow_job completion event down to the RunnerSet or RunnerReplicaSet, vha ARC's github webhook server and HRA, in approximately 30 seconds.
 | ||||
| 	// In case it actually took more than DefaultRunnerPodRecreationDelayAfterWebhookScale for the workflow_job completion event to arrive,
 | ||||
| 	// ARC will recreate the completed runner(s), assuming something went wrong in either GitHub, your K8s cluster, or ARC, so ARC needs to resync anyway.
 | ||||
| 	//
 | ||||
| 	// See https://github.com/actions-runner-controller/actions-runner-controller/pull/1180
 | ||||
| 	DefaultRunnerPodRecreationDelayAfterWebhookScale = 10 * time.Minute | ||||
| ) | ||||
|  |  | |||
|  | @ -328,9 +328,31 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, | |||
| 
 | ||||
| 	maybeRunning := pending + running | ||||
| 
 | ||||
| 	if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { | ||||
| 		log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running) | ||||
| 	} else if newDesiredReplicas > maybeRunning { | ||||
| 	wantMoreRunners := newDesiredReplicas > maybeRunning | ||||
| 	alreadySyncedAfterEffectiveTime := lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) | ||||
| 	runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale)) | ||||
| 
 | ||||
| 	log = log.WithValues( | ||||
| 		"lastSyncTime", lastSyncTime, | ||||
| 		"effectiveTime", effectiveTime, | ||||
| 		"templateHashDesired", desiredTemplateHash, | ||||
| 		"replicasDesired", newDesiredReplicas, | ||||
| 		"replicasPending", pending, | ||||
| 		"replicasRunning", running, | ||||
| 		"replicasMaybeRunning", maybeRunning, | ||||
| 		"templateHashObserved", hashes, | ||||
| 	) | ||||
| 
 | ||||
| 	if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale { | ||||
| 		log.V(2).Info( | ||||
| 			"Detected that some ephemeral runners have disappeared. " + | ||||
| 				"Usually this is due to that ephemeral runner completions " + | ||||
| 				"so ARC does not create new runners until EffectiveTime is updated, or DefaultRunnerPodRecreationDelayAfterWebhookScale is elapsed.") | ||||
| 	} else if wantMoreRunners { | ||||
| 		if alreadySyncedAfterEffectiveTime && !runnerPodRecreationDelayAfterWebhookScale { | ||||
| 			log.V(2).Info("Adding more replicas because DefaultRunnerPodRecreationDelayAfterWebhookScale has been passed") | ||||
| 		} | ||||
| 
 | ||||
| 		num := newDesiredReplicas - maybeRunning | ||||
| 
 | ||||
| 		for i := 0; i < num; i++ { | ||||
|  | @ -342,10 +364,6 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, | |||
| 
 | ||||
| 		log.V(1).Info("Created replica(s)", | ||||
| 			"created", num, | ||||
| 			"templateHashDesired", desiredTemplateHash, | ||||
| 			"replicasDesired", newDesiredReplicas, | ||||
| 			"replicasMaybeRunning", maybeRunning, | ||||
| 			"templateHashObserved", hashes, | ||||
| 		) | ||||
| 
 | ||||
| 		return nil, nil | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue