Merge pull request #1214 from actions-runner-controller/fix-static-runners

Fix runner{set,deployment} rollouts and static runner scaling

I was testing static runners in preparation for cutting the next release of ARC, v0.22.0, and found several problems that I thought were worth fixing.

In particular, this PR fixes static runner reliability issues in two ways.

c4b24f8366 fixes the issue that ARC gave up retrying RemoveRunner calls too early, especially for static runners, which resulted in static runners often being terminated prematurely while running jobs.
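
The essence of that change, reflected in the diffs below, is that the unregistration path now keeps requeuing the reconciliation instead of racing against a fixed `DefaultUnregistrationTimeout`. Here is a minimal sketch of that requeue-until-removed pattern; `RunnerRemover` and `ensureUnregistered` are hypothetical names used only for illustration, not ARC's actual API:

```go
package sketch

import (
	"context"
	"time"

	"github.com/go-logr/logr"
	ctrl "sigs.k8s.io/controller-runtime"
)

// RunnerRemover is a hypothetical stand-in for ARC's GitHub client.
type RunnerRemover interface {
	RemoveRunner(ctx context.Context, id int64) error
}

// ensureUnregistered keeps requeuing until RemoveRunner succeeds rather than
// giving up after a fixed unregistration timeout. A non-nil *ctrl.Result means
// "not done yet, retry after retryDelay"; a nil result means the runner pod is
// now safe to delete.
func ensureUnregistered(ctx context.Context, gh RunnerRemover, id int64, retryDelay time.Duration, log logr.Logger) (*ctrl.Result, error) {
	if err := gh.RemoveRunner(ctx, id); err != nil {
		// A static runner that is busy running a job can fail removal for a long
		// time; requeue rather than terminating it prematurely.
		log.Info("RemoveRunner failed, will retry", "error", err)
		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	}
	return nil, nil
}
```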

791634fb12 fixes the issue that ARC was unable to scale up static runners when the corresponding desired replicas number in e.g. RunnerDeployment was updated. It was caused by a bug in the mechanism intended to prevent ephemeral runners from being recreated in unwanted circumstances, which was mistakenly triggered for static runners as well.
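
The relevant guard is the `alreadySyncedAfterEffectiveTime` condition in `syncRunnerPodsOwners` (see the hunk further down); the fix makes it apply to ephemeral runners only. A condensed sketch of that condition, with simplified types and a hypothetical `shouldSkipRecreation` helper:

```go
package sketch

import "time"

// shouldSkipRecreation reproduces, in simplified form, the guard that stops ARC
// from recreating runner pods right after a webhook-driven scale operation.
// Taking the ephemeral flag into account is the fix: for static runners the
// guard must never fire, because static runners never restart on their own and
// their EffectiveTime is never updated, so the condition would otherwise block
// every scale-up attempt.
func shouldSkipRecreation(ephemeral, wantMoreRunners bool, lastSync, effective *time.Time, recreationDelay time.Duration) bool {
	alreadySyncedAfterEffectiveTime := ephemeral &&
		lastSync != nil && effective != nil && lastSync.After(*effective)
	withinRecreationDelay := lastSync != nil &&
		time.Now().Before(lastSync.Add(recreationDelay))
	return wantMoreRunners && alreadySyncedAfterEffectiveTime && withinRecreationDelay
}
```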

Since #1179, RunnerDeployment has not been able to gracefully terminate old RunnerReplicaSets on update. c612e87 fixes that by changing RunnerDeployment to first scale old RunnerReplicaSet(s) down to zero, wait for the sync, and only then set the deletion timestamp. That way, RunnerDeployment can ensure that every old RunnerReplicaSet being deleted has already been scaled to zero and its runners have gone through the standard unregister-then-delete termination process.
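
A condensed sketch of that two-phase cleanup, mirroring the RunnerDeployment reconciler hunk near the end of this diff; `ReplicaSetLike` and `cleanupOldSets` are simplified stand-ins, not the actual ARC types:

```go
package sketch

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ReplicaSetLike is a simplified stand-in for v1alpha1.RunnerReplicaSet.
type ReplicaSetLike interface {
	client.Object
	SpecReplicas() *int
	StatusReplicas() *int
	SetSpecReplicas(n int)
}

// cleanupOldSets scales every old replica set down to zero first and deletes it
// only once its status reports zero replicas, so each runner goes through the
// standard unregister-then-delete termination path instead of being killed
// together with its RunnerReplicaSet.
func cleanupOldSets(ctx context.Context, c client.Client, oldSets []ReplicaSetLike) error {
	for _, rs := range oldSets {
		if st := rs.StatusReplicas(); st != nil && *st > 0 {
			if sp := rs.SpecReplicas(); sp != nil && *sp == 0 {
				// Already asked to scale down; wait for the status to catch up.
				continue
			}
			rs.SetSpecReplicas(0)
			if err := c.Update(ctx, rs); err != nil {
				return err
			}
			continue
		}
		// Status replicas is zero (or unset): safe to delete now.
		if err := c.Delete(ctx, rs); err != nil {
			return err
		}
	}
	return nil
}
```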

This revealed a hidden bug in #1179: sometimes the scale-to-zero-before-RunnerReplicaSet-termination step did not work as intended. 4551309 fixes that, so that RunnerDeployment can actually terminate old RunnerReplicaSets gracefully.
Yusuke Kuoka 2022-03-13 22:19:26 +09:00 committed by GitHub
commit e7200f274d
11 changed files with 132 additions and 77 deletions

View File

@ -1320,7 +1320,9 @@ Once the ephemeral runner has completed running a workflow job, it stops with a
As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you a clean environment per each workflow job.
Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may worth trying only when you do want to prioritize job run speed more than job reliability and security.
Although not recommended, it's possible to disable passing the `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after a workflow job run, and in this mode `actions/runner` is known to clean only the runner's work dir after each job. That means your runner's environment, including various actions caches, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs.
Static runners may be worth trying only when you want to prioritize job run speed over job reliability and security.
> In early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`.
>

View File

@ -17,6 +17,8 @@ spec:
image: ${RUNNER_NAME}:${RUNNER_TAG}
imagePullPolicy: IfNotPresent
ephemeral: ${TEST_EPHEMERAL}
# Whether to pass --ephemeral (true) or --once (false, deprecated)
env:
- name: RUNNER_FEATURE_FLAG_EPHEMERAL

View File

@ -34,15 +34,14 @@ const (
AnnotationKeyRunnerID = annotationKeyPrefix + "id"
// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
// and RemoveRunner API (to actually unregister the runner) calls.
// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
// up to 60 seconds (or even more depending on the situation).
DefaultUnregistrationTimeout = 60 * time.Second
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
DefaultUnregistrationRetryDelay = 30 * time.Second
DefaultUnregistrationRetryDelay = time.Minute
// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
// Usually, a retry in this case happens when e.g. your PAT has no access to a certain scope of runners, like when you're using a repository admin's token
// for creating a broader-scoped runner token, like an organization or enterprise runner token.
// Such a permission issue will never be fixed automatically, so we don't need to retry so often, hence this value.
RetryDelayOnCreateRegistrationError = 3 * time.Minute
// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
// A pod that is timed out can be terminated if needed.

View File

@ -110,7 +110,6 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
Name: controllerName("runner"),
RegistrationRecheckInterval: time.Millisecond * 100,
RegistrationRecheckJitter: time.Millisecond * 10,
UnregistrationTimeout: 1 * time.Second,
UnregistrationRetryDelay: 1 * time.Second,
}
err = runnerController.SetupWithManager(mgr)

View File

@ -73,7 +73,6 @@ type RunnerReconciler struct {
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
@ -200,11 +199,11 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
newRunner.ObjectMeta.Finalizers = finalizers
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil {
log.Error(err, "Failed to update runner for finalizer removal")
log.Error(err, "Unable to remove finalizer")
return ctrl.Result{}, err
}
log.Info("Removed runner from GitHub", "repository", runner.Spec.Repository, "organization", runner.Spec.Organization)
log.Info("Removed finalizer")
}
return ctrl.Result{}, nil
@ -212,7 +211,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
return ctrl.Result{}, err
return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil
} else if updated {
return ctrl.Result{Requeue: true}, nil
}
@ -254,6 +253,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v
rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
// An error can be a permanent permission issue like the one below:
// POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration []
// In such a case, retrying every few seconds might not make much sense.
r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed")
log.Error(err, "Failed to get new registration token")
return false, err

View File

@ -26,13 +26,13 @@ import (
// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, and may or may not populate the returned error.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
if err != nil {
return nil, &ctrl.Result{}, err
}
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}
@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
}
// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
var runnerID *int64
if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@ -81,12 +81,29 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
runnerID = &v
}
if runnerID == nil {
runner, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
if err != nil {
return &ctrl.Result{}, err
}
if runner != nil && runner.ID != nil {
runnerID = runner.ID
}
}
code := runnerContainerExitCode(pod)
if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconciliation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerID == nil {
log.Info(
"Unregistration started before runner ID is assigned. " +
"Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? " +
"Marking unregistration as completed anyway because there's nothing ARC can do.",
)
} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
@ -99,7 +116,7 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
log.Info("Runner pod is annotated to wait for completion")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil {
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or when the runner is still busy.
@ -120,6 +137,15 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
errRes := &gogithub.ErrorResponse{}
if errors.As(err, &errRes) {
if errRes.Response.StatusCode == 403 {
log.Error(err, "Unable to unregister due to permission error. "+
"Perhaps you've changed the permissions of PAT or GitHub App, or you updated authentication method of ARC in a wrong way? "+
"ARC considers it as already unregistered and continue removing the pod. "+
"You may need to remove the runner on GitHub UI.")
return nil, nil
}
runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
var runnerID int64
@ -177,17 +203,10 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
log.Info("Runner unregistration is in-progress. It can take forever to complete if if it's a static runner constantly running jobs."+
" It can also take very long time if it's an ephemeral runner that is running a log-running job.", "error", err)
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
return &ctrl.Result{RequeueAfter: retryDelay}, nil
} else {
// A runner and a runner pod that is created by this version of ARC should match
// any of the above branches.
@ -333,20 +352,7 @@ func setRunnerEnv(pod *corev1.Pod, key, value string) {
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id *int64) (bool, error) {
if id == nil {
runner, err := getRunner(ctx, client, enterprise, org, repo, name)
if err != nil {
return false, err
}
if runner == nil || runner.ID == nil {
return false, nil
}
id = runner.ID
}
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id int64) (bool, error) {
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
// The reason ARC did so was to let a runner running a job to not stop prematurely.
//
@ -367,7 +373,7 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or
// change from 60 seconds.
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, *id); err != nil {
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}

View File

@ -45,7 +45,6 @@ type RunnerPodReconciler struct {
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
@ -104,7 +103,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner is gracefully stopped before the deletion timestamp is set.
// But for the case that the user manually deleted it for whatever reason,
// we have to ensure it to gracefully stop now.
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
@ -172,7 +171,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
//
// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
// so that it isn't subject to terminationGracePeriod and can safely take hours to finish its work.
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
@ -190,15 +189,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay

View File

@ -302,9 +302,11 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
log.V(2).Info("Detected some current object(s)", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "names", names)
}
var pending, running, regTimeout int
var total, terminating, pending, running, regTimeout int
for _, ss := range currentObjects {
total += ss.total
terminating += ss.terminating
pending += ss.pending
running += ss.running
regTimeout += ss.regTimeout
@ -319,6 +321,8 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
log.V(2).Info(
"Found some pods across owner(s)",
"total", total,
"terminating", terminating,
"pending", pending,
"running", running,
"regTimeout", regTimeout,
@ -329,7 +333,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
maybeRunning := pending + running
wantMoreRunners := newDesiredReplicas > maybeRunning
alreadySyncedAfterEffectiveTime := lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
alreadySyncedAfterEffectiveTime := ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale))
log = log.WithValues(
@ -344,6 +348,16 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
)
if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale {
// This is our special handling of the situation for ephemeral runners only.
//
// Handling static runners this way would result in scale-up not working at all,
// because any scale-up attempt for static runners would fall within this condition, for two reasons.
// First, static (persistent) runners will never restart on their own.
// Second, we don't update EffectiveTime for static runners.
//
// We do need to skip this condition for static runners, and that's why we take the `ephemeral` flag into account when
// computing `alreadySyncedAfterEffectiveTime`.
log.V(2).Info(
"Detected that some ephemeral runners have disappeared. " +
"Usually this is due to that ephemeral runner completions " +
@ -415,24 +429,26 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
// or actions/runner may potentially misbehave on SIGTERM immediately sent by K8s.
// We'd better unregister first and then start a pod deletion process.
// The annotation works as a mark to start the pod unregistration and deletion process of ours.
if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); ok {
log.V(2).Info("Still waiting for runner pod(s) unregistration to complete")
continue
}
for _, po := range ss.pods {
if _, err := annotatePodOnce(ctx, c, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil {
return nil, err
}
}
if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok {
updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(ss.object)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch object to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp))
return nil, err
}
log.V(2).Info("Redundant object has been annotated to start the unregistration before deletion")
} else {
log.V(2).Info("BUG: Redundant object was already annotated")
updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp))
return nil, err
}
log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion")
}
} else if retained > newDesiredReplicas {
log.V(2).Info("Waiting sync before scale down", "retained", retained, "newDesiredReplicas", newDesiredReplicas)
@ -508,6 +524,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
log.Error(err, "Failed to delete owner")
return nil, err
}
log.V(2).Info("Started deletion of owner")
continue
}
@ -523,13 +542,13 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
}
}
log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)
if deletionSafe == res.total {
log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)
if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok {
updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(res.object)); err != nil {
if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp))
return nil, err
}
@ -538,9 +557,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
} else {
log.V(2).Info("BUG: Redundant owner was already annotated to start the deletion")
}
}
continue
continue
}
}
if annotations := res.owner.GetAnnotations(); annotations != nil {

View File

@ -118,6 +118,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
return ctrl.Result{}, err
}
log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
return ctrl.Result{}, nil
}
@ -142,6 +144,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
return ctrl.Result{}, err
}
log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
// We requeue in order to clean up old runner replica sets later.
// Otherwise, they aren't cleaned up until the next re-sync interval.
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
@ -222,15 +226,38 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
for i := range oldSets {
rs := oldSets[i]
rslog := log.WithValues("runnerreplicaset", rs.Name)
if rs.Status.Replicas != nil && *rs.Status.Replicas > 0 {
if rs.Spec.Replicas != nil && *rs.Spec.Replicas == 0 {
rslog.V(2).Info("Waiting for runnerreplicaset to scale to zero")
continue
}
updated := rs.DeepCopy()
zero := 0
updated.Spec.Replicas = &zero
if err := r.Client.Update(ctx, updated); err != nil {
rslog.Error(err, "Failed to scale runnerreplicaset to zero")
return ctrl.Result{}, err
}
rslog.Info("Scaled runnerreplicaset to zero")
continue
}
if err := r.Client.Delete(ctx, &rs); err != nil {
log.Error(err, "Failed to delete runnerreplicaset resource")
rslog.Error(err, "Failed to delete runnerreplicaset resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&rd, corev1.EventTypeNormal, "RunnerReplicaSetDeleted", fmt.Sprintf("Deleted runnerreplicaset '%s'", rs.Name))
log.Info("Deleted runnerreplicaset", "runnerdeployment", rd.ObjectMeta.Name, "runnerreplicaset", rs.Name)
rslog.Info("Deleted runnerreplicaset")
}
}

View File

@ -65,6 +65,10 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
if !rs.ObjectMeta.DeletionTimestamp.IsZero() {
// RunnerReplicaSet cannot be gracefully removed.
// That means any runner that is running a job can be prematurely terminated.
// To gracefully remove a RunnerReplicaSet, scale it down to zero first, observe RunnerReplicaSet's status replicas,
// and remove it only after the status replicas becomes zero.
return ctrl.Result{}, nil
}

View File

@ -135,6 +135,7 @@ func TestE2ERunnerDeploy(t *testing.T) {
}
env := initTestEnv(t)
env.useApp = true
t.Run("build and load images", func(t *testing.T) {
env.buildAndLoadImages(t)
@ -186,6 +187,7 @@ type env struct {
runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
githubTokenWebhook string
testEnterprise string
testEphemeral string
featureFlagEphemeral *bool
scaleDownDelaySecondsAfterScaleOut int64
minReplicas int64
@ -218,8 +220,9 @@ func initTestEnv(t *testing.T) *env {
e.testRepo = testing.Getenv(t, "TEST_REPO", "")
e.testOrg = testing.Getenv(t, "TEST_ORG", "")
e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "")
e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20)
if ephemeral, err := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")); err == nil {
e.featureFlagEphemeral = &ephemeral
@ -288,6 +291,7 @@ func (e *env) installActionsRunnerController(t *testing.T) {
"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
"RUNNER_LABEL=" + e.runnerLabel,
"TEST_ID=" + e.testID,
"TEST_EPHEMERAL=" + e.testEphemeral,
fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
fmt.Sprintf("REPO_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),