Add DrainJobsMode (aka UpdateStrategy feature) (#2569)

parent 032443fcfd
commit 8afef51c8b
@@ -23,6 +23,14 @@ inputs:
  arc-controller-namespace:
    description: 'The namespace of the configured gha-runner-scale-set-controller'
    required: true
  wait-to-finish:
    description: 'Wait for the workflow run to finish'
    required: true
    default: "true"
  wait-to-running:
    description: 'Wait for the workflow run to start running'
    required: true
    default: "false"

runs:
  using: "composite"
@@ -118,7 +126,36 @@ runs:
          | ${{steps.query_workflow.outputs.workflow_run_url}} |
          EOF

    - name: Wait for workflow to start running
      if: inputs.wait-to-running == 'true' && inputs.wait-to-finish == 'false'
      uses: actions/github-script@v6
      with:
        script: |
          function sleep(ms) {
            return new Promise(resolve => setTimeout(resolve, ms))
          }
          const owner = '${{inputs.repo-owner}}'
          const repo = '${{inputs.repo-name}}'
          const workflow_run_id = ${{steps.query_workflow.outputs.workflow_run}}
          const workflow_job_id = ${{steps.query_workflow.outputs.workflow_job}}
          let count = 0
          while (count++<10) {
            await sleep(30 * 1000);
            let getRunResponse = await github.rest.actions.getWorkflowRun({
              owner: owner,
              repo: repo,
              run_id: workflow_run_id
            })
            console.log(`${getRunResponse.data.html_url}: ${getRunResponse.data.status} (${getRunResponse.data.conclusion})`);
            if (getRunResponse.data.status == 'in_progress') {
              console.log(`Workflow run is in progress.`)
              return
            }
          }
          core.setFailed(`The triggered workflow run didn't start properly using ${{inputs.arc-name}}`)

    - name: Wait for workflow to finish successfully
      if: inputs.wait-to-finish == 'true'
      uses: actions/github-script@v6
      with:
        script: |
@@ -151,10 +188,15 @@ runs:
          }
          core.setFailed(`The triggered workflow run didn't finish properly using ${{inputs.arc-name}}`)

    - name: cleanup
      if: inputs.wait-to-finish == 'true'
      shell: bash
      run: |
        helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
        kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}

    - name: Gather logs and cleanup
      shell: bash
      if: always()
      run: |
        helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
        kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}
        kubectl logs deployment/arc-gha-runner-scale-set-controller -n ${{inputs.arc-controller-namespace}}
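
For readers unfamiliar with actions/github-script: the polling step above simply re-reads the workflow run from the GitHub REST API until it reports in_progress. A rough standalone Go equivalent, for illustration only (owner, repo, and run ID are placeholders, not values from this commit):

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"time"
)

// Poll the workflow-run REST endpoint until the run reaches "in_progress",
// mirroring the github-script step above.
func main() {
	owner, repo, runID := "my-org", "my-repo", "12345" // placeholders
	url := fmt.Sprintf("https://api.github.com/repos/%s/%s/actions/runs/%s", owner, repo, runID)

	for attempt := 0; attempt < 10; attempt++ {
		time.Sleep(30 * time.Second)

		req, _ := http.NewRequest("GET", url, nil)
		req.Header.Set("Authorization", "Bearer "+os.Getenv("GITHUB_TOKEN"))
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			fmt.Println("request failed:", err)
			continue
		}
		var run struct {
			Status  string `json:"status"`
			HTMLURL string `json:"html_url"`
		}
		json.NewDecoder(resp.Body).Decode(&run)
		resp.Body.Close()

		fmt.Printf("%s: %s\n", run.HTMLURL, run.Status)
		if run.Status == "in_progress" {
			return
		}
	}
	fmt.Println("the triggered workflow run didn't start in time")
	os.Exit(1)
}
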
@@ -710,3 +710,173 @@ jobs:
          arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
          arc-namespace: "arc-runners"
          arc-controller-namespace: "arc-systems"

  update-strategy-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.id == github.repository_id
    env:
      WORKFLOW_FILE: "arc-test-sleepy-matrix.yaml"
    steps:
      - uses: actions/checkout@v3
        with:
          ref: ${{github.head_ref}}

      - uses: ./.github/actions/setup-arc-e2e
        id: setup
        with:
          app-id: ${{secrets.E2E_TESTS_ACCESS_APP_ID}}
          app-pk: ${{secrets.E2E_TESTS_ACCESS_PK}}
          image-name: ${{env.IMAGE_NAME}}
          image-tag: ${{env.IMAGE_VERSION}}
          target-org: ${{env.TARGET_ORG}}

      - name: Install gha-runner-scale-set-controller
        id: install_arc_controller
        run: |
          helm install arc \
            --namespace "arc-systems" \
            --create-namespace \
            --set image.repository=${{ env.IMAGE_NAME }} \
            --set image.tag=${{ env.IMAGE_VERSION }} \
            --set flags.updateStrategy="eventual" \
            ./charts/gha-runner-scale-set-controller \
            --debug
          count=0
          while true; do
            POD_NAME=$(kubectl get pods -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller -o name)
            if [ -n "$POD_NAME" ]; then
              echo "Pod found: $POD_NAME"
              break
            fi
            if [ "$count" -ge 60 ]; then
              echo "Timeout waiting for controller pod with label app.kubernetes.io/name=gha-runner-scale-set-controller"
              exit 1
            fi
            sleep 1
            count=$((count+1))
          done
          kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller
          kubectl get pod -n arc-systems
          kubectl describe deployment arc-gha-runner-scale-set-controller -n arc-systems

      - name: Install gha-runner-scale-set
        id: install_arc
        run: |
          ARC_NAME=${{github.job}}-$(date +'%M%S')$((($RANDOM + 100) % 100 + 1))
          helm install "$ARC_NAME" \
            --namespace "arc-runners" \
            --create-namespace \
            --set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{env.TARGET_REPO}}" \
            --set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
            ./charts/gha-runner-scale-set \
            --debug
          echo "ARC_NAME=$ARC_NAME" >> $GITHUB_OUTPUT
          count=0
          while true; do
            POD_NAME=$(kubectl get pods -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME -o name)
            if [ -n "$POD_NAME" ]; then
              echo "Pod found: $POD_NAME"
              break
            fi
            if [ "$count" -ge 60 ]; then
              echo "Timeout waiting for listener pod with label actions.github.com/scale-set-name=$ARC_NAME"
              exit 1
            fi
            sleep 1
            count=$((count+1))
          done
          kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
          kubectl get pod -n arc-systems

      - name: Trigger long running jobs and wait for runners to pick them up
        uses: ./.github/actions/execute-assert-arc-e2e
        timeout-minutes: 10
        with:
          auth-token: ${{ steps.setup.outputs.token }}
          repo-owner: ${{ env.TARGET_ORG }}
          repo-name: ${{env.TARGET_REPO}}
          workflow-file: ${{env.WORKFLOW_FILE}}
          arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
          arc-namespace: "arc-runners"
          arc-controller-namespace: "arc-systems"
          wait-to-running: "true"
          wait-to-finish: "false"

      - name: Upgrade the gha-runner-scale-set
        shell: bash
        run: |
          helm upgrade --install "${{ steps.install_arc.outputs.ARC_NAME }}" \
            --namespace "arc-runners" \
            --create-namespace \
            --set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{ env.TARGET_REPO }}" \
            --set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
            --set template.spec.containers[0].name="runner" \
            --set template.spec.containers[0].image="ghcr.io/actions/actions-runner:latest" \
            --set template.spec.containers[0].command={"/home/runner/run.sh"} \
            --set template.spec.containers[0].env[0].name="TEST" \
            --set template.spec.containers[0].env[0].value="E2E TESTS" \
            ./charts/gha-runner-scale-set \
            --debug

      - name: Assert that the listener is deleted while jobs are running
        shell: bash
        run: |
          count=0
          while true; do
            LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
            RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
            RESOURCES="$(kubectl get pods -A)"

            if [ "$LISTENER_COUNT" -eq 0 ]; then
              echo "Listener has been deleted"
              echo "$RESOURCES"
              exit 0
            fi
            if [ "$count" -ge 60 ]; then
              echo "Timeout waiting for listener to be deleted"
              echo "$RESOURCES"
              exit 1
            fi

            echo "Waiting for listener to be deleted"
            echo "Listener count: $LISTENER_COUNT target: 0 | Runners count: $RUNNERS_COUNT target: 3"

            sleep 1
            count=$((count+1))
          done

      - name: Assert that the listener goes back up after the jobs are done
        shell: bash
        run: |
          count=0
          while true; do
            LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
            RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
            RESOURCES="$(kubectl get pods -A)"

            if [ "$LISTENER_COUNT" -eq 1 ]; then
              echo "Listener is up!"
              echo "$RESOURCES"
              exit 0
            fi
            if [ "$count" -ge 120 ]; then
              echo "Timeout waiting for listener to be recreated"
              echo "$RESOURCES"
              exit 1
            fi

            echo "Waiting for listener to be recreated"
            echo "Listener count: $LISTENER_COUNT target: 1 | Runners count: $RUNNERS_COUNT target: 0"

            sleep 1
            count=$((count+1))
          done

      - name: Gather logs and cleanup
        shell: bash
        if: always()
        run: |
          helm uninstall "${{ steps.install_arc.outputs.ARC_NAME }}" --namespace "arc-runners" --debug
          kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n "${{ steps.install_arc.outputs.ARC_NAME }}" -l app.kubernetes.io/instance="${{ steps.install_arc.outputs.ARC_NAME }}"
          kubectl logs deployment/arc-gha-runner-scale-set-controller -n "arc-systems"
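
The three assertion steps above all share the same poll-every-second-until-timeout shape. For reference, the same pattern as a small Go helper (illustrative only; the helper name, durations, and condition are made up and not part of this commit):

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitFor polls cond once per interval until it returns true or the timeout
// elapses, mirroring the "while true; do ...; sleep 1; count=$((count+1)); done"
// loops used by the e2e steps above.
func waitFor(cond func() (bool, error), interval, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		ok, err := cond()
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
		if time.Now().After(deadline) {
			return errors.New("timed out waiting for condition")
		}
		time.Sleep(interval)
	}
}

func main() {
	// Example: wait up to 60s for a hypothetical "listener deleted" condition.
	listenerCount := 0 // in the real test this comes from kubectl + jq
	err := waitFor(func() (bool, error) { return listenerCount == 0, nil }, time.Second, 60*time.Second)
	fmt.Println("listener deleted:", err == nil)
}
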
@@ -35,5 +35,4 @@ bin
.DS_STORE

/test-assets

/.tools
@@ -59,6 +59,9 @@ spec:
        {{- with .Values.flags.watchSingleNamespace }}
        - "--watch-single-namespace={{ . }}"
        {{- end }}
        {{- with .Values.flags.updateStrategy }}
        - "--update-strategy={{ . }}"
        {{- end }}
        command:
        - "/manager"
        env:
@@ -356,9 +356,10 @@ func TestTemplate_ControllerDeployment_Defaults(t *testing.T) {
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1)
	assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 2)
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3)
	assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0])
	assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[1])
	assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[2])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 3)
	assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name)
@@ -417,7 +418,8 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) {
			"tolerations[0].key": "foo",
			"affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key":      "foo",
			"affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator": "bar",
			"priorityClassName": "test-priority-class",
			"priorityClassName": "test-priority-class",
			"flags.updateStrategy": "eventual",
		},
		KubectlOptions: k8s.NewKubectlOptions("", "", namespaceName),
	}
@@ -482,10 +484,11 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) {
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1)
	assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3)
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4)
	assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0])
	assert.Equal(t, "--auto-scaler-image-pull-secrets=dockerhub", deployment.Spec.Template.Spec.Containers[0].Args[1])
	assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[2])
	assert.Equal(t, "--update-strategy=eventual", deployment.Spec.Template.Spec.Containers[0].Args[3])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 4)
	assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name)
@@ -602,11 +605,12 @@ func TestTemplate_EnableLeaderElection(t *testing.T) {
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1)
	assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4)
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 5)
	assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0])
	assert.Equal(t, "--enable-leader-election", deployment.Spec.Template.Spec.Containers[0].Args[1])
	assert.Equal(t, "--leader-election-id=test-arc-gha-runner-scale-set-controller", deployment.Spec.Template.Spec.Containers[0].Args[2])
	assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[3])
	assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[4])
}

func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) {
@@ -635,10 +639,11 @@ func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) {

	assert.Equal(t, namespaceName, deployment.Namespace)

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3)
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4)
	assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0])
	assert.Equal(t, "--auto-scaler-image-pull-secrets=dockerhub,ghcr", deployment.Spec.Template.Spec.Containers[0].Args[1])
	assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[2])
	assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[3])
}

func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) {
@@ -716,10 +721,11 @@ func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) {
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1)
	assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3)
	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4)
	assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0])
	assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[1])
	assert.Equal(t, "--watch-single-namespace=demo", deployment.Spec.Template.Spec.Containers[0].Args[2])
	assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[3])

	assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 3)
	assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name)
@@ -76,10 +76,26 @@ affinity: {}
priorityClassName: ""

flags:
  # Log level can be set here with one of the following values: "debug", "info", "warn", "error".
  # Defaults to "debug".
  ## Log level can be set here with one of the following values: "debug", "info", "warn", "error".
  ## Defaults to "debug".
  logLevel: "debug"

  ## Restricts the controller to only watch resources in the desired namespace.
  ## Defaults to watch all namespaces when unset.
  # watchSingleNamespace: ""

  ## Defines how the controller should handle upgrades while having running jobs.
  ##
  ## The strategies available are:
  ## - "immediate": (default) The controller will immediately apply the change causing the
  ##   recreation of the listener and ephemeral runner set. This can lead to an
  ##   overprovisioning of runners, if there are pending / running jobs. This should not
  ##   be a problem at a small scale, but it could lead to a significant increase of
  ##   resources if you have a lot of jobs running concurrently.
  ##
  ## - "eventual": The controller will remove the listener and ephemeral runner set
  ##   immediately, but will not recreate them (to apply changes) until all
  ##   pending / running jobs have completed.
  ##   This can lead to a longer time to apply the change but it will ensure
  ##   that you don't have any overprovisioning of runners.
  updateStrategy: "immediate"
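
To make the trade-off concrete, here is a minimal, illustrative Go sketch (not part of the chart; the function name and counts are hypothetical) of the decision each strategy implies when a spec change arrives while jobs are still in flight:

package main

import "fmt"

// recreateNow reports whether the controller would recreate the listener and
// ephemeral runner set right away, given the strategy and how many ephemeral
// runners are still running or pending. This mirrors the behaviour described
// above: "immediate" always recreates, "eventual" waits for in-flight jobs.
func recreateNow(strategy string, running, pending int) bool {
	if strategy == "eventual" && running+pending > 0 {
		return false // keep waiting; listener stays down, replicas scale to 0
	}
	return true
}

func main() {
	running, pending := 2, 3 // hypothetical in-flight jobs during an upgrade
	for _, s := range []string{"immediate", "eventual"} {
		if recreateNow(s, running, pending) {
			// With "immediate", the 5 old runners keep working while the new
			// runner set scales up, so both generations can run at once.
			fmt.Printf("%s: recreate now (%d old runners overlap with new ones)\n", s, running+pending)
		} else {
			fmt.Printf("%s: wait until %d runners finish, then recreate\n", s, running+pending)
		}
	}
}
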
@@ -49,6 +49,24 @@ const (
	runnerScaleSetNameAnnotationKey = "runner-scale-set-name"
)

type UpdateStrategy string

// Defines how the controller should handle upgrades while having running jobs.
const (
	// "immediate": (default) The controller will immediately apply the change causing the
	// recreation of the listener and ephemeral runner set. This can lead to an
	// overprovisioning of runners, if there are pending / running jobs. This should not
	// be a problem at a small scale, but it could lead to a significant increase of
	// resources if you have a lot of jobs running concurrently.
	UpdateStrategyImmediate = UpdateStrategy("immediate")
	// "eventual": The controller will remove the listener and ephemeral runner set
	// immediately, but will not recreate them (to apply changes) until all
	// pending / running jobs have completed.
	// This can lead to a longer time to apply the change but it will ensure
	// that you don't have any overprovisioning of runners.
	UpdateStrategyEventual = UpdateStrategy("eventual")
)

// AutoscalingRunnerSetReconciler reconciles a AutoscalingRunnerSet object
type AutoscalingRunnerSetReconciler struct {
	client.Client
@@ -57,6 +75,7 @@ type AutoscalingRunnerSetReconciler struct {
	ControllerNamespace                           string
	DefaultRunnerScaleSetListenerImage            string
	DefaultRunnerScaleSetListenerImagePullSecrets []string
	UpdateStrategy                                UpdateStrategy
	ActionsClient                                 actions.MultiClient

	resourceBuilder resourceBuilder
@@ -218,7 +237,48 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl
		log.Info("Find existing ephemeral runner set", "name", runnerSet.Name, "specHash", runnerSet.Labels[labelKeyRunnerSpecHash])
	}

	// Make sure the AutoscalingListener is up and running in the controller namespace
	listener := new(v1alpha1.AutoscalingListener)
	listenerFound := true
	if err := r.Get(ctx, client.ObjectKey{Namespace: r.ControllerNamespace, Name: scaleSetListenerName(autoscalingRunnerSet)}, listener); err != nil {
		if !kerrors.IsNotFound(err) {
			log.Error(err, "Failed to get AutoscalingListener resource")
			return ctrl.Result{}, err
		}

		listenerFound = false
		log.Info("AutoscalingListener does not exist.")
	}

	// Our listener pod is out of date, so we need to delete it to get a new recreate.
	if listenerFound && (listener.Labels[labelKeyRunnerSpecHash] != autoscalingRunnerSet.ListenerSpecHash()) {
		log.Info("RunnerScaleSetListener is out of date. Deleting it so that it is recreated", "name", listener.Name)
		if err := r.Delete(ctx, listener); err != nil {
			if kerrors.IsNotFound(err) {
				return ctrl.Result{}, nil
			}
			log.Error(err, "Failed to delete AutoscalingListener resource")
			return ctrl.Result{}, err
		}

		log.Info("Deleted RunnerScaleSetListener since existing one is out of date")
		return ctrl.Result{}, nil
	}

	if desiredSpecHash != latestRunnerSet.Labels[labelKeyRunnerSpecHash] {
		if r.drainingJobs(&latestRunnerSet.Status) {
			log.Info("Latest runner set spec hash does not match the current autoscaling runner set. Waiting for the running and pending runners to finish:", "running", latestRunnerSet.Status.RunningEphemeralRunners, "pending", latestRunnerSet.Status.PendingEphemeralRunners)
			log.Info("Scaling down the number of desired replicas to 0")
			// We are in the process of draining the jobs. The listener has been deleted and the ephemeral runner set replicas
			// need to scale down to 0
			err := patch(ctx, r.Client, latestRunnerSet, func(obj *v1alpha1.EphemeralRunnerSet) {
				obj.Spec.Replicas = 0
			})
			if err != nil {
				log.Error(err, "Failed to patch runner set to set desired count to 0")
			}
			return ctrl.Result{}, err
		}
		log.Info("Latest runner set spec hash does not match the current autoscaling runner set. Creating a new runner set")
		return r.createEphemeralRunnerSet(ctx, autoscalingRunnerSet, log)
	}
@@ -234,30 +294,13 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl
	}

	// Make sure the AutoscalingListener is up and running in the controller namespace
	listener := new(v1alpha1.AutoscalingListener)
	if err := r.Get(ctx, client.ObjectKey{Namespace: r.ControllerNamespace, Name: scaleSetListenerName(autoscalingRunnerSet)}, listener); err != nil {
		if kerrors.IsNotFound(err) {
			// We don't have a listener
			log.Info("Creating a new AutoscalingListener for the runner set", "ephemeralRunnerSetName", latestRunnerSet.Name)
			return r.createAutoScalingListenerForRunnerSet(ctx, autoscalingRunnerSet, latestRunnerSet, log)
	if !listenerFound {
		if r.drainingJobs(&latestRunnerSet.Status) {
			log.Info("Creating a new AutoscalingListener is waiting for the running and pending runners to finish. Waiting for the running and pending runners to finish:", "running", latestRunnerSet.Status.RunningEphemeralRunners, "pending", latestRunnerSet.Status.PendingEphemeralRunners)
			return ctrl.Result{}, nil
		}
		log.Error(err, "Failed to get AutoscalingListener resource")
		return ctrl.Result{}, err
	}

	// Our listener pod is out of date, so we need to delete it to get a new recreate.
	if listener.Labels[labelKeyRunnerSpecHash] != autoscalingRunnerSet.ListenerSpecHash() {
		log.Info("RunnerScaleSetListener is out of date. Deleting it so that it is recreated", "name", listener.Name)
		if err := r.Delete(ctx, listener); err != nil {
			if kerrors.IsNotFound(err) {
				return ctrl.Result{}, nil
			}
			log.Error(err, "Failed to delete AutoscalingListener resource")
			return ctrl.Result{}, err
		}

		log.Info("Deleted RunnerScaleSetListener since existing one is out of date")
		return ctrl.Result{}, nil
		log.Info("Creating a new AutoscalingListener for the runner set", "ephemeralRunnerSetName", latestRunnerSet.Name)
		return r.createAutoScalingListenerForRunnerSet(ctx, autoscalingRunnerSet, latestRunnerSet, log)
	}

	// Update the status of autoscaling runner set.
@@ -276,6 +319,16 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl
	return ctrl.Result{}, nil
}

// Prevents overprovisioning of runners.
// We reach this code path when runner scale set has been patched with a new runner spec but there are still running ephemeral runners.
// The safest approach is to wait for the running ephemeral runners to finish before creating a new runner set.
func (r *AutoscalingRunnerSetReconciler) drainingJobs(latestRunnerSetStatus *v1alpha1.EphemeralRunnerSetStatus) bool {
	if r.UpdateStrategy == UpdateStrategyEventual && ((latestRunnerSetStatus.RunningEphemeralRunners + latestRunnerSetStatus.PendingEphemeralRunners) > 0) {
		return true
	}
	return false
}

func (r *AutoscalingRunnerSetReconciler) cleanupListener(ctx context.Context, autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet, logger logr.Logger) (done bool, err error) {
	logger.Info("Cleaning up the listener")
	var listener v1alpha1.AutoscalingListener
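
Since drainingJobs is the whole gate for the "eventual" strategy, it lends itself to a table-driven unit test. A sketch of what that could look like (assuming it sits in the same controller package and uses the repo's existing v1alpha1 types; this test is not part of the commit):

package actionsgithubcom

import (
	"testing"

	"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
)

func TestDrainingJobs(t *testing.T) {
	cases := []struct {
		name     string
		strategy UpdateStrategy
		running  int
		pending  int
		want     bool
	}{
		{"eventual with in-flight jobs drains", UpdateStrategyEventual, 2, 3, true},
		{"eventual with no jobs does not drain", UpdateStrategyEventual, 0, 0, false},
		{"immediate never drains", UpdateStrategyImmediate, 2, 3, false},
	}
	for _, tc := range cases {
		// Only UpdateStrategy matters for this decision; other fields stay zero.
		r := &AutoscalingRunnerSetReconciler{UpdateStrategy: tc.strategy}
		status := &v1alpha1.EphemeralRunnerSetStatus{
			RunningEphemeralRunners: tc.running,
			PendingEphemeralRunners: tc.pending,
		}
		if got := r.drainingJobs(status); got != tc.want {
			t.Errorf("%s: drainingJobs() = %v, want %v", tc.name, got, tc.want)
		}
	}
}
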
@@ -42,6 +42,7 @@ const (
var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() {
	var ctx context.Context
	var mgr ctrl.Manager
	var controller *AutoscalingRunnerSetReconciler
	var autoscalingNS *corev1.Namespace
	var autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet
	var configSecret *corev1.Secret
@@ -63,7 +64,7 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() {
		autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
		configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)

		controller := &AutoscalingRunnerSetReconciler{
		controller = &AutoscalingRunnerSetReconciler{
			Client: mgr.GetClient(),
			Scheme: mgr.GetScheme(),
			Log:    logf.Log,
@@ -424,6 +425,110 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() {
		})
	})

	Context("When updating an AutoscalingRunnerSet with running or pending jobs", func() {
		It("It should wait for running and pending jobs to finish before applying the update. Update Strategy is set to eventual.", func() {
			// Switch update strategy to eventual (drain jobs)
			controller.UpdateStrategy = UpdateStrategyEventual
			// Wait till the listener is created
			listener := new(v1alpha1.AutoscalingListener)
			Eventually(
				func() error {
					return k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerName(autoscalingRunnerSet), Namespace: autoscalingRunnerSet.Namespace}, listener)
				},
				autoscalingRunnerSetTestTimeout,
				autoscalingRunnerSetTestInterval,
			).Should(Succeed(), "Listener should be created")

			// Wait till the ephemeral runner set is created
			Eventually(
				func() (int, error) {
					runnerSetList := new(v1alpha1.EphemeralRunnerSetList)
					err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace))
					if err != nil {
						return 0, err
					}

					return len(runnerSetList.Items), nil
				},
				autoscalingRunnerSetTestTimeout,
				autoscalingRunnerSetTestInterval,
			).Should(BeEquivalentTo(1), "Only one EphemeralRunnerSet should be created")

			runnerSetList := new(v1alpha1.EphemeralRunnerSetList)
			err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace))
			Expect(err).NotTo(HaveOccurred(), "failed to list EphemeralRunnerSet")

			// Emulate running and pending jobs
			runnerSet := runnerSetList.Items[0]
			activeRunnerSet := runnerSet.DeepCopy()
			activeRunnerSet.Status.CurrentReplicas = 6
			activeRunnerSet.Status.FailedEphemeralRunners = 1
			activeRunnerSet.Status.RunningEphemeralRunners = 2
			activeRunnerSet.Status.PendingEphemeralRunners = 3

			desiredStatus := v1alpha1.AutoscalingRunnerSetStatus{
				CurrentRunners:          activeRunnerSet.Status.CurrentReplicas,
				State:                   "",
				PendingEphemeralRunners: activeRunnerSet.Status.PendingEphemeralRunners,
				RunningEphemeralRunners: activeRunnerSet.Status.RunningEphemeralRunners,
				FailedEphemeralRunners:  activeRunnerSet.Status.FailedEphemeralRunners,
			}

			err = k8sClient.Status().Patch(ctx, activeRunnerSet, client.MergeFrom(&runnerSet))
			Expect(err).NotTo(HaveOccurred(), "Failed to patch runner set status")

			Eventually(
				func() (v1alpha1.AutoscalingRunnerSetStatus, error) {
					updated := new(v1alpha1.AutoscalingRunnerSet)
					err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingRunnerSet.Name, Namespace: autoscalingRunnerSet.Namespace}, updated)
					if err != nil {
						return v1alpha1.AutoscalingRunnerSetStatus{}, fmt.Errorf("failed to get AutoScalingRunnerSet: %w", err)
					}
					return updated.Status, nil
				},
				autoscalingRunnerSetTestTimeout,
				autoscalingRunnerSetTestInterval,
			).Should(BeEquivalentTo(desiredStatus), "AutoScalingRunnerSet status should be updated")

			// Patch the AutoScalingRunnerSet image which should trigger
			// the recreation of the Listener and EphemeralRunnerSet
			patched := autoscalingRunnerSet.DeepCopy()
			patched.Spec.Template.Spec = corev1.PodSpec{
				Containers: []corev1.Container{
					{
						Name:  "runner",
						Image: "ghcr.io/actions/abcd:1.1.1",
					},
				},
			}
			// patched.Spec.Template.Spec.PriorityClassName = "test-priority-class"
			err = k8sClient.Patch(ctx, patched, client.MergeFrom(autoscalingRunnerSet))
			Expect(err).NotTo(HaveOccurred(), "failed to patch AutoScalingRunnerSet")
			autoscalingRunnerSet = patched.DeepCopy()

			// The EphemeralRunnerSet should not be recreated
			Consistently(
				func() (string, error) {
					runnerSetList := new(v1alpha1.EphemeralRunnerSetList)
					err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace))
					Expect(err).NotTo(HaveOccurred(), "failed to fetch AutoScalingRunnerSet")
					return runnerSetList.Items[0].Name, nil
				},
				autoscalingRunnerSetTestTimeout,
				autoscalingRunnerSetTestInterval,
			).Should(Equal(activeRunnerSet.Name), "The EphemeralRunnerSet should not be recreated")

			// The listener should not be recreated
			Consistently(
				func() error {
					return k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerName(autoscalingRunnerSet), Namespace: autoscalingRunnerSet.Namespace}, listener)
				},
				autoscalingRunnerSetTestTimeout,
				autoscalingRunnerSetTestInterval,
			).ShouldNot(Succeed(), "Listener should not be recreated")
		})
	})

	It("Should update Status on EphemeralRunnerSet status Update", func() {
		ars := new(v1alpha1.AutoscalingRunnerSet)
		Eventually(
@@ -1617,10 +1722,14 @@ var _ = Describe("Test resource version and build version mismatch", func() {

		startManagers(GinkgoT(), mgr)

		Eventually(func() bool {
			ars := new(v1alpha1.AutoscalingRunnerSet)
			err := k8sClient.Get(ctx, types.NamespacedName{Namespace: autoscalingRunnerSet.Namespace, Name: autoscalingRunnerSet.Name}, ars)
			return errors.IsNotFound(err)
		}).Should(BeTrue())
		Eventually(
			func() bool {
				ars := new(v1alpha1.AutoscalingRunnerSet)
				err := k8sClient.Get(ctx, types.NamespacedName{Namespace: autoscalingRunnerSet.Namespace, Name: autoscalingRunnerSet.Name}, ars)
				return errors.IsNotFound(err)
			},
			autoscalingRunnerSetTestTimeout,
			autoscalingRunnerSetTestInterval,
		).Should(BeTrue())
	})
})
main.go (12 lines changed)
@@ -77,6 +77,7 @@ func main() {
		autoScalingRunnerSetOnly   bool
		enableLeaderElection       bool
		disableAdmissionWebhook    bool
		updateStrategy             string
		leaderElectionId           string
		port                       int
		syncPeriod                 time.Duration
@@ -131,6 +132,7 @@ func main() {
	flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
	flag.StringVar(&logFormat, "log-format", "text", `The log format. Valid options are "text" and "json". Defaults to "text"`)
	flag.BoolVar(&autoScalingRunnerSetOnly, "auto-scaling-runner-set-only", false, "Make controller only reconcile AutoRunnerScaleSet object.")
	flag.StringVar(&updateStrategy, "update-strategy", "immediate", `Resources reconciliation strategy on upgrade with running/pending jobs. Valid values are: "immediate", "eventual". Defaults to "immediate".`)
	flag.Var(&autoScalerImagePullSecrets, "auto-scaler-image-pull-secrets", "The default image-pull secret name for auto-scaler listener container.")
	flag.Parse()

@@ -169,6 +171,14 @@ func main() {
		if len(watchSingleNamespace) > 0 {
			newCache = cache.MultiNamespacedCacheBuilder([]string{managerNamespace, watchSingleNamespace})
		}

		switch updateStrategy {
		case "eventual", "immediate":
			log.Info(`Update strategy set to:`, "updateStrategy", updateStrategy)
		default:
			log.Info(`Update strategy not recognized. Defaulting to "immediate"`, "updateStrategy", updateStrategy)
			updateStrategy = "immediate"
		}
	}

	listenerPullPolicy := os.Getenv("CONTROLLER_MANAGER_LISTENER_IMAGE_PULL_POLICY")
@@ -216,6 +226,7 @@ func main() {
		ControllerNamespace:                managerNamespace,
		DefaultRunnerScaleSetListenerImage: managerImage,
		ActionsClient:                      actionsMultiClient,
		UpdateStrategy:                     actionsgithubcom.UpdateStrategy(updateStrategy),
		DefaultRunnerScaleSetListenerImagePullSecrets: autoScalerImagePullSecrets,
	}).SetupWithManager(mgr); err != nil {
		log.Error(err, "unable to create controller", "controller", "AutoscalingRunnerSet")
@@ -241,6 +252,7 @@ func main() {
		log.Error(err, "unable to create controller", "controller", "EphemeralRunnerSet")
		os.Exit(1)
	}

	if err = (&actionsgithubcom.AutoscalingListenerReconciler{
		Client: mgr.GetClient(),
		Log:    log.WithName("AutoscalingListener"),