diff --git a/.github/actions/execute-assert-arc-e2e/action.yaml b/.github/actions/execute-assert-arc-e2e/action.yaml index 37d9c585..8089d15f 100644 --- a/.github/actions/execute-assert-arc-e2e/action.yaml +++ b/.github/actions/execute-assert-arc-e2e/action.yaml @@ -23,6 +23,14 @@ inputs: arc-controller-namespace: description: 'The namespace of the configured gha-runner-scale-set-controller' required: true + wait-to-finish: + description: 'Wait for the workflow run to finish' + required: true + default: "true" + wait-to-running: + description: 'Wait for the workflow run to start running' + required: true + default: "false" runs: using: "composite" @@ -118,7 +126,36 @@ runs: | ${{steps.query_workflow.outputs.workflow_run_url}} | EOF + - name: Wait for workflow to start running + if: inputs.wait-to-running == 'true' && inputs.wait-to-finish == 'false' + uses: actions/github-script@v6 + with: + script: | + function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)) + } + const owner = '${{inputs.repo-owner}}' + const repo = '${{inputs.repo-name}}' + const workflow_run_id = ${{steps.query_workflow.outputs.workflow_run}} + const workflow_job_id = ${{steps.query_workflow.outputs.workflow_job}} + let count = 0 + while (count++<10) { + await sleep(30 * 1000); + let getRunResponse = await github.rest.actions.getWorkflowRun({ + owner: owner, + repo: repo, + run_id: workflow_run_id + }) + console.log(`${getRunResponse.data.html_url}: ${getRunResponse.data.status} (${getRunResponse.data.conclusion})`); + if (getRunResponse.data.status == 'in_progress') { + console.log(`Workflow run is in progress.`) + return + } + } + core.setFailed(`The triggered workflow run didn't start properly using ${{inputs.arc-name}}`) + - name: Wait for workflow to finish successfully + if: inputs.wait-to-finish == 'true' uses: actions/github-script@v6 with: script: | @@ -151,10 +188,15 @@ runs: } core.setFailed(`The triggered workflow run didn't finish properly using ${{inputs.arc-name}}`) + - name: cleanup + if: inputs.wait-to-finish == 'true' + shell: bash + run: | + helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug + kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }} + - name: Gather logs and cleanup shell: bash if: always() run: | - helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug - kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }} kubectl logs deployment/arc-gha-runner-scale-set-controller -n ${{inputs.arc-controller-namespace}} \ No newline at end of file diff --git a/.github/workflows/gha-e2e-tests.yaml b/.github/workflows/gha-e2e-tests.yaml index 07621df0..6badcbf5 100644 --- a/.github/workflows/gha-e2e-tests.yaml +++ b/.github/workflows/gha-e2e-tests.yaml @@ -710,3 +710,173 @@ jobs: arc-name: ${{steps.install_arc.outputs.ARC_NAME}} arc-namespace: "arc-runners" arc-controller-namespace: "arc-systems" + + update-strategy-tests: + runs-on: ubuntu-latest + timeout-minutes: 20 + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.id == github.repository_id + env: + WORKFLOW_FILE: "arc-test-sleepy-matrix.yaml" + steps: + - uses: actions/checkout@v3 + with: + ref: ${{github.head_ref}} + + - uses: ./.github/actions/setup-arc-e2e + id: setup + with: + app-id: ${{secrets.E2E_TESTS_ACCESS_APP_ID}} + app-pk: ${{secrets.E2E_TESTS_ACCESS_PK}} + 
image-name: ${{env.IMAGE_NAME}} + image-tag: ${{env.IMAGE_VERSION}} + target-org: ${{env.TARGET_ORG}} + + - name: Install gha-runner-scale-set-controller + id: install_arc_controller + run: | + helm install arc \ + --namespace "arc-systems" \ + --create-namespace \ + --set image.repository=${{ env.IMAGE_NAME }} \ + --set image.tag=${{ env.IMAGE_VERSION }} \ + --set flags.updateStrategy="eventual" \ + ./charts/gha-runner-scale-set-controller \ + --debug + count=0 + while true; do + POD_NAME=$(kubectl get pods -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller -o name) + if [ -n "$POD_NAME" ]; then + echo "Pod found: $POD_NAME" + break + fi + if [ "$count" -ge 60 ]; then + echo "Timeout waiting for controller pod with label app.kubernetes.io/name=gha-runner-scale-set-controller" + exit 1 + fi + sleep 1 + count=$((count+1)) + done + kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller + kubectl get pod -n arc-systems + kubectl describe deployment arc-gha-runner-scale-set-controller -n arc-systems + + - name: Install gha-runner-scale-set + id: install_arc + run: | + ARC_NAME=${{github.job}}-$(date +'%M%S')$((($RANDOM + 100) % 100 + 1)) + helm install "$ARC_NAME" \ + --namespace "arc-runners" \ + --create-namespace \ + --set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{env.TARGET_REPO}}" \ + --set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \ + ./charts/gha-runner-scale-set \ + --debug + echo "ARC_NAME=$ARC_NAME" >> $GITHUB_OUTPUT + count=0 + while true; do + POD_NAME=$(kubectl get pods -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME -o name) + if [ -n "$POD_NAME" ]; then + echo "Pod found: $POD_NAME" + break + fi + if [ "$count" -ge 60 ]; then + echo "Timeout waiting for listener pod with label actions.github.com/scale-set-name=$ARC_NAME" + exit 1 + fi + sleep 1 + count=$((count+1)) + done + kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME + kubectl get pod -n arc-systems + + - name: Trigger long running jobs and wait for runners to pick them up + uses: ./.github/actions/execute-assert-arc-e2e + timeout-minutes: 10 + with: + auth-token: ${{ steps.setup.outputs.token }} + repo-owner: ${{ env.TARGET_ORG }} + repo-name: ${{env.TARGET_REPO}} + workflow-file: ${{env.WORKFLOW_FILE}} + arc-name: ${{steps.install_arc.outputs.ARC_NAME}} + arc-namespace: "arc-runners" + arc-controller-namespace: "arc-systems" + wait-to-running: "true" + wait-to-finish: "false" + + - name: Upgrade the gha-runner-scale-set + shell: bash + run: | + helm upgrade --install "${{ steps.install_arc.outputs.ARC_NAME }}" \ + --namespace "arc-runners" \ + --create-namespace \ + --set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{ env.TARGET_REPO }}" \ + --set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \ + --set template.spec.containers[0].name="runner" \ + --set template.spec.containers[0].image="ghcr.io/actions/actions-runner:latest" \ + --set template.spec.containers[0].command={"/home/runner/run.sh"} \ + --set template.spec.containers[0].env[0].name="TEST" \ + --set template.spec.containers[0].env[0].value="E2E TESTS" \ + ./charts/gha-runner-scale-set \ + --debug + + - name: Assert that the listener is deleted while jobs are running + shell: bash + run: | + count=0 + while true; do + LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ 
steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')" + RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')" + RESOURCES="$(kubectl get pods -A)" + + if [ "$LISTENER_COUNT" -eq 0 ]; then + echo "Listener has been deleted" + echo "$RESOURCES" + exit 0 + fi + if [ "$count" -ge 60 ]; then + echo "Timeout waiting for listener to be deleted" + echo "$RESOURCES" + exit 1 + fi + + echo "Waiting for listener to be deleted" + echo "Listener count: $LISTENER_COUNT target: 0 | Runners count: $RUNNERS_COUNT target: 3" + + sleep 1 + count=$((count+1)) + done + + - name: Assert that the listener goes back up after the jobs are done + shell: bash + run: | + count=0 + while true; do + LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')" + RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')" + RESOURCES="$(kubectl get pods -A)" + + if [ "$LISTENER_COUNT" -eq 1 ]; then + echo "Listener is up!" + echo "$RESOURCES" + exit 0 + fi + if [ "$count" -ge 120 ]; then + echo "Timeout waiting for listener to be recreated" + echo "$RESOURCES" + exit 1 + fi + + echo "Waiting for listener to be recreated" + echo "Listener count: $LISTENER_COUNT target: 1 | Runners count: $RUNNERS_COUNT target: 0" + + sleep 1 + count=$((count+1)) + done + + - name: Gather logs and cleanup + shell: bash + if: always() + run: | + helm uninstall "${{ steps.install_arc.outputs.ARC_NAME }}" --namespace "arc-runners" --debug + kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n "${{ steps.install_arc.outputs.ARC_NAME }}" -l app.kubernetes.io/instance="${{ steps.install_arc.outputs.ARC_NAME }}" + kubectl logs deployment/arc-gha-runner-scale-set-controller -n "arc-systems" \ No newline at end of file diff --git a/.gitignore b/.gitignore index ce539d20..e0fcafbf 100644 --- a/.gitignore +++ b/.gitignore @@ -35,5 +35,4 @@ bin .DS_STORE /test-assets - /.tools diff --git a/charts/gha-runner-scale-set-controller/templates/deployment.yaml b/charts/gha-runner-scale-set-controller/templates/deployment.yaml index 1997d2b0..dc0b88a7 100644 --- a/charts/gha-runner-scale-set-controller/templates/deployment.yaml +++ b/charts/gha-runner-scale-set-controller/templates/deployment.yaml @@ -59,6 +59,9 @@ spec: {{- with .Values.flags.watchSingleNamespace }} - "--watch-single-namespace={{ . }}" {{- end }} + {{- with .Values.flags.updateStrategy }} + - "--update-strategy={{ . 
}}" + {{- end }} command: - "/manager" env: diff --git a/charts/gha-runner-scale-set-controller/tests/template_test.go b/charts/gha-runner-scale-set-controller/tests/template_test.go index 663e64b2..e972bd07 100644 --- a/charts/gha-runner-scale-set-controller/tests/template_test.go +++ b/charts/gha-runner-scale-set-controller/tests/template_test.go @@ -356,9 +356,10 @@ func TestTemplate_ControllerDeployment_Defaults(t *testing.T) { assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1) assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0]) - assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 2) + assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3) assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0]) assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[1]) + assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[2]) assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 3) assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name) @@ -417,7 +418,8 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) { "tolerations[0].key": "foo", "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key": "foo", "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator": "bar", - "priorityClassName": "test-priority-class", + "priorityClassName": "test-priority-class", + "flags.updateStrategy": "eventual", }, KubectlOptions: k8s.NewKubectlOptions("", "", namespaceName), } @@ -482,10 +484,11 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) { assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1) assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0]) - assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3) + assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4) assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0]) assert.Equal(t, "--auto-scaler-image-pull-secrets=dockerhub", deployment.Spec.Template.Spec.Containers[0].Args[1]) assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[2]) + assert.Equal(t, "--update-strategy=eventual", deployment.Spec.Template.Spec.Containers[0].Args[3]) assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 4) assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name) @@ -602,11 +605,12 @@ func TestTemplate_EnableLeaderElection(t *testing.T) { assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1) assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0]) - assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4) + assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 5) assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0]) assert.Equal(t, "--enable-leader-election", deployment.Spec.Template.Spec.Containers[0].Args[1]) assert.Equal(t, "--leader-election-id=test-arc-gha-runner-scale-set-controller", deployment.Spec.Template.Spec.Containers[0].Args[2]) assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[3]) + assert.Equal(t, 
"--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[4]) } func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) { @@ -635,10 +639,11 @@ func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) { assert.Equal(t, namespaceName, deployment.Namespace) - assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3) + assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4) assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0]) assert.Equal(t, "--auto-scaler-image-pull-secrets=dockerhub,ghcr", deployment.Spec.Template.Spec.Containers[0].Args[1]) assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[2]) + assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[3]) } func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) { @@ -716,10 +721,11 @@ func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) { assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Command, 1) assert.Equal(t, "/manager", deployment.Spec.Template.Spec.Containers[0].Command[0]) - assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 3) + assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Args, 4) assert.Equal(t, "--auto-scaling-runner-set-only", deployment.Spec.Template.Spec.Containers[0].Args[0]) assert.Equal(t, "--log-level=debug", deployment.Spec.Template.Spec.Containers[0].Args[1]) assert.Equal(t, "--watch-single-namespace=demo", deployment.Spec.Template.Spec.Containers[0].Args[2]) + assert.Equal(t, "--update-strategy=immediate", deployment.Spec.Template.Spec.Containers[0].Args[3]) assert.Len(t, deployment.Spec.Template.Spec.Containers[0].Env, 3) assert.Equal(t, "CONTROLLER_MANAGER_CONTAINER_IMAGE", deployment.Spec.Template.Spec.Containers[0].Env[0].Name) diff --git a/charts/gha-runner-scale-set-controller/values.yaml b/charts/gha-runner-scale-set-controller/values.yaml index 03359b5f..b0405609 100644 --- a/charts/gha-runner-scale-set-controller/values.yaml +++ b/charts/gha-runner-scale-set-controller/values.yaml @@ -76,10 +76,26 @@ affinity: {} priorityClassName: "" flags: - # Log level can be set here with one of the following values: "debug", "info", "warn", "error". - # Defaults to "debug". + ## Log level can be set here with one of the following values: "debug", "info", "warn", "error". + ## Defaults to "debug". logLevel: "debug" ## Restricts the controller to only watch resources in the desired namespace. ## Defaults to watch all namespaces when unset. # watchSingleNamespace: "" + + ## Defines how the controller should handle upgrades while having running jobs. + ## + ## The srategies available are: + ## - "immediate": (default) The controller will immediately apply the change causing the + ## recreation of the listener and ephemeral runner set. This can lead to an + ## overprovisioning of runners, if there are pending / running jobs. This should not + ## be a problem at a small scale, but it could lead to a significant increase of + ## resources if you have a lot of jobs running concurrently. + ## + ## - "eventual": The controller will remove the listener and ephemeral runner set + ## immediately, but will not recreate them (to apply changes) until all + ## pending / running jobs have completed. + ## This can lead to a longer time to apply the change but it will ensure + ## that you don't have any overprovisioning of runners. 
+ updateStrategy: "immediate" \ No newline at end of file diff --git a/controllers/actions.github.com/autoscalingrunnerset_controller.go b/controllers/actions.github.com/autoscalingrunnerset_controller.go index 79e28df9..201e796b 100644 --- a/controllers/actions.github.com/autoscalingrunnerset_controller.go +++ b/controllers/actions.github.com/autoscalingrunnerset_controller.go @@ -49,6 +49,24 @@ const ( runnerScaleSetNameAnnotationKey = "runner-scale-set-name" ) +type UpdateStrategy string + +// Defines how the controller should handle upgrades while having running jobs. +const ( + // "immediate": (default) The controller will immediately apply the change causing the + // recreation of the listener and ephemeral runner set. This can lead to an + // overprovisioning of runners, if there are pending / running jobs. This should not + // be a problem at a small scale, but it could lead to a significant increase of + // resources if you have a lot of jobs running concurrently. + UpdateStrategyImmediate = UpdateStrategy("immediate") + // "eventual": The controller will remove the listener and ephemeral runner set + // immediately, but will not recreate them (to apply changes) until all + // pending / running jobs have completed. + // This can lead to a longer time to apply the change but it will ensure + // that you don't have any overprovisioning of runners. + UpdateStrategyEventual = UpdateStrategy("eventual") +) + // AutoscalingRunnerSetReconciler reconciles a AutoscalingRunnerSet object type AutoscalingRunnerSetReconciler struct { client.Client @@ -57,6 +75,7 @@ type AutoscalingRunnerSetReconciler struct { ControllerNamespace string DefaultRunnerScaleSetListenerImage string DefaultRunnerScaleSetListenerImagePullSecrets []string + UpdateStrategy UpdateStrategy ActionsClient actions.MultiClient resourceBuilder resourceBuilder @@ -218,7 +237,48 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl log.Info("Find existing ephemeral runner set", "name", runnerSet.Name, "specHash", runnerSet.Labels[labelKeyRunnerSpecHash]) } + // Make sure the AutoscalingListener is up and running in the controller namespace + listener := new(v1alpha1.AutoscalingListener) + listenerFound := true + if err := r.Get(ctx, client.ObjectKey{Namespace: r.ControllerNamespace, Name: scaleSetListenerName(autoscalingRunnerSet)}, listener); err != nil { + if !kerrors.IsNotFound(err) { + log.Error(err, "Failed to get AutoscalingListener resource") + return ctrl.Result{}, err + } + + listenerFound = false + log.Info("AutoscalingListener does not exist.") + } + + // Our listener pod is out of date, so we need to delete it to get a new recreate. + if listenerFound && (listener.Labels[labelKeyRunnerSpecHash] != autoscalingRunnerSet.ListenerSpecHash()) { + log.Info("RunnerScaleSetListener is out of date. Deleting it so that it is recreated", "name", listener.Name) + if err := r.Delete(ctx, listener); err != nil { + if kerrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + log.Error(err, "Failed to delete AutoscalingListener resource") + return ctrl.Result{}, err + } + + log.Info("Deleted RunnerScaleSetListener since existing one is out of date") + return ctrl.Result{}, nil + } + if desiredSpecHash != latestRunnerSet.Labels[labelKeyRunnerSpecHash] { + if r.drainingJobs(&latestRunnerSet.Status) { + log.Info("Latest runner set spec hash does not match the current autoscaling runner set. 
Waiting for the running and pending runners to finish:", "running", latestRunnerSet.Status.RunningEphemeralRunners, "pending", latestRunnerSet.Status.PendingEphemeralRunners) + log.Info("Scaling down the number of desired replicas to 0") + // We are in the process of draining the jobs. The listener has been deleted and the ephemeral runner set replicas + // need to scale down to 0 + err := patch(ctx, r.Client, latestRunnerSet, func(obj *v1alpha1.EphemeralRunnerSet) { + obj.Spec.Replicas = 0 + }) + if err != nil { + log.Error(err, "Failed to patch runner set to set desired count to 0") + } + return ctrl.Result{}, err + } log.Info("Latest runner set spec hash does not match the current autoscaling runner set. Creating a new runner set") return r.createEphemeralRunnerSet(ctx, autoscalingRunnerSet, log) } @@ -234,30 +294,13 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl } // Make sure the AutoscalingListener is up and running in the controller namespace - listener := new(v1alpha1.AutoscalingListener) - if err := r.Get(ctx, client.ObjectKey{Namespace: r.ControllerNamespace, Name: scaleSetListenerName(autoscalingRunnerSet)}, listener); err != nil { - if kerrors.IsNotFound(err) { - // We don't have a listener - log.Info("Creating a new AutoscalingListener for the runner set", "ephemeralRunnerSetName", latestRunnerSet.Name) - return r.createAutoScalingListenerForRunnerSet(ctx, autoscalingRunnerSet, latestRunnerSet, log) + if !listenerFound { + if r.drainingJobs(&latestRunnerSet.Status) { + log.Info("Waiting for the running and pending runners to finish before creating a new AutoscalingListener", "running", latestRunnerSet.Status.RunningEphemeralRunners, "pending", latestRunnerSet.Status.PendingEphemeralRunners) + return ctrl.Result{}, nil } - log.Error(err, "Failed to get AutoscalingListener resource") - return ctrl.Result{}, err - } - - // Our listener pod is out of date, so we need to delete it to get a new recreate. - if listener.Labels[labelKeyRunnerSpecHash] != autoscalingRunnerSet.ListenerSpecHash() { - log.Info("RunnerScaleSetListener is out of date. Deleting it so that it is recreated", "name", listener.Name) - if err := r.Delete(ctx, listener); err != nil { - if kerrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - log.Error(err, "Failed to delete AutoscalingListener resource") - return ctrl.Result{}, err - } - - log.Info("Deleted RunnerScaleSetListener since existing one is out of date") - return ctrl.Result{}, nil + log.Info("Creating a new AutoscalingListener for the runner set", "ephemeralRunnerSetName", latestRunnerSet.Name) + return r.createAutoScalingListenerForRunnerSet(ctx, autoscalingRunnerSet, latestRunnerSet, log) } // Update the status of autoscaling runner set. @@ -276,6 +319,16 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl return ctrl.Result{}, nil } +// Prevents overprovisioning of runners. +// We reach this code path when the runner scale set has been patched with a new runner spec but there are still running ephemeral runners. +// The safest approach is to wait for the running ephemeral runners to finish before creating a new runner set.
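+// For example, with the "eventual" strategy and a status of 2 running and 3 pending ephemeral runners
+// (the scenario exercised by the controller test below), drainingJobs returns true, so the reconciler
+// scales the ephemeral runner set down to 0 and defers recreating the listener and runner set; with the
+// "immediate" strategy it returns false and recreation proceeds right away.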
+func (r *AutoscalingRunnerSetReconciler) drainingJobs(latestRunnerSetStatus *v1alpha1.EphemeralRunnerSetStatus) bool { + if r.UpdateStrategy == UpdateStrategyEventual && ((latestRunnerSetStatus.RunningEphemeralRunners + latestRunnerSetStatus.PendingEphemeralRunners) > 0) { + return true + } + return false +} + func (r *AutoscalingRunnerSetReconciler) cleanupListener(ctx context.Context, autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet, logger logr.Logger) (done bool, err error) { logger.Info("Cleaning up the listener") var listener v1alpha1.AutoscalingListener diff --git a/controllers/actions.github.com/autoscalingrunnerset_controller_test.go b/controllers/actions.github.com/autoscalingrunnerset_controller_test.go index 390511e6..41512e06 100644 --- a/controllers/actions.github.com/autoscalingrunnerset_controller_test.go +++ b/controllers/actions.github.com/autoscalingrunnerset_controller_test.go @@ -42,6 +42,7 @@ const ( var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { var ctx context.Context var mgr ctrl.Manager + var controller *AutoscalingRunnerSetReconciler var autoscalingNS *corev1.Namespace var autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet var configSecret *corev1.Secret @@ -63,7 +64,7 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) - controller := &AutoscalingRunnerSetReconciler{ + controller = &AutoscalingRunnerSetReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Log: logf.Log, @@ -424,6 +425,110 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { }) }) + Context("When updating an AutoscalingRunnerSet with running or pending jobs", func() { + It("It should wait for running and pending jobs to finish before applying the update. 
Update Strategy is set to eventual.", func() { + // Switch update strategy to eventual (drain jobs ) + controller.UpdateStrategy = UpdateStrategyEventual + // Wait till the listener is created + listener := new(v1alpha1.AutoscalingListener) + Eventually( + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerName(autoscalingRunnerSet), Namespace: autoscalingRunnerSet.Namespace}, listener) + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).Should(Succeed(), "Listener should be created") + + // Wait till the ephemeral runner set is created + Eventually( + func() (int, error) { + runnerSetList := new(v1alpha1.EphemeralRunnerSetList) + err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace)) + if err != nil { + return 0, err + } + + return len(runnerSetList.Items), nil + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).Should(BeEquivalentTo(1), "Only one EphemeralRunnerSet should be created") + + runnerSetList := new(v1alpha1.EphemeralRunnerSetList) + err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace)) + Expect(err).NotTo(HaveOccurred(), "failed to list EphemeralRunnerSet") + + // Emulate running and pending jobs + runnerSet := runnerSetList.Items[0] + activeRunnerSet := runnerSet.DeepCopy() + activeRunnerSet.Status.CurrentReplicas = 6 + activeRunnerSet.Status.FailedEphemeralRunners = 1 + activeRunnerSet.Status.RunningEphemeralRunners = 2 + activeRunnerSet.Status.PendingEphemeralRunners = 3 + + desiredStatus := v1alpha1.AutoscalingRunnerSetStatus{ + CurrentRunners: activeRunnerSet.Status.CurrentReplicas, + State: "", + PendingEphemeralRunners: activeRunnerSet.Status.PendingEphemeralRunners, + RunningEphemeralRunners: activeRunnerSet.Status.RunningEphemeralRunners, + FailedEphemeralRunners: activeRunnerSet.Status.FailedEphemeralRunners, + } + + err = k8sClient.Status().Patch(ctx, activeRunnerSet, client.MergeFrom(&runnerSet)) + Expect(err).NotTo(HaveOccurred(), "Failed to patch runner set status") + + Eventually( + func() (v1alpha1.AutoscalingRunnerSetStatus, error) { + updated := new(v1alpha1.AutoscalingRunnerSet) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingRunnerSet.Name, Namespace: autoscalingRunnerSet.Namespace}, updated) + if err != nil { + return v1alpha1.AutoscalingRunnerSetStatus{}, fmt.Errorf("failed to get AutoScalingRunnerSet: %w", err) + } + return updated.Status, nil + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).Should(BeEquivalentTo(desiredStatus), "AutoScalingRunnerSet status should be updated") + + // Patch the AutoScalingRunnerSet image which should trigger + // the recreation of the Listener and EphemeralRunnerSet + patched := autoscalingRunnerSet.DeepCopy() + patched.Spec.Template.Spec = corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "runner", + Image: "ghcr.io/actions/abcd:1.1.1", + }, + }, + } + // patched.Spec.Template.Spec.PriorityClassName = "test-priority-class" + err = k8sClient.Patch(ctx, patched, client.MergeFrom(autoscalingRunnerSet)) + Expect(err).NotTo(HaveOccurred(), "failed to patch AutoScalingRunnerSet") + autoscalingRunnerSet = patched.DeepCopy() + + // The EphemeralRunnerSet should not be recreated + Consistently( + func() (string, error) { + runnerSetList := new(v1alpha1.EphemeralRunnerSetList) + err := k8sClient.List(ctx, runnerSetList, client.InNamespace(autoscalingRunnerSet.Namespace)) + Expect(err).NotTo(HaveOccurred(), 
"failed to fetch AutoScalingRunnerSet") + return runnerSetList.Items[0].Name, nil + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).Should(Equal(activeRunnerSet.Name), "The EphemeralRunnerSet should not be recreated") + + // The listener should not be recreated + Consistently( + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerName(autoscalingRunnerSet), Namespace: autoscalingRunnerSet.Namespace}, listener) + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).ShouldNot(Succeed(), "Listener should not be recreated") + }) + }) + It("Should update Status on EphemeralRunnerSet status Update", func() { ars := new(v1alpha1.AutoscalingRunnerSet) Eventually( @@ -1617,10 +1722,14 @@ var _ = Describe("Test resource version and build version mismatch", func() { startManagers(GinkgoT(), mgr) - Eventually(func() bool { - ars := new(v1alpha1.AutoscalingRunnerSet) - err := k8sClient.Get(ctx, types.NamespacedName{Namespace: autoscalingRunnerSet.Namespace, Name: autoscalingRunnerSet.Name}, ars) - return errors.IsNotFound(err) - }).Should(BeTrue()) + Eventually( + func() bool { + ars := new(v1alpha1.AutoscalingRunnerSet) + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: autoscalingRunnerSet.Namespace, Name: autoscalingRunnerSet.Name}, ars) + return errors.IsNotFound(err) + }, + autoscalingRunnerSetTestTimeout, + autoscalingRunnerSetTestInterval, + ).Should(BeTrue()) }) }) diff --git a/main.go b/main.go index 543db603..9cc1bbdd 100644 --- a/main.go +++ b/main.go @@ -77,6 +77,7 @@ func main() { autoScalingRunnerSetOnly bool enableLeaderElection bool disableAdmissionWebhook bool + updateStrategy string leaderElectionId string port int syncPeriod time.Duration @@ -131,6 +132,7 @@ func main() { flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) flag.StringVar(&logFormat, "log-format", "text", `The log format. Valid options are "text" and "json". Defaults to "text"`) flag.BoolVar(&autoScalingRunnerSetOnly, "auto-scaling-runner-set-only", false, "Make controller only reconcile AutoRunnerScaleSet object.") + flag.StringVar(&updateStrategy, "update-strategy", "immediate", `Resources reconciliation strategy on upgrade with running/pending jobs. Valid values are: "immediate", "eventual". Defaults to "immediate".`) flag.Var(&autoScalerImagePullSecrets, "auto-scaler-image-pull-secrets", "The default image-pull secret name for auto-scaler listener container.") flag.Parse() @@ -169,6 +171,14 @@ func main() { if len(watchSingleNamespace) > 0 { newCache = cache.MultiNamespacedCacheBuilder([]string{managerNamespace, watchSingleNamespace}) } + + switch updateStrategy { + case "eventual", "immediate": + log.Info(`Update strategy set to:`, "updateStrategy", updateStrategy) + default: + log.Info(`Update strategy not recognized. 
Defaulting to "immediately"`, "updateStrategy", updateStrategy) + updateStrategy = "immediate" + } } listenerPullPolicy := os.Getenv("CONTROLLER_MANAGER_LISTENER_IMAGE_PULL_POLICY") @@ -216,6 +226,7 @@ func main() { ControllerNamespace: managerNamespace, DefaultRunnerScaleSetListenerImage: managerImage, ActionsClient: actionsMultiClient, + UpdateStrategy: actionsgithubcom.UpdateStrategy(updateStrategy), DefaultRunnerScaleSetListenerImagePullSecrets: autoScalerImagePullSecrets, }).SetupWithManager(mgr); err != nil { log.Error(err, "unable to create controller", "controller", "AutoscalingRunnerSet") @@ -241,6 +252,7 @@ func main() { log.Error(err, "unable to create controller", "controller", "EphemeralRunnerSet") os.Exit(1) } + if err = (&actionsgithubcom.AutoscalingListenerReconciler{ Client: mgr.GetClient(), Log: log.WithName("AutoscalingListener"),