diff --git a/.github/actions/setup-arc-e2e/action.yaml b/.github/actions/setup-arc-e2e/action.yaml index 0cccd465..ec5b55af 100644 --- a/.github/actions/setup-arc-e2e/action.yaml +++ b/.github/actions/setup-arc-e2e/action.yaml @@ -1,9 +1,9 @@ -name: 'Setup ARC E2E Test Action' -description: 'Build controller image, create kind cluster, load the image, and exchange ARC configure token.' +name: "Setup ARC E2E Test Action" +description: "Build controller image, create kind cluster, load the image, and exchange ARC configure token." inputs: app-id: - description: 'GitHub App Id for exchange access token' + description: "GitHub App Id for exchange access token" required: true app-pk: description: "GitHub App private key for exchange access token" @@ -20,30 +20,31 @@ inputs: outputs: token: - description: 'Token to use for configure ARC' + description: "Token to use for configure ARC" value: ${{steps.config-token.outputs.token}} runs: using: "composite" steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 with: - # Pinning v0.9.1 for Buildx and BuildKit v0.10.6 - # BuildKit v0.11 which has a bug causing intermittent - # failures pushing images to GHCR - version: v0.9.1 - driver-opts: image=moby/buildkit:v0.10.6 + # Pinning v0.9.1 for Buildx and BuildKit v0.10.6 + # BuildKit v0.11 which has a bug causing intermittent + # failures pushing images to GHCR + version: v0.9.1 + driver-opts: image=moby/buildkit:v0.10.6 - name: Build controller image - uses: docker/build-push-action@v5 + # https://github.com/docker/build-push-action/releases/tag/v6.15.0 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 with: file: Dockerfile platforms: linux/amd64 load: true build-args: | DOCKER_IMAGE_NAME=${{inputs.image-name}} - VERSION=${{inputs.image-tag}} + VERSION=${{inputs.image-tag}} tags: | ${{inputs.image-name}}:${{inputs.image-tag}} no-cache: true @@ -56,8 +57,9 @@ 
runs: - name: Get configure token id: config-token + # https://github.com/peter-murray/workflow-application-token-action/releases/tag/v3.0.0 uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 with: application_id: ${{ inputs.app-id }} application_private_key: ${{ inputs.app-pk }} - organization: ${{ inputs.target-org}} \ No newline at end of file + organization: ${{ inputs.target-org}} diff --git a/.github/actions/setup-docker-environment/action.yaml b/.github/actions/setup-docker-environment/action.yaml index f083ff07..6053125e 100644 --- a/.github/actions/setup-docker-environment/action.yaml +++ b/.github/actions/setup-docker-environment/action.yaml @@ -24,23 +24,27 @@ runs: shell: bash - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + # https://github.com/docker/setup-qemu-action/releases/tag/v3.6.0 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + # https://github.com/docker/setup-buildx-action/releases/tag/v3.10.0 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 with: version: latest - name: Login to DockerHub if: ${{ github.event_name == 'release' || github.event_name == 'push' && github.ref == 'refs/heads/master' && inputs.password != '' }} - uses: docker/login-action@v3 + # https://github.com/docker/login-action/releases/tag/v3.4.0 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 with: username: ${{ inputs.username }} password: ${{ inputs.password }} - name: Login to GitHub Container Registry if: ${{ github.event_name == 'release' || github.event_name == 'push' && github.ref == 'refs/heads/master' && inputs.ghcr_password != '' }} - uses: docker/login-action@v3 + # https://github.com/docker/login-action/releases/tag/v3.4.0 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 with: registry: ghcr.io username: ${{ inputs.ghcr_username }} diff --git 
a/.github/workflows/arc-publish-chart.yaml b/.github/workflows/arc-publish-chart.yaml index 96f03f22..5cada00e 100644 --- a/.github/workflows/arc-publish-chart.yaml +++ b/.github/workflows/arc-publish-chart.yaml @@ -5,18 +5,18 @@ name: Publish ARC Helm Charts on: push: branches: - - master + - master paths: - - 'charts/**' - - '.github/workflows/arc-publish-chart.yaml' - - '!charts/actions-runner-controller/docs/**' - - '!charts/gha-runner-scale-set-controller/**' - - '!charts/gha-runner-scale-set/**' - - '!**.md' + - "charts/**" + - ".github/workflows/arc-publish-chart.yaml" + - "!charts/actions-runner-controller/docs/**" + - "!charts/gha-runner-scale-set-controller/**" + - "!charts/gha-runner-scale-set/**" + - "!**.md" workflow_dispatch: inputs: force: - description: 'Force publish even if the chart version is not bumped' + description: "Force publish even if the chart version is not bumped" type: boolean required: true default: false @@ -39,86 +39,89 @@ jobs: outputs: publish-chart: ${{ steps.publish-chart-step.outputs.publish }} steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 - - name: Set up Helm - uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 - with: - version: ${{ env.HELM_VERSION }} + - name: Set up Helm + # Using https://github.com/Azure/setup-helm/releases/tag/v4.2.0 + uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 + with: + version: ${{ env.HELM_VERSION }} - - name: Set up kube-score - run: | - wget https://github.com/zegl/kube-score/releases/download/v${{ env.KUBE_SCORE_VERSION }}/kube-score_${{ env.KUBE_SCORE_VERSION }}_linux_amd64 -O kube-score - chmod 755 kube-score + - name: Set up kube-score + run: | + wget https://github.com/zegl/kube-score/releases/download/v${{ env.KUBE_SCORE_VERSION }}/kube-score_${{ env.KUBE_SCORE_VERSION }}_linux_amd64 -O kube-score + chmod 755 kube-score - - name: Kube-score generated 
manifests - run: helm template --values charts/.ci/values-kube-score.yaml charts/* | ./kube-score score - --ignore-test pod-networkpolicy --ignore-test deployment-has-poddisruptionbudget --ignore-test deployment-has-host-podantiaffinity --ignore-test container-security-context --ignore-test pod-probes --ignore-test container-image-tag --enable-optional-test container-security-context-privileged --enable-optional-test container-security-context-readonlyrootfilesystem + - name: Kube-score generated manifests + run: helm template --values charts/.ci/values-kube-score.yaml charts/* | ./kube-score score - --ignore-test pod-networkpolicy --ignore-test deployment-has-poddisruptionbudget --ignore-test deployment-has-host-podantiaffinity --ignore-test container-security-context --ignore-test pod-probes --ignore-test container-image-tag --enable-optional-test container-security-context-privileged --enable-optional-test container-security-context-readonlyrootfilesystem - # python is a requirement for the chart-testing action below (supports yamllint among other tests) - - uses: actions/setup-python@v5 - with: - python-version: '3.11' + # python is a requirement for the chart-testing action below (supports yamllint among other tests) + - uses: actions/setup-python@v5 + with: + python-version: "3.11" - - name: Set up chart-testing - uses: helm/chart-testing-action@v2.6.0 + - name: Set up chart-testing + # https://github.com/helm/chart-testing-action/releases/tag/v2.7.0 + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b - - name: Run chart-testing (list-changed) - id: list-changed - run: | - changed=$(ct list-changed --config charts/.ci/ct-config.yaml) - if [[ -n "$changed" ]]; then - echo "changed=true" >> $GITHUB_OUTPUT - fi + - name: Run chart-testing (list-changed) + id: list-changed + run: | + changed=$(ct list-changed --config charts/.ci/ct-config.yaml) + if [[ -n "$changed" ]]; then + echo "changed=true" >> $GITHUB_OUTPUT + fi - - name: Run 
chart-testing (lint) - run: | - ct lint --config charts/.ci/ct-config.yaml + - name: Run chart-testing (lint) + run: | + ct lint --config charts/.ci/ct-config.yaml - - name: Create kind cluster - if: steps.list-changed.outputs.changed == 'true' - uses: helm/kind-action@v1.4.0 + - name: Create kind cluster + if: steps.list-changed.outputs.changed == 'true' + # https://github.com/helm/kind-action/releases/tag/v1.12.0 + uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 - # We need cert-manager already installed in the cluster because we assume the CRDs exist - - name: Install cert-manager - if: steps.list-changed.outputs.changed == 'true' - run: | - helm repo add jetstack https://charts.jetstack.io --force-update - helm install cert-manager jetstack/cert-manager --set installCRDs=true --wait + # We need cert-manager already installed in the cluster because we assume the CRDs exist + - name: Install cert-manager + if: steps.list-changed.outputs.changed == 'true' + run: | + helm repo add jetstack https://charts.jetstack.io --force-update + helm install cert-manager jetstack/cert-manager --set installCRDs=true --wait - - name: Run chart-testing (install) - if: steps.list-changed.outputs.changed == 'true' - run: ct install --config charts/.ci/ct-config.yaml + - name: Run chart-testing (install) + if: steps.list-changed.outputs.changed == 'true' + run: ct install --config charts/.ci/ct-config.yaml - # WARNING: This relies on the latest release being at the top of the JSON from GitHub and a clean chart.yaml - - name: Check if Chart Publish is Needed - id: publish-chart-step - run: | - CHART_TEXT=$(curl -fs https://raw.githubusercontent.com/${{ github.repository }}/master/charts/actions-runner-controller/Chart.yaml) - NEW_CHART_VERSION=$(echo "$CHART_TEXT" | grep version: | cut -d ' ' -f 2) - RELEASE_LIST=$(curl -fs https://api.github.com/repos/${{ github.repository }}/releases | jq .[].tag_name | grep actions-runner-controller | cut -d '"' -f 2 | cut -d '-' -f 
4) - LATEST_RELEASED_CHART_VERSION=$(echo $RELEASE_LIST | cut -d ' ' -f 1) + # WARNING: This relies on the latest release being at the top of the JSON from GitHub and a clean chart.yaml + - name: Check if Chart Publish is Needed + id: publish-chart-step + run: | + CHART_TEXT=$(curl -fs https://raw.githubusercontent.com/${{ github.repository }}/master/charts/actions-runner-controller/Chart.yaml) + NEW_CHART_VERSION=$(echo "$CHART_TEXT" | grep version: | cut -d ' ' -f 2) + RELEASE_LIST=$(curl -fs https://api.github.com/repos/${{ github.repository }}/releases | jq .[].tag_name | grep actions-runner-controller | cut -d '"' -f 2 | cut -d '-' -f 4) + LATEST_RELEASED_CHART_VERSION=$(echo $RELEASE_LIST | cut -d ' ' -f 1) - echo "CHART_VERSION_IN_MASTER=$NEW_CHART_VERSION" >> $GITHUB_ENV - echo "LATEST_CHART_VERSION=$LATEST_RELEASED_CHART_VERSION" >> $GITHUB_ENV + echo "CHART_VERSION_IN_MASTER=$NEW_CHART_VERSION" >> $GITHUB_ENV + echo "LATEST_CHART_VERSION=$LATEST_RELEASED_CHART_VERSION" >> $GITHUB_ENV - # Always publish if force is true - if [[ $NEW_CHART_VERSION != $LATEST_RELEASED_CHART_VERSION || "${{ inputs.force }}" == "true" ]]; then - echo "publish=true" >> $GITHUB_OUTPUT - else - echo "publish=false" >> $GITHUB_OUTPUT - fi + # Always publish if force is true + if [[ $NEW_CHART_VERSION != $LATEST_RELEASED_CHART_VERSION || "${{ inputs.force }}" == "true" ]]; then + echo "publish=true" >> $GITHUB_OUTPUT + else + echo "publish=false" >> $GITHUB_OUTPUT + fi - - name: Job summary - run: | - echo "Chart linting has been completed." 
>> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "**Status:**" >> $GITHUB_STEP_SUMMARY - echo "- chart version in master: ${{ env.CHART_VERSION_IN_MASTER }}" >> $GITHUB_STEP_SUMMARY - echo "- latest chart version: ${{ env.LATEST_CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY - echo "- publish new chart: ${{ steps.publish-chart-step.outputs.publish }}" >> $GITHUB_STEP_SUMMARY + - name: Job summary + run: | + echo "Chart linting has been completed." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Status:**" >> $GITHUB_STEP_SUMMARY + echo "- chart version in master: ${{ env.CHART_VERSION_IN_MASTER }}" >> $GITHUB_STEP_SUMMARY + echo "- latest chart version: ${{ env.LATEST_CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "- publish new chart: ${{ steps.publish-chart-step.outputs.publish }}" >> $GITHUB_STEP_SUMMARY publish-chart: if: needs.lint-chart.outputs.publish-chart == 'true' @@ -133,80 +136,81 @@ jobs: CHART_TARGET_BRANCH: master steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - name: Get Token - id: get_workflow_token - uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 - with: - application_id: ${{ secrets.ACTIONS_ACCESS_APP_ID }} - application_private_key: ${{ secrets.ACTIONS_ACCESS_PK }} - organization: ${{ env.CHART_TARGET_ORG }} + - name: Get Token + id: get_workflow_token + # https://github.com/peter-murray/workflow-application-token-action/releases/tag/v3.0.0 + uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 + with: + application_id: ${{ secrets.ACTIONS_ACCESS_APP_ID 
}} + application_private_key: ${{ secrets.ACTIONS_ACCESS_PK }} + organization: ${{ env.CHART_TARGET_ORG }} - - name: Install chart-releaser - uses: helm/chart-releaser-action@v1.4.1 - with: - install_only: true - install_dir: ${{ github.workspace }}/bin + - name: Install chart-releaser + uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f + with: + install_only: true + install_dir: ${{ github.workspace }}/bin - - name: Package and upload release assets - run: | - cr package \ - ${{ github.workspace }}/charts/actions-runner-controller/ \ - --package-path .cr-release-packages + - name: Package and upload release assets + run: | + cr package \ + ${{ github.workspace }}/charts/actions-runner-controller/ \ + --package-path .cr-release-packages - cr upload \ - --owner "$(echo ${{ github.repository }} | cut -d '/' -f 1)" \ - --git-repo "$(echo ${{ github.repository }} | cut -d '/' -f 2)" \ - --package-path .cr-release-packages \ - --token ${{ secrets.GITHUB_TOKEN }} + cr upload \ + --owner "$(echo ${{ github.repository }} | cut -d '/' -f 1)" \ + --git-repo "$(echo ${{ github.repository }} | cut -d '/' -f 2)" \ + --package-path .cr-release-packages \ + --token ${{ secrets.GITHUB_TOKEN }} - - name: Generate updated index.yaml - run: | - cr index \ - --owner "$(echo ${{ github.repository }} | cut -d '/' -f 1)" \ - --git-repo "$(echo ${{ github.repository }} | cut -d '/' -f 2)" \ - --index-path ${{ github.workspace }}/index.yaml \ - --token ${{ secrets.GITHUB_TOKEN }} \ - --push \ - --pages-branch 'gh-pages' \ - --pages-index-path 'index.yaml' + - name: Generate updated index.yaml + run: | + cr index \ + --owner "$(echo ${{ github.repository }} | cut -d '/' -f 1)" \ + --git-repo "$(echo ${{ github.repository }} | cut -d '/' -f 2)" \ + --index-path ${{ github.workspace }}/index.yaml \ + --token ${{ secrets.GITHUB_TOKEN }} \ + --push \ + --pages-branch 'gh-pages' \ + --pages-index-path 'index.yaml' - # Chart Release was never intended to publish to a 
different repo - # this workaround is intended to move the index.yaml to the target repo - # where the github pages are hosted - - name: Checkout target repository - uses: actions/checkout@v4 - with: - repository: ${{ env.CHART_TARGET_ORG }}/${{ env.CHART_TARGET_REPO }} - path: ${{ env.CHART_TARGET_REPO }} - ref: ${{ env.CHART_TARGET_BRANCH }} - token: ${{ steps.get_workflow_token.outputs.token }} + # Chart Release was never intended to publish to a different repo + # this workaround is intended to move the index.yaml to the target repo + # where the github pages are hosted + - name: Checkout target repository + uses: actions/checkout@v4 + with: + repository: ${{ env.CHART_TARGET_ORG }}/${{ env.CHART_TARGET_REPO }} + path: ${{ env.CHART_TARGET_REPO }} + ref: ${{ env.CHART_TARGET_BRANCH }} + token: ${{ steps.get_workflow_token.outputs.token }} - - name: Copy index.yaml - run: | - cp ${{ github.workspace }}/index.yaml ${{ env.CHART_TARGET_REPO }}/actions-runner-controller/index.yaml + - name: Copy index.yaml + run: | + cp ${{ github.workspace }}/index.yaml ${{ env.CHART_TARGET_REPO }}/actions-runner-controller/index.yaml - - name: Commit and push to target repository - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add . - git commit -m "Update index.yaml" - git push - working-directory: ${{ github.workspace }}/${{ env.CHART_TARGET_REPO }} + - name: Commit and push to target repository + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add . 
+ git commit -m "Update index.yaml" + git push + working-directory: ${{ github.workspace }}/${{ env.CHART_TARGET_REPO }} - - name: Job summary - run: | - echo "New helm chart has been published" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "**Status:**" >> $GITHUB_STEP_SUMMARY - echo "- New [index.yaml](https://github.com/${{ env.CHART_TARGET_ORG }}/${{ env.CHART_TARGET_REPO }}/tree/master/actions-runner-controller) pushed" >> $GITHUB_STEP_SUMMARY + - name: Job summary + run: | + echo "New helm chart has been published" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Status:**" >> $GITHUB_STEP_SUMMARY + echo "- New [index.yaml](https://github.com/${{ env.CHART_TARGET_ORG }}/${{ env.CHART_TARGET_REPO }}/tree/master/actions-runner-controller) pushed" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/arc-publish.yaml b/.github/workflows/arc-publish.yaml index 37d67e9f..1a9328ca 100644 --- a/.github/workflows/arc-publish.yaml +++ b/.github/workflows/arc-publish.yaml @@ -9,17 +9,17 @@ on: workflow_dispatch: inputs: release_tag_name: - description: 'Tag name of the release to publish' + description: "Tag name of the release to publish" required: true push_to_registries: - description: 'Push images to registries' + description: "Push images to registries" required: true type: boolean default: false permissions: - contents: write - packages: write + contents: write + packages: write env: TARGET_ORG: actions-runner-controller @@ -43,7 +43,7 @@ jobs: - uses: actions/setup-go@v5 with: - go-version-file: 'go.mod' + go-version-file: "go.mod" - name: Install tools run: | @@ -73,6 +73,7 @@ jobs: - name: Get Token id: get_workflow_token + # https://github.com/peter-murray/workflow-application-token-action/releases/tag/v3.0.0 uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 with: application_id: ${{ secrets.ACTIONS_ACCESS_APP_ID }} diff --git a/.github/workflows/arc-release-runners.yaml 
b/.github/workflows/arc-release-runners.yaml index 55ced306..da1fbf54 100644 --- a/.github/workflows/arc-release-runners.yaml +++ b/.github/workflows/arc-release-runners.yaml @@ -7,10 +7,10 @@ on: # are available to the workflow run push: branches: - - 'master' + - "master" paths: - - 'runner/VERSION' - - '.github/workflows/arc-release-runners.yaml' + - "runner/VERSION" + - ".github/workflows/arc-release-runners.yaml" env: # Safeguard to prevent pushing images to registeries after build @@ -39,6 +39,7 @@ jobs: - name: Get Token id: get_workflow_token + # https://github.com/peter-murray/workflow-application-token-action/releases/tag/v3.0.0 uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 with: application_id: ${{ secrets.ACTIONS_ACCESS_APP_ID }} diff --git a/.github/workflows/arc-validate-chart.yaml b/.github/workflows/arc-validate-chart.yaml index f38a3fc3..d93fa27f 100644 --- a/.github/workflows/arc-validate-chart.yaml +++ b/.github/workflows/arc-validate-chart.yaml @@ -5,20 +5,20 @@ on: branches: - master paths: - - 'charts/**' - - '.github/workflows/arc-validate-chart.yaml' - - '!charts/actions-runner-controller/docs/**' - - '!**.md' - - '!charts/gha-runner-scale-set-controller/**' - - '!charts/gha-runner-scale-set/**' + - "charts/**" + - ".github/workflows/arc-validate-chart.yaml" + - "!charts/actions-runner-controller/docs/**" + - "!**.md" + - "!charts/gha-runner-scale-set-controller/**" + - "!charts/gha-runner-scale-set/**" push: paths: - - 'charts/**' - - '.github/workflows/arc-validate-chart.yaml' - - '!charts/actions-runner-controller/docs/**' - - '!**.md' - - '!charts/gha-runner-scale-set-controller/**' - - '!charts/gha-runner-scale-set/**' + - "charts/**" + - ".github/workflows/arc-validate-chart.yaml" + - "!charts/actions-runner-controller/docs/**" + - "!**.md" + - "!charts/gha-runner-scale-set-controller/**" + - "!charts/gha-runner-scale-set/**" workflow_dispatch: env: KUBE_SCORE_VERSION: 1.10.0 @@ -45,34 
+45,19 @@ jobs: fetch-depth: 0 - name: Set up Helm - # Using https://github.com/Azure/setup-helm/releases/tag/v4.2 + # Using https://github.com/Azure/setup-helm/releases/tag/v4.2.0 uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 with: version: ${{ env.HELM_VERSION }} - - name: Set up kube-score - run: | - wget https://github.com/zegl/kube-score/releases/download/v${{ env.KUBE_SCORE_VERSION }}/kube-score_${{ env.KUBE_SCORE_VERSION }}_linux_amd64 -O kube-score - chmod 755 kube-score - - - name: Kube-score generated manifests - run: helm template --values charts/.ci/values-kube-score.yaml charts/* | ./kube-score score - - --ignore-test pod-networkpolicy - --ignore-test deployment-has-poddisruptionbudget - --ignore-test deployment-has-host-podantiaffinity - --ignore-test container-security-context - --ignore-test pod-probes - --ignore-test container-image-tag - --enable-optional-test container-security-context-privileged - --enable-optional-test container-security-context-readonlyrootfilesystem - # python is a requirement for the chart-testing action below (supports yamllint among other tests) - uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: "3.11" - name: Set up chart-testing - uses: helm/chart-testing-action@v2.6.0 + # https://github.com/helm/chart-testing-action/releases/tag/v2.7.0 + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b - name: Run chart-testing (list-changed) id: list-changed @@ -87,7 +72,8 @@ jobs: ct lint --config charts/.ci/ct-config.yaml - name: Create kind cluster - uses: helm/kind-action@v1.4.0 + # https://github.com/helm/kind-action/releases/tag/v1.12.0 + uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 if: steps.list-changed.outputs.changed == 'true' # We need cert-manager already installed in the cluster because we assume the CRDs exist diff --git a/.github/workflows/arc-validate-runners.yaml b/.github/workflows/arc-validate-runners.yaml index 
9d559c37..5b7da04f 100644 --- a/.github/workflows/arc-validate-runners.yaml +++ b/.github/workflows/arc-validate-runners.yaml @@ -3,17 +3,17 @@ name: Validate ARC Runners on: pull_request: branches: - - '**' + - "**" paths: - - 'runner/**' - - 'test/startup/**' - - '!**.md' + - "runner/**" + - "test/startup/**" + - "!**.md" permissions: contents: read concurrency: - # This will make sure we only apply the concurrency limits on pull requests + # This will make sure we only apply the concurrency limits on pull requests # but not pushes to master branch by making the concurrency group name unique # for pushes group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -25,28 +25,16 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: shellcheck - uses: reviewdog/action-shellcheck@v1 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - path: "./runner" - pattern: | - *.sh - *.bash - update-status - # Make this consistent with `make shellsheck` - shellcheck_flags: "--shell bash --source-path runner" - exclude: "./.git/*" - check_all_files_with_shebangs: "false" - # Set this to "true" once we addressed all the shellcheck findings - fail_on_error: "false" + - name: "Run shellcheck" + run: make shellcheck + test-runner-entrypoint: name: Test entrypoint runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 - - name: Run tests - run: | - make acceptance/runner/startup + - name: Run tests + run: | + make acceptance/runner/startup diff --git a/.github/workflows/gha-publish-chart.yaml b/.github/workflows/gha-publish-chart.yaml index 251af8e8..572f5da3 100644 --- a/.github/workflows/gha-publish-chart.yaml +++ b/.github/workflows/gha-publish-chart.yaml @@ -4,27 +4,27 @@ on: workflow_dispatch: inputs: ref: - description: 'The branch, tag or SHA to cut a release from' + description: "The branch, tag or SHA to cut a release from" required: false type: string - default: '' + 
default: "" release_tag_name: - description: 'The name to tag the controller image with' + description: "The name to tag the controller image with" required: true type: string - default: 'canary' + default: "canary" push_to_registries: - description: 'Push images to registries' + description: "Push images to registries" required: true type: boolean default: false publish_gha_runner_scale_set_controller_chart: - description: 'Publish new helm chart for gha-runner-scale-set-controller' + description: "Publish new helm chart for gha-runner-scale-set-controller" required: true type: boolean default: false publish_gha_runner_scale_set_chart: - description: 'Publish new helm chart for gha-runner-scale-set' + description: "Publish new helm chart for gha-runner-scale-set" required: true type: boolean default: false @@ -72,10 +72,11 @@ jobs: echo "repository_owner=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + # https://github.com/docker/setup-qemu-action/releases/tag/v3.6.0 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 with: # Pinning v0.9.1 for Buildx and BuildKit v0.10.6 # BuildKit v0.11 which has a bug causing intermittent @@ -84,14 +85,16 @@ jobs: driver-opts: image=moby/buildkit:v0.10.6 - name: Login to GitHub Container Registry - uses: docker/login-action@v3 + # https://github.com/docker/login-action/releases/tag/v3.4.0 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build & push controller image - uses: docker/build-push-action@v5 + # https://github.com/docker/build-push-action/releases/tag/v6.15.0 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 with: file: 
Dockerfile platforms: linux/amd64,linux/arm64 @@ -140,7 +143,7 @@ jobs: echo "repository_owner=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - name: Set up Helm - # Using https://github.com/Azure/setup-helm/releases/tag/v4.2 + # Using https://github.com/Azure/setup-helm/releases/tag/v4.2.0 uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 with: version: ${{ env.HELM_VERSION }} @@ -188,7 +191,7 @@ jobs: echo "repository_owner=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - name: Set up Helm - # Using https://github.com/Azure/setup-helm/releases/tag/v4.2 + # Using https://github.com/Azure/setup-helm/releases/tag/v4.2.0 uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 with: version: ${{ env.HELM_VERSION }} diff --git a/.github/workflows/gha-validate-chart.yaml b/.github/workflows/gha-validate-chart.yaml index a33fd74e..4ff1e023 100644 --- a/.github/workflows/gha-validate-chart.yaml +++ b/.github/workflows/gha-validate-chart.yaml @@ -5,16 +5,16 @@ on: branches: - master paths: - - 'charts/**' - - '.github/workflows/gha-validate-chart.yaml' - - '!charts/actions-runner-controller/**' - - '!**.md' + - "charts/**" + - ".github/workflows/gha-validate-chart.yaml" + - "!charts/actions-runner-controller/**" + - "!**.md" push: paths: - - 'charts/**' - - '.github/workflows/gha-validate-chart.yaml' - - '!charts/actions-runner-controller/**' - - '!**.md' + - "charts/**" + - ".github/workflows/gha-validate-chart.yaml" + - "!charts/actions-runner-controller/**" + - "!**.md" workflow_dispatch: env: KUBE_SCORE_VERSION: 1.16.1 @@ -41,7 +41,7 @@ jobs: fetch-depth: 0 - name: Set up Helm - # Using https://github.com/Azure/setup-helm/releases/tag/v4.2 + # Using https://github.com/Azure/setup-helm/releases/tag/v4.2.0 uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 with: version: ${{ env.HELM_VERSION }} @@ -49,10 +49,11 @@ jobs: # python is a requirement for the 
chart-testing action below (supports yamllint among other tests) - uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: "3.11" - name: Set up chart-testing - uses: helm/chart-testing-action@v2.6.0 + # https://github.com/helm/chart-testing-action/releases/tag/v2.7.0 + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b - name: Run chart-testing (list-changed) id: list-changed @@ -68,13 +69,14 @@ jobs: ct lint --config charts/.ci/ct-config-gha.yaml - name: Set up docker buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 if: steps.list-changed.outputs.changed == 'true' with: version: latest - name: Build controller image - uses: docker/build-push-action@v5 + # https://github.com/docker/build-push-action/releases/tag/v6.15.0 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 if: steps.list-changed.outputs.changed == 'true' with: file: Dockerfile @@ -89,7 +91,8 @@ jobs: cache-to: type=gha,mode=max - name: Create kind cluster - uses: helm/kind-action@v1.4.0 + # https://github.com/helm/kind-action/releases/tag/v1.12.0 + uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 if: steps.list-changed.outputs.changed == 'true' with: cluster_name: chart-testing @@ -97,11 +100,11 @@ jobs: - name: Load image into cluster if: steps.list-changed.outputs.changed == 'true' run: | - export DOCKER_IMAGE_NAME=test-arc - export VERSION=dev - export IMG_RESULT=load - make docker-buildx - kind load docker-image test-arc:dev --name chart-testing + export DOCKER_IMAGE_NAME=test-arc + export VERSION=dev + export IMG_RESULT=load + make docker-buildx + kind load docker-image test-arc:dev --name chart-testing - name: Run chart-testing (install) if: steps.list-changed.outputs.changed == 'true' diff --git a/.github/workflows/global-publish-canary.yaml b/.github/workflows/global-publish-canary.yaml index ba4796a9..9d84a10e 100644 --- 
a/.github/workflows/global-publish-canary.yaml +++ b/.github/workflows/global-publish-canary.yaml @@ -7,30 +7,30 @@ on: branches: - master paths-ignore: - - '**.md' - - '.github/actions/**' - - '.github/ISSUE_TEMPLATE/**' - - '.github/workflows/e2e-test-dispatch-workflow.yaml' - - '.github/workflows/gha-e2e-tests.yaml' - - '.github/workflows/arc-publish.yaml' - - '.github/workflows/arc-publish-chart.yaml' - - '.github/workflows/gha-publish-chart.yaml' - - '.github/workflows/arc-release-runners.yaml' - - '.github/workflows/global-run-codeql.yaml' - - '.github/workflows/global-run-first-interaction.yaml' - - '.github/workflows/global-run-stale.yaml' - - '.github/workflows/arc-update-runners-scheduled.yaml' - - '.github/workflows/validate-arc.yaml' - - '.github/workflows/arc-validate-chart.yaml' - - '.github/workflows/gha-validate-chart.yaml' - - '.github/workflows/arc-validate-runners.yaml' - - '.github/dependabot.yml' - - '.github/RELEASE_NOTE_TEMPLATE.md' - - 'runner/**' - - '.gitignore' - - 'PROJECT' - - 'LICENSE' - - 'Makefile' + - "**.md" + - ".github/actions/**" + - ".github/ISSUE_TEMPLATE/**" + - ".github/workflows/e2e-test-dispatch-workflow.yaml" + - ".github/workflows/gha-e2e-tests.yaml" + - ".github/workflows/arc-publish.yaml" + - ".github/workflows/arc-publish-chart.yaml" + - ".github/workflows/gha-publish-chart.yaml" + - ".github/workflows/arc-release-runners.yaml" + - ".github/workflows/global-run-codeql.yaml" + - ".github/workflows/global-run-first-interaction.yaml" + - ".github/workflows/global-run-stale.yaml" + - ".github/workflows/arc-update-runners-scheduled.yaml" + - ".github/workflows/validate-arc.yaml" + - ".github/workflows/arc-validate-chart.yaml" + - ".github/workflows/gha-validate-chart.yaml" + - ".github/workflows/arc-validate-runners.yaml" + - ".github/dependabot.yml" + - ".github/RELEASE_NOTE_TEMPLATE.md" + - "runner/**" + - ".gitignore" + - "PROJECT" + - "LICENSE" + - "Makefile" # 
https://docs.github.com/en/rest/overview/permissions-required-for-github-apps permissions: @@ -59,6 +59,7 @@ jobs: - name: Get Token id: get_workflow_token + # https://github.com/peter-murray/workflow-application-token-action/releases/tag/v3.0.0 uses: peter-murray/workflow-application-token-action@dc0413987a085fa17d19df9e47d4677cf81ffef3 with: application_id: ${{ secrets.ACTIONS_ACCESS_APP_ID }} @@ -93,7 +94,8 @@ jobs: uses: actions/checkout@v4 - name: Login to GitHub Container Registry - uses: docker/login-action@v3 + # https://github.com/docker/login-action/releases/tag/v3.4.0 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 with: registry: ghcr.io username: ${{ github.actor }} @@ -110,16 +112,19 @@ jobs: echo "repository_owner=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + # https://github.com/docker/setup-qemu-action/releases/tag/v3.6.0 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + # https://github.com/docker/setup-buildx-action/releases/tag/v3.10.0 + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 with: version: latest # Unstable builds - run at your own risk - name: Build and Push - uses: docker/build-push-action@v5 + # https://github.com/docker/build-push-action/releases/tag/v6.15.0 + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 with: context: . 
file: ./Dockerfile diff --git a/.github/workflows/go.yaml b/.github/workflows/go.yaml index 40112c2c..703b56a5 100644 --- a/.github/workflows/go.yaml +++ b/.github/workflows/go.yaml @@ -4,16 +4,16 @@ on: branches: - master paths: - - '.github/workflows/go.yaml' - - '**.go' - - 'go.mod' - - 'go.sum' + - ".github/workflows/go.yaml" + - "**.go" + - "go.mod" + - "go.sum" pull_request: paths: - - '.github/workflows/go.yaml' - - '**.go' - - 'go.mod' - - 'go.sum' + - ".github/workflows/go.yaml" + - "**.go" + - "go.mod" + - "go.sum" permissions: contents: read @@ -32,7 +32,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: 'go.mod' + go-version-file: "go.mod" cache: false - name: fmt run: go fmt ./... @@ -45,13 +45,14 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: 'go.mod' + go-version-file: "go.mod" cache: false - name: golangci-lint - uses: golangci/golangci-lint-action@v6 + # https://github.com/golangci/golangci-lint-action/releases/tag/v7.0.0 + uses: golangci/golangci-lint-action@1481404843c368bc19ca9406f87d6e0fc97bdcfd with: only-new-issues: true - version: v1.55.2 + version: v2.1.2 generate: runs-on: ubuntu-latest @@ -59,7 +60,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: 'go.mod' + go-version-file: "go.mod" cache: false - name: Generate run: make generate @@ -72,7 +73,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: 'go.mod' + go-version-file: "go.mod" - run: make manifests - name: Check diff run: git diff --exit-code diff --git a/.golangci.yaml b/.golangci.yaml index eca46937..eec42912 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -1,19 +1,14 @@ +version: "2" run: - timeout: 3m -output: - formats: - - format: github-actions - path: stdout -linters-settings: - errcheck: - exclude-functions: - - (net/http.ResponseWriter).Write - - (*net/http.Server).Shutdown - - 
(*github.com/actions/actions-runner-controller/simulator.VisibleRunnerGroups).Add - - (*github.com/actions/actions-runner-controller/testing.Kind).Stop -issues: - exclude-rules: - - path: controllers/suite_test.go - linters: - - staticcheck - text: "SA1019" + timeout: 5m +linters: + settings: + errcheck: + exclude-functions: + - (net/http.ResponseWriter).Write + - (*net/http.Server).Shutdown + - (*github.com/actions/actions-runner-controller/simulator.VisibleRunnerGroups).Add + - (*github.com/actions/actions-runner-controller/testing.Kind).Stop + exclusions: + presets: + - std-error-handling diff --git a/Dockerfile b/Dockerfile index 3e64d273..329653d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM --platform=$BUILDPLATFORM golang:1.24.0 as builder +FROM --platform=$BUILDPLATFORM golang:1.24.3 AS builder WORKDIR /workspace @@ -30,7 +30,7 @@ ARG TARGETPLATFORM TARGETOS TARGETARCH TARGETVARIANT VERSION=dev COMMIT_SHA=dev # to avoid https://github.com/moby/buildkit/issues/2334 # We can use docker layer cache so the build is fast enogh anyway # We also use per-platform GOCACHE for the same reason. -ENV GOCACHE /build/${TARGETPLATFORM}/root/.cache/go-build +ENV GOCACHE="/build/${TARGETPLATFORM}/root/.cache/go-build" # Build RUN --mount=target=. 
\ diff --git a/Makefile b/Makefile index 134f2927..ce92592b 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ endif DOCKER_USER ?= $(shell echo ${DOCKER_IMAGE_NAME} | cut -d / -f1) VERSION ?= dev COMMIT_SHA = $(shell git rev-parse HEAD) -RUNNER_VERSION ?= 2.323.0 +RUNNER_VERSION ?= 2.325.0 TARGETPLATFORM ?= $(shell arch) RUNNER_NAME ?= ${DOCKER_USER}/actions-runner RUNNER_TAG ?= ${VERSION} @@ -20,7 +20,7 @@ KUBECONTEXT ?= kind-acceptance CLUSTER ?= acceptance CERT_MANAGER_VERSION ?= v1.1.1 KUBE_RBAC_PROXY_VERSION ?= v0.11.0 -SHELLCHECK_VERSION ?= 0.8.0 +SHELLCHECK_VERSION ?= 0.10.0 # Produce CRDs that work back to Kubernetes 1.11 (no version conversion) CRD_OPTIONS ?= "crd:generateEmbeddedObjectMeta=true,allowDangerousTypes=true" @@ -68,7 +68,7 @@ endif all: manager lint: - docker run --rm -v $(PWD):/app -w /app golangci/golangci-lint:v1.57.2 golangci-lint run + docker run --rm -v $(PWD):/app -w /app golangci/golangci-lint:v2.1.2 golangci-lint run GO_TEST_ARGS ?= -short @@ -204,7 +204,7 @@ generate: controller-gen # Run shellcheck on runner scripts shellcheck: shellcheck-install - $(TOOLS_PATH)/shellcheck --shell bash --source-path runner runner/*.sh hack/*.sh + $(TOOLS_PATH)/shellcheck --shell bash --source-path runner runner/*.sh runner/update-status hack/*.sh docker-buildx: export DOCKER_CLI_EXPERIMENTAL=enabled ;\ @@ -310,7 +310,7 @@ github-release: release # Otherwise we get errors like the below: # Error: failed to install CRD crds/actions.summerwind.dev_runnersets.yaml: CustomResourceDefinition.apiextensions.k8s.io "runnersets.actions.summerwind.dev" is invalid: [spec.validation.openAPIV3Schema.properties[spec].properties[template].properties[spec].properties[containers].items.properties[ports].items.properties[protocol].default: Required value: this property is in x-kubernetes-list-map-keys, so it must have a default or be a required property, 
spec.validation.openAPIV3Schema.properties[spec].properties[template].properties[spec].properties[initContainers].items.properties[ports].items.properties[protocol].default: Required value: this property is in x-kubernetes-list-map-keys, so it must have a default or be a required property] # -# Note that controller-gen newer than 0.6.2 is needed due to https://github.com/kubernetes-sigs/controller-tools/issues/448 +# Note that controller-gen newer than 0.7.0 is needed due to https://github.com/kubernetes-sigs/controller-tools/issues/448 # Otherwise ObjectMeta embedded in Spec results in empty on the storage. controller-gen: ifeq (, $(shell which controller-gen)) diff --git a/acceptance/pipelines/eks-integration-tests.yaml b/acceptance/pipelines/eks-integration-tests.yaml index a0ed5e65..0fb86e95 100644 --- a/acceptance/pipelines/eks-integration-tests.yaml +++ b/acceptance/pipelines/eks-integration-tests.yaml @@ -5,22 +5,23 @@ on: env: IRSA_ROLE_ARN: - ASSUME_ROLE_ARN: - AWS_REGION: + ASSUME_ROLE_ARN: + AWS_REGION: jobs: assume-role-in-runner-test: - runs-on: ['self-hosted', 'Linux'] + runs-on: ["self-hosted", "Linux"] steps: - name: Test aws-actions/configure-aws-credentials Action - uses: aws-actions/configure-aws-credentials@v1 + # https://github.com/aws-actions/configure-aws-credentials/releases/tag/v4.1.0 + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ env.ASSUME_ROLE_ARN }} role-duration-seconds: 900 assume-role-in-container-test: - runs-on: ['self-hosted', 'Linux'] - container: + runs-on: ["self-hosted", "Linux"] + container: image: amazon/aws-cli env: AWS_WEB_IDENTITY_TOKEN_FILE: /var/run/secrets/eks.amazonaws.com/serviceaccount/token @@ -29,7 +30,8 @@ jobs: - /var/run/secrets/eks.amazonaws.com/serviceaccount/token:/var/run/secrets/eks.amazonaws.com/serviceaccount/token steps: - name: Test aws-actions/configure-aws-credentials Action in container - uses: 
aws-actions/configure-aws-credentials@v1 + # https://github.com/aws-actions/configure-aws-credentials/releases/tag/v4.1.0 + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ env.ASSUME_ROLE_ARN }} diff --git a/acceptance/pipelines/runner-integration-tests.yaml b/acceptance/pipelines/runner-integration-tests.yaml index 63b42a97..e85f2ffa 100644 --- a/acceptance/pipelines/runner-integration-tests.yaml +++ b/acceptance/pipelines/runner-integration-tests.yaml @@ -8,8 +8,8 @@ env: jobs: run-step-in-container-test: - runs-on: ['self-hosted', 'Linux'] - container: + runs-on: ["self-hosted", "Linux"] + container: image: alpine steps: - name: Test we are working in the container @@ -21,7 +21,7 @@ jobs: exit 1 fi setup-python-test: - runs-on: ['self-hosted', 'Linux'] + runs-on: ["self-hosted", "Linux"] steps: - name: Print native Python environment run: | @@ -41,12 +41,12 @@ jobs: echo "Python version detected : $(python --version 2>&1)" fi setup-node-test: - runs-on: ['self-hosted', 'Linux'] + runs-on: ["self-hosted", "Linux"] steps: - uses: actions/setup-node@v2 with: - node-version: '12' - - name: Test actions/setup-node works + node-version: "12" + - name: Test actions/setup-node works run: | VERSION=$(node --version | cut -c 2- | cut -d '.' -f1) if [[ $VERSION != '12' ]]; then @@ -57,13 +57,14 @@ jobs: echo "Node version detected : $(node --version 2>&1)" fi setup-ruby-test: - runs-on: ['self-hosted', 'Linux'] + runs-on: ["self-hosted", "Linux"] steps: - - uses: ruby/setup-ruby@v1 + # https://github.com/ruby/setup-ruby/releases/tag/v1.227.0 + - uses: ruby/setup-ruby@1a615958ad9d422dd932dc1d5823942ee002799f with: ruby-version: 3.0 bundler-cache: true - - name: Test ruby/setup-ruby works + - name: Test ruby/setup-ruby works run: | VERSION=$(ruby --version | cut -d ' ' -f2 | cut -d '.' 
-f1-2) if [[ $VERSION != '3.0' ]]; then @@ -74,8 +75,8 @@ jobs: echo "Ruby version detected : $(ruby --version 2>&1)" fi python-shell-test: - runs-on: ['self-hosted', 'Linux'] - steps: + runs-on: ["self-hosted", "Linux"] + steps: - name: Test Python shell works run: | import os diff --git a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go index e34b255e..0c13bf7c 100644 --- a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go +++ b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go @@ -119,7 +119,7 @@ type EphemeralRunnerStatus struct { RunnerJITConfig string `json:"runnerJITConfig,omitempty"` // +optional - Failures map[string]bool `json:"failures,omitempty"` + Failures map[string]metav1.Time `json:"failures,omitempty"` // +optional JobRequestId int64 `json:"jobRequestId,omitempty"` @@ -137,6 +137,20 @@ type EphemeralRunnerStatus struct { JobDisplayName string `json:"jobDisplayName,omitempty"` } +func (s *EphemeralRunnerStatus) LastFailure() metav1.Time { + var maxTime metav1.Time + if len(s.Failures) == 0 { + return maxTime + } + + for _, ts := range s.Failures { + if ts.After(maxTime.Time) { + maxTime = ts + } + } + return maxTime +} + // +kubebuilder:object:root=true // EphemeralRunnerList contains a list of EphemeralRunner diff --git a/apis/actions.github.com/v1alpha1/version.go b/apis/actions.github.com/v1alpha1/version.go new file mode 100644 index 00000000..731c6011 --- /dev/null +++ b/apis/actions.github.com/v1alpha1/version.go @@ -0,0 +1,72 @@ +package v1alpha1 + +import "strings" + +func IsVersionAllowed(resourceVersion, buildVersion string) bool { + if buildVersion == "dev" || resourceVersion == buildVersion || strings.HasPrefix(buildVersion, "canary-") { + return true + } + + rv, ok := parseSemver(resourceVersion) + if !ok { + return false + } + bv, ok := parseSemver(buildVersion) + if !ok { + return false + } + return rv.major == bv.major && rv.minor == bv.minor +} + 
+type semver struct { + major string + minor string +} + +func parseSemver(v string) (p semver, ok bool) { + if v == "" { + return + } + p.major, v, ok = parseInt(v) + if !ok { + return p, false + } + if v == "" { + p.minor = "0" + return p, true + } + if v[0] != '.' { + return p, false + } + p.minor, v, ok = parseInt(v[1:]) + if !ok { + return p, false + } + if v == "" { + return p, true + } + if v[0] != '.' { + return p, false + } + if _, _, ok = parseInt(v[1:]); !ok { + return p, false + } + return p, true +} + +func parseInt(v string) (t, rest string, ok bool) { + if v == "" { + return + } + if v[0] < '0' || '9' < v[0] { + return + } + i := 1 + for i < len(v) && '0' <= v[i] && v[i] <= '9' { + i++ + } + if v[0] == '0' && i != 1 { + return + } + return v[:i], v[i:], true +} diff --git a/apis/actions.github.com/v1alpha1/version_test.go b/apis/actions.github.com/v1alpha1/version_test.go new file mode 100644 index 00000000..8b4e8025 --- /dev/null +++ b/apis/actions.github.com/v1alpha1/version_test.go @@ -0,0 +1,60 @@ +package v1alpha1_test + +import ( + "testing" + + "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" + "github.com/stretchr/testify/assert" +) + +func TestIsVersionAllowed(t *testing.T) { + t.Parallel() + tt := map[string]struct { + resourceVersion string + buildVersion string + want bool + }{ + "dev should always be allowed": { + resourceVersion: "0.11.0", + buildVersion: "dev", + want: true, + }, + "resourceVersion is not semver": { + resourceVersion: "dev", + buildVersion: "0.11.0", + want: false, + }, + "buildVersion is not semver": { + resourceVersion: "0.11.0", + buildVersion: "NA", + want: false, + }, + "major version mismatch": { + resourceVersion: "0.11.0", + buildVersion: "1.11.0", + want: false, + }, + "minor version mismatch": { + resourceVersion: "0.11.0", + buildVersion: "0.10.0", + want: false, + }, + "patch version mismatch": { + resourceVersion: "0.11.1", + buildVersion: "0.11.0", + want: true, + }, + 
"arbitrary version match": { + resourceVersion: "abc", + buildVersion: "abc", + want: true, + }, + } + + for name, tc := range tt { + t.Run(name, func(t *testing.T) { + got := v1alpha1.IsVersionAllowed(tc.resourceVersion, tc.buildVersion) + assert.Equal(t, tc.want, got) + }) + } +} diff --git a/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go b/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go index dd7553f0..b0947659 100644 --- a/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go +++ b/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ package v1alpha1 import ( "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -459,9 +460,9 @@ func (in *EphemeralRunnerStatus) DeepCopyInto(out *EphemeralRunnerStatus) { *out = *in if in.Failures != nil { in, out := &in.Failures, &out.Failures - *out = make(map[string]bool, len(*in)) + *out = make(map[string]metav1.Time, len(*in)) for key, val := range *in { - (*out)[key] = val + (*out)[key] = *val.DeepCopy() } } } diff --git a/apis/actions.summerwind.net/v1alpha1/runner_types.go b/apis/actions.summerwind.net/v1alpha1/runner_types.go index 2d988c24..a1d59ee0 100644 --- a/apis/actions.summerwind.net/v1alpha1/runner_types.go +++ b/apis/actions.summerwind.net/v1alpha1/runner_types.go @@ -215,10 +215,10 @@ func (rs *RunnerSpec) validateRepository() error { foundCount += 1 } if foundCount == 0 { - return errors.New("Spec needs enterprise, organization or repository") + return errors.New("spec needs enterprise, organization or repository") } if foundCount > 1 { - return errors.New("Spec cannot have many fields defined enterprise, organization and repository") + return errors.New("spec cannot have many fields defined enterprise, organization and repository") } return nil diff --git a/charts/.ci/ct-config-gha.yaml b/charts/.ci/ct-config-gha.yaml index baf8bc43..b0a15a37 100644 --- a/charts/.ci/ct-config-gha.yaml +++ 
b/charts/.ci/ct-config-gha.yaml @@ -1,9 +1,11 @@ # This file defines the config for "ct" (chart tester) used by the helm linting GitHub workflow +remote: origin +target-branch: master lint-conf: charts/.ci/lint-config.yaml chart-repos: - jetstack=https://charts.jetstack.io check-version-increment: false # Disable checking that the chart version has been bumped charts: -- charts/gha-runner-scale-set-controller -- charts/gha-runner-scale-set + - charts/gha-runner-scale-set-controller + - charts/gha-runner-scale-set skip-clean-up: true diff --git a/charts/.ci/ct-config.yaml b/charts/.ci/ct-config.yaml index 55ebad54..45be8be9 100644 --- a/charts/.ci/ct-config.yaml +++ b/charts/.ci/ct-config.yaml @@ -1,7 +1,9 @@ # This file defines the config for "ct" (chart tester) used by the helm linting GitHub workflow +remote: origin +target-branch: master lint-conf: charts/.ci/lint-config.yaml chart-repos: - jetstack=https://charts.jetstack.io check-version-increment: false # Disable checking that the chart version has been bumped charts: -- charts/actions-runner-controller + - charts/actions-runner-controller diff --git a/charts/.ci/scripts/local-kube-score.sh b/charts/.ci/scripts/local-kube-score.sh index 3982b388..a8592dfd 100755 --- a/charts/.ci/scripts/local-kube-score.sh +++ b/charts/.ci/scripts/local-kube-score.sh @@ -1,6 +1,5 @@ #!/bin/bash - for chart in `ls charts`; do helm template --values charts/$chart/ci/ci-values.yaml charts/$chart | kube-score score - \ @@ -12,4 +11,4 @@ helm template --values charts/$chart/ci/ci-values.yaml charts/$chart | kube-scor --enable-optional-test container-security-context-privileged \ --enable-optional-test container-security-context-readonlyrootfilesystem \ --ignore-test container-security-context -done \ No newline at end of file +done diff --git a/charts/actions-runner-controller/README.md b/charts/actions-runner-controller/README.md index e552d8be..9387464b 100644 --- a/charts/actions-runner-controller/README.md +++ 
b/charts/actions-runner-controller/README.md @@ -44,7 +44,7 @@ All additional docs are kept in the `docs/` folder, this README is solely for do | `image.pullPolicy` | The pull policy of the controller image | IfNotPresent | | `metrics.serviceMonitor.enable` | Deploy serviceMonitor kind for for use with prometheus-operator CRDs | false | | `metrics.serviceMonitor.interval` | Configure the interval that Prometheus should scrap the controller's metrics | 1m | -| `metrics.serviceMonitor.namespace | Namespace which Prometheus is running in | `Release.Namespace` (the default namespace of the helm chart). | +| `metrics.serviceMonitor.namespace` | Namespace which Prometheus is running in | `Release.Namespace` (the default namespace of the helm chart). | | `metrics.serviceMonitor.timeout` | Configure the timeout the timeout of Prometheus scrapping. | 30s | | `metrics.serviceAnnotations` | Set annotations for the provisioned metrics service resource | | | `metrics.port` | Set port of metrics service | 8443 | diff --git a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml index e1505280..f7cf1139 100644 --- a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml +++ b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml @@ -7794,7 +7794,8 @@ spec: properties: failures: additionalProperties: - type: boolean + format: date-time + type: string type: object jobDisplayName: type: string diff --git a/charts/gha-runner-scale-set/templates/_helpers.tpl b/charts/gha-runner-scale-set/templates/_helpers.tpl index ef75a3f2..c9729235 100644 --- a/charts/gha-runner-scale-set/templates/_helpers.tpl +++ b/charts/gha-runner-scale-set/templates/_helpers.tpl @@ -106,6 +106,9 @@ env: value: "123" securityContext: privileged: true +{{- if (ge (.Capabilities.KubeVersion.Minor | int) 29) }} +restartPolicy: Always +{{- 
end }} volumeMounts: - name: work mountPath: /home/runner/_work diff --git a/charts/gha-runner-scale-set/templates/autoscalingrunnerset.yaml b/charts/gha-runner-scale-set/templates/autoscalingrunnerset.yaml index f2f094d1..3008b28c 100644 --- a/charts/gha-runner-scale-set/templates/autoscalingrunnerset.yaml +++ b/charts/gha-runner-scale-set/templates/autoscalingrunnerset.yaml @@ -149,6 +149,10 @@ spec: - name: init-dind-externals {{- include "gha-runner-scale-set.dind-init-container" . | nindent 8 }} {{- end }} + {{- if (ge (.Capabilities.KubeVersion.Minor | int) 29) }} + - name: dind + {{- include "gha-runner-scale-set.dind-container" . | nindent 8 }} + {{- end }} {{- with .Values.template.spec.initContainers }} {{- toYaml . | nindent 6 }} {{- end }} @@ -157,8 +161,10 @@ spec: {{- if eq $containerMode.type "dind" }} - name: runner {{- include "gha-runner-scale-set.dind-runner-container" . | nindent 8 }} + {{- if not (ge (.Capabilities.KubeVersion.Minor | int) 29) }} - name: dind {{- include "gha-runner-scale-set.dind-container" . | nindent 8 }} + {{- end }} {{- include "gha-runner-scale-set.non-runner-non-dind-containers" . 
| nindent 6 }} {{- else if eq $containerMode.type "kubernetes" }} - name: runner diff --git a/charts/gha-runner-scale-set/tests/template_test.go b/charts/gha-runner-scale-set/tests/template_test.go index 790f44e0..72220866 100644 --- a/charts/gha-runner-scale-set/tests/template_test.go +++ b/charts/gha-runner-scale-set/tests/template_test.go @@ -728,20 +728,20 @@ func TestTemplateRenderedAutoScalingRunnerSet_DinD_ExtraInitContainers(t *testin var ars v1alpha1.AutoscalingRunnerSet helm.UnmarshalK8SYaml(t, output, &ars) - assert.Len(t, ars.Spec.Template.Spec.InitContainers, 3, "InitContainers should be 3") - assert.Equal(t, "kube-init", ars.Spec.Template.Spec.InitContainers[1].Name, "InitContainers[1] Name should be kube-init") - assert.Equal(t, "runner-image:latest", ars.Spec.Template.Spec.InitContainers[1].Image, "InitContainers[1] Image should be runner-image:latest") - assert.Equal(t, "sudo", ars.Spec.Template.Spec.InitContainers[1].Command[0], "InitContainers[1] Command[0] should be sudo") - assert.Equal(t, "chown", ars.Spec.Template.Spec.InitContainers[1].Command[1], "InitContainers[1] Command[1] should be chown") - assert.Equal(t, "-R", ars.Spec.Template.Spec.InitContainers[1].Command[2], "InitContainers[1] Command[2] should be -R") - assert.Equal(t, "1001:123", ars.Spec.Template.Spec.InitContainers[1].Command[3], "InitContainers[1] Command[3] should be 1001:123") - assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.InitContainers[1].Command[4], "InitContainers[1] Command[4] should be /home/runner/_work") - assert.Equal(t, "work", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[0].Name, "InitContainers[1] VolumeMounts[0] Name should be work") - assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[0].MountPath, "InitContainers[1] VolumeMounts[0] MountPath should be /home/runner/_work") + assert.Len(t, ars.Spec.Template.Spec.InitContainers, 4, "InitContainers should be 4") + assert.Equal(t, "kube-init", 
ars.Spec.Template.Spec.InitContainers[2].Name, "InitContainers[1] Name should be kube-init") + assert.Equal(t, "runner-image:latest", ars.Spec.Template.Spec.InitContainers[2].Image, "InitContainers[1] Image should be runner-image:latest") + assert.Equal(t, "sudo", ars.Spec.Template.Spec.InitContainers[2].Command[0], "InitContainers[1] Command[0] should be sudo") + assert.Equal(t, "chown", ars.Spec.Template.Spec.InitContainers[2].Command[1], "InitContainers[1] Command[1] should be chown") + assert.Equal(t, "-R", ars.Spec.Template.Spec.InitContainers[2].Command[2], "InitContainers[1] Command[2] should be -R") + assert.Equal(t, "1001:123", ars.Spec.Template.Spec.InitContainers[2].Command[3], "InitContainers[1] Command[3] should be 1001:123") + assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.InitContainers[2].Command[4], "InitContainers[1] Command[4] should be /home/runner/_work") + assert.Equal(t, "work", ars.Spec.Template.Spec.InitContainers[2].VolumeMounts[0].Name, "InitContainers[1] VolumeMounts[0] Name should be work") + assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.InitContainers[2].VolumeMounts[0].MountPath, "InitContainers[1] VolumeMounts[0] MountPath should be /home/runner/_work") - assert.Equal(t, "ls", ars.Spec.Template.Spec.InitContainers[2].Name, "InitContainers[2] Name should be ls") - assert.Equal(t, "ubuntu:latest", ars.Spec.Template.Spec.InitContainers[2].Image, "InitContainers[2] Image should be ubuntu:latest") - assert.Equal(t, "ls", ars.Spec.Template.Spec.InitContainers[2].Command[0], "InitContainers[2] Command[0] should be ls") + assert.Equal(t, "ls", ars.Spec.Template.Spec.InitContainers[3].Name, "InitContainers[2] Name should be ls") + assert.Equal(t, "ubuntu:latest", ars.Spec.Template.Spec.InitContainers[3].Image, "InitContainers[2] Image should be ubuntu:latest") + assert.Equal(t, "ls", ars.Spec.Template.Spec.InitContainers[3].Command[0], "InitContainers[2] Command[0] should be ls") } func 
TestTemplateRenderedAutoScalingRunnerSet_DinD_ExtraVolumes(t *testing.T) { @@ -860,13 +860,26 @@ func TestTemplateRenderedAutoScalingRunnerSet_EnableDinD(t *testing.T) { assert.NotNil(t, ars.Spec.Template.Spec, "Template.Spec should not be nil") - assert.Len(t, ars.Spec.Template.Spec.InitContainers, 1, "Template.Spec should have 1 init container") + assert.Len(t, ars.Spec.Template.Spec.InitContainers, 2, "Template.Spec should have 2 init container") assert.Equal(t, "init-dind-externals", ars.Spec.Template.Spec.InitContainers[0].Name) assert.Equal(t, "ghcr.io/actions/actions-runner:latest", ars.Spec.Template.Spec.InitContainers[0].Image) assert.Equal(t, "cp", ars.Spec.Template.Spec.InitContainers[0].Command[0]) assert.Equal(t, "-r /home/runner/externals/. /home/runner/tmpDir/", strings.Join(ars.Spec.Template.Spec.InitContainers[0].Args, " ")) - assert.Len(t, ars.Spec.Template.Spec.Containers, 2, "Template.Spec should have 2 container") + assert.Equal(t, "dind", ars.Spec.Template.Spec.InitContainers[1].Name) + assert.Equal(t, "docker:dind", ars.Spec.Template.Spec.InitContainers[1].Image) + assert.True(t, *ars.Spec.Template.Spec.InitContainers[1].SecurityContext.Privileged) + assert.Len(t, ars.Spec.Template.Spec.InitContainers[1].VolumeMounts, 3, "The dind container should have 3 volume mounts, dind-sock, work and externals") + assert.Equal(t, "work", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[0].Name) + assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[0].MountPath) + + assert.Equal(t, "dind-sock", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[1].Name) + assert.Equal(t, "/var/run", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[1].MountPath) + + assert.Equal(t, "dind-externals", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[2].Name) + assert.Equal(t, "/home/runner/externals", ars.Spec.Template.Spec.InitContainers[1].VolumeMounts[2].MountPath) + + assert.Len(t, 
ars.Spec.Template.Spec.Containers, 1, "Template.Spec should have 1 container") assert.Equal(t, "runner", ars.Spec.Template.Spec.Containers[0].Name) assert.Equal(t, "ghcr.io/actions/actions-runner:latest", ars.Spec.Template.Spec.Containers[0].Image) assert.Len(t, ars.Spec.Template.Spec.Containers[0].Env, 2, "The runner container should have 2 env vars, DOCKER_HOST and RUNNER_WAIT_FOR_DOCKER_IN_SECONDS") @@ -883,19 +896,6 @@ func TestTemplateRenderedAutoScalingRunnerSet_EnableDinD(t *testing.T) { assert.Equal(t, "dind-sock", ars.Spec.Template.Spec.Containers[0].VolumeMounts[1].Name) assert.Equal(t, "/var/run", ars.Spec.Template.Spec.Containers[0].VolumeMounts[1].MountPath) - assert.Equal(t, "dind", ars.Spec.Template.Spec.Containers[1].Name) - assert.Equal(t, "docker:dind", ars.Spec.Template.Spec.Containers[1].Image) - assert.True(t, *ars.Spec.Template.Spec.Containers[1].SecurityContext.Privileged) - assert.Len(t, ars.Spec.Template.Spec.Containers[1].VolumeMounts, 3, "The dind container should have 3 volume mounts, dind-sock, work and externals") - assert.Equal(t, "work", ars.Spec.Template.Spec.Containers[1].VolumeMounts[0].Name) - assert.Equal(t, "/home/runner/_work", ars.Spec.Template.Spec.Containers[1].VolumeMounts[0].MountPath) - - assert.Equal(t, "dind-sock", ars.Spec.Template.Spec.Containers[1].VolumeMounts[1].Name) - assert.Equal(t, "/var/run", ars.Spec.Template.Spec.Containers[1].VolumeMounts[1].MountPath) - - assert.Equal(t, "dind-externals", ars.Spec.Template.Spec.Containers[1].VolumeMounts[2].Name) - assert.Equal(t, "/home/runner/externals", ars.Spec.Template.Spec.Containers[1].VolumeMounts[2].MountPath) - assert.Len(t, ars.Spec.Template.Spec.Volumes, 3, "Volumes should be 3") assert.Equal(t, "dind-sock", ars.Spec.Template.Spec.Volumes[0].Name, "Volume name should be dind-sock") assert.Equal(t, "dind-externals", ars.Spec.Template.Spec.Volumes[1].Name, "Volume name should be dind-externals") @@ -1178,7 +1178,7 @@ func TestTemplateRenderedWithTLS(t 
*testing.T) { } } require.NotNil(t, volume) - assert.Equal(t, "certs-configmap", volume.ConfigMap.LocalObjectReference.Name) + assert.Equal(t, "certs-configmap", volume.ConfigMap.Name) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Key) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Path) @@ -1238,7 +1238,7 @@ func TestTemplateRenderedWithTLS(t *testing.T) { } } require.NotNil(t, volume) - assert.Equal(t, "certs-configmap", volume.ConfigMap.LocalObjectReference.Name) + assert.Equal(t, "certs-configmap", volume.ConfigMap.Name) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Key) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Path) @@ -1298,7 +1298,7 @@ func TestTemplateRenderedWithTLS(t *testing.T) { } } require.NotNil(t, volume) - assert.Equal(t, "certs-configmap", volume.ConfigMap.LocalObjectReference.Name) + assert.Equal(t, "certs-configmap", volume.ConfigMap.Name) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Key) assert.Equal(t, "cert.pem", volume.ConfigMap.Items[0].Path) @@ -1826,7 +1826,7 @@ func TestTemplateRenderedAutoScalingRunnerSet_DinDMergePodSpec(t *testing.T) { var ars v1alpha1.AutoscalingRunnerSet helm.UnmarshalK8SYaml(t, output, &ars) - assert.Len(t, ars.Spec.Template.Spec.Containers, 2, "There should be 2 containers") + assert.Len(t, ars.Spec.Template.Spec.Containers, 1, "There should be 1 containers") assert.Equal(t, "runner", ars.Spec.Template.Spec.Containers[0].Name, "Container name should be runner") assert.Equal(t, "250m", ars.Spec.Template.Spec.Containers[0].Resources.Limits.Cpu().String(), "CPU Limit should be set") assert.Equal(t, "64Mi", ars.Spec.Template.Spec.Containers[0].Resources.Limits.Memory().String(), "Memory Limit should be set") diff --git a/charts/gha-runner-scale-set/values.yaml b/charts/gha-runner-scale-set/values.yaml index e3f69992..35922f84 100644 --- a/charts/gha-runner-scale-set/values.yaml +++ b/charts/gha-runner-scale-set/values.yaml @@ -17,6 +17,7 @@ githubConfigSecret: ## (Variation 
B) When using a GitHub App, the syntax is as follows: # githubConfigSecret: # # NOTE: IDs MUST be strings, use quotes +# # The github_app_id can be an app_id or the client_id # github_app_id: "" # github_app_installation_id: "" # github_app_private_key: | diff --git a/cmd/ghalistener/config/config.go b/cmd/ghalistener/config/config.go index b2fa0acd..905249aa 100644 --- a/cmd/ghalistener/config/config.go +++ b/cmd/ghalistener/config/config.go @@ -17,8 +17,9 @@ import ( ) type Config struct { - ConfigureUrl string `json:"configure_url"` - AppID int64 `json:"app_id"` + ConfigureUrl string `json:"configure_url"` + // AppID can be an ID of the app or the client ID + AppID string `json:"app_id"` AppInstallationID int64 `json:"app_installation_id"` AppPrivateKey string `json:"app_private_key"` Token string `json:"token"` @@ -62,26 +63,26 @@ func (c *Config) Validate() error { } if len(c.EphemeralRunnerSetNamespace) == 0 || len(c.EphemeralRunnerSetName) == 0 { - return fmt.Errorf("EphemeralRunnerSetNamespace '%s' or EphemeralRunnerSetName '%s' is missing", c.EphemeralRunnerSetNamespace, c.EphemeralRunnerSetName) + return fmt.Errorf("EphemeralRunnerSetNamespace %q or EphemeralRunnerSetName %q is missing", c.EphemeralRunnerSetNamespace, c.EphemeralRunnerSetName) } if c.RunnerScaleSetId == 0 { - return fmt.Errorf("RunnerScaleSetId '%d' is missing", c.RunnerScaleSetId) + return fmt.Errorf(`RunnerScaleSetId "%d" is missing`, c.RunnerScaleSetId) } if c.MaxRunners < c.MinRunners { - return fmt.Errorf("MinRunners '%d' cannot be greater than MaxRunners '%d'", c.MinRunners, c.MaxRunners) + return fmt.Errorf(`MinRunners "%d" cannot be greater than MaxRunners "%d"`, c.MinRunners, c.MaxRunners) } hasToken := len(c.Token) > 0 - hasPrivateKeyConfig := c.AppID > 0 && c.AppPrivateKey != "" + hasPrivateKeyConfig := len(c.AppID) > 0 && c.AppPrivateKey != "" if !hasToken && !hasPrivateKeyConfig { - return fmt.Errorf("GitHub auth credential is missing, token length: '%d', appId: '%d', 
installationId: '%d', private key length: '%d", len(c.Token), c.AppID, c.AppInstallationID, len(c.AppPrivateKey)) + return fmt.Errorf(`GitHub auth credential is missing, token length: "%d", appId: %q, installationId: "%d", private key length: "%d"`, len(c.Token), c.AppID, c.AppInstallationID, len(c.AppPrivateKey)) } if hasToken && hasPrivateKeyConfig { - return fmt.Errorf("only one GitHub auth method supported at a time. Have both PAT and App auth: token length: '%d', appId: '%d', installationId: '%d', private key length: '%d", len(c.Token), c.AppID, c.AppInstallationID, len(c.AppPrivateKey)) + return fmt.Errorf(`only one GitHub auth method supported at a time. Have both PAT and App auth: token length: "%d", appId: %q, installationId: "%d", private key length: "%d"`, len(c.Token), c.AppID, c.AppInstallationID, len(c.AppPrivateKey)) } return nil diff --git a/cmd/ghalistener/config/config_test.go b/cmd/ghalistener/config/config_test.go index fba4f17c..f62b4b73 100644 --- a/cmd/ghalistener/config/config_test.go +++ b/cmd/ghalistener/config/config_test.go @@ -18,7 +18,7 @@ func TestConfigValidationMinMax(t *testing.T) { Token: "token", } err := config.Validate() - assert.ErrorContains(t, err, "MinRunners '5' cannot be greater than MaxRunners '2", "Expected error about MinRunners > MaxRunners") + assert.ErrorContains(t, err, `MinRunners "5" cannot be greater than MaxRunners "2"`, "Expected error about MinRunners > MaxRunners") } func TestConfigValidationMissingToken(t *testing.T) { @@ -29,27 +29,47 @@ func TestConfigValidationMissingToken(t *testing.T) { RunnerScaleSetId: 1, } err := config.Validate() - expectedError := fmt.Sprintf("GitHub auth credential is missing, token length: '%d', appId: '%d', installationId: '%d', private key length: '%d", len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) + expectedError := fmt.Sprintf(`GitHub auth credential is missing, token length: "%d", appId: %q, installationId: "%d", private key length: 
"%d"`, len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) assert.ErrorContains(t, err, expectedError, "Expected error about missing auth") } func TestConfigValidationAppKey(t *testing.T) { - config := &Config{ - AppID: 1, - AppInstallationID: 10, - ConfigureUrl: "github.com/some_org/some_repo", - EphemeralRunnerSetNamespace: "namespace", - EphemeralRunnerSetName: "deployment", - RunnerScaleSetId: 1, - } - err := config.Validate() - expectedError := fmt.Sprintf("GitHub auth credential is missing, token length: '%d', appId: '%d', installationId: '%d', private key length: '%d", len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) - assert.ErrorContains(t, err, expectedError, "Expected error about missing auth") + t.Parallel() + + t.Run("app id integer", func(t *testing.T) { + t.Parallel() + config := &Config{ + AppID: "1", + AppInstallationID: 10, + ConfigureUrl: "github.com/some_org/some_repo", + EphemeralRunnerSetNamespace: "namespace", + EphemeralRunnerSetName: "deployment", + RunnerScaleSetId: 1, + } + err := config.Validate() + expectedError := fmt.Sprintf(`GitHub auth credential is missing, token length: "%d", appId: %q, installationId: "%d", private key length: "%d"`, len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) + assert.ErrorContains(t, err, expectedError, "Expected error about missing auth") + }) + + t.Run("app id as client id", func(t *testing.T) { + t.Parallel() + config := &Config{ + AppID: "Iv23f8doAlphaNumer1c", + AppInstallationID: 10, + ConfigureUrl: "github.com/some_org/some_repo", + EphemeralRunnerSetNamespace: "namespace", + EphemeralRunnerSetName: "deployment", + RunnerScaleSetId: 1, + } + err := config.Validate() + expectedError := fmt.Sprintf(`GitHub auth credential is missing, token length: "%d", appId: %q, installationId: "%d", private key length: "%d"`, len(config.Token), config.AppID, config.AppInstallationID, 
len(config.AppPrivateKey)) + assert.ErrorContains(t, err, expectedError, "Expected error about missing auth") + }) } func TestConfigValidationOnlyOneTypeOfCredentials(t *testing.T) { config := &Config{ - AppID: 1, + AppID: "1", AppInstallationID: 10, AppPrivateKey: "asdf", Token: "asdf", @@ -59,7 +79,7 @@ func TestConfigValidationOnlyOneTypeOfCredentials(t *testing.T) { RunnerScaleSetId: 1, } err := config.Validate() - expectedError := fmt.Sprintf("only one GitHub auth method supported at a time. Have both PAT and App auth: token length: '%d', appId: '%d', installationId: '%d', private key length: '%d", len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) + expectedError := fmt.Sprintf(`only one GitHub auth method supported at a time. Have both PAT and App auth: token length: "%d", appId: %q, installationId: "%d", private key length: "%d"`, len(config.Token), config.AppID, config.AppInstallationID, len(config.AppPrivateKey)) assert.ErrorContains(t, err, expectedError, "Expected error about missing auth") } diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 9f3b8591..5dad8b97 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -287,7 +287,7 @@ func (e *exporter) ListenAndServe(ctx context.Context) error { } func (e *exporter) setGauge(name string, allLabels prometheus.Labels, val float64) { - m, ok := e.metrics.gauges[name] + m, ok := e.gauges[name] if !ok { return } @@ -299,7 +299,7 @@ func (e *exporter) setGauge(name string, allLabels prometheus.Labels, val float6 } func (e *exporter) incCounter(name string, allLabels prometheus.Labels) { - m, ok := e.metrics.counters[name] + m, ok := e.counters[name] if !ok { return } @@ -311,7 +311,7 @@ func (e *exporter) incCounter(name string, allLabels prometheus.Labels) { } func (e *exporter) observeHistogram(name string, allLabels prometheus.Labels, val float64) { - m, ok := e.metrics.histograms[name] + m, ok := 
e.histograms[name] if !ok { return } @@ -339,7 +339,7 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { l := e.startedJobLabels(msg) e.incCounter(MetricStartedJobsTotal, l) - startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() + startupDuration := msg.RunnerAssignTime.Unix() - msg.ScaleSetAssignTime.Unix() e.observeHistogram(MetricJobStartupDurationSeconds, l, float64(startupDuration)) } @@ -347,7 +347,7 @@ func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { l := e.completedJobLabels(msg) e.incCounter(MetricCompletedJobsTotal, l) - executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() + executionDuration := msg.FinishTime.Unix() - msg.RunnerAssignTime.Unix() e.observeHistogram(MetricJobExecutionDurationSeconds, l, float64(executionDuration)) } diff --git a/config/crd/bases/actions.github.com_ephemeralrunners.yaml b/config/crd/bases/actions.github.com_ephemeralrunners.yaml index e1505280..f7cf1139 100644 --- a/config/crd/bases/actions.github.com_ephemeralrunners.yaml +++ b/config/crd/bases/actions.github.com_ephemeralrunners.yaml @@ -7794,7 +7794,8 @@ spec: properties: failures: additionalProperties: - type: boolean + format: date-time + type: string type: object jobDisplayName: type: string diff --git a/controllers/actions.github.com/autoscalinglistener_controller.go b/controllers/actions.github.com/autoscalinglistener_controller.go index 8f3e6f15..15cbb905 100644 --- a/controllers/actions.github.com/autoscalinglistener_controller.go +++ b/controllers/actions.github.com/autoscalinglistener_controller.go @@ -77,7 +77,7 @@ func (r *AutoscalingListenerReconciler) Reconcile(ctx context.Context, req ctrl. 
return ctrl.Result{}, client.IgnoreNotFound(err) } - if !autoscalingListener.ObjectMeta.DeletionTimestamp.IsZero() { + if !autoscalingListener.DeletionTimestamp.IsZero() { if !controllerutil.ContainsFinalizer(autoscalingListener, autoscalingListenerFinalizerName) { return ctrl.Result{}, nil } @@ -139,9 +139,9 @@ func (r *AutoscalingListenerReconciler) Reconcile(ctx context.Context, req ctrl. // Create a mirror secret in the same namespace as the AutoscalingListener mirrorSecret := new(corev1.Secret) - if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Namespace, Name: scaleSetListenerSecretMirrorName(autoscalingListener)}, mirrorSecret); err != nil { + if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Namespace, Name: autoscalingListener.Name}, mirrorSecret); err != nil { if !kerrors.IsNotFound(err) { - log.Error(err, "Unable to get listener secret mirror", "namespace", autoscalingListener.Namespace, "name", scaleSetListenerSecretMirrorName(autoscalingListener)) + log.Error(err, "Unable to get listener secret mirror", "namespace", autoscalingListener.Namespace, "name", autoscalingListener.Name) return ctrl.Result{}, err } @@ -160,9 +160,9 @@ func (r *AutoscalingListenerReconciler) Reconcile(ctx context.Context, req ctrl. 
// Make sure the runner scale set listener service account is created for the listener pod in the controller namespace serviceAccount := new(corev1.ServiceAccount) - if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Namespace, Name: scaleSetListenerServiceAccountName(autoscalingListener)}, serviceAccount); err != nil { + if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Namespace, Name: autoscalingListener.Name}, serviceAccount); err != nil { if !kerrors.IsNotFound(err) { - log.Error(err, "Unable to get listener service accounts", "namespace", autoscalingListener.Namespace, "name", scaleSetListenerServiceAccountName(autoscalingListener)) + log.Error(err, "Unable to get listener service accounts", "namespace", autoscalingListener.Namespace, "name", autoscalingListener.Name) return ctrl.Result{}, err } @@ -175,9 +175,9 @@ func (r *AutoscalingListenerReconciler) Reconcile(ctx context.Context, req ctrl. // Make sure the runner scale set listener role is created in the AutoscalingRunnerSet namespace listenerRole := new(rbacv1.Role) - if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: scaleSetListenerRoleName(autoscalingListener)}, listenerRole); err != nil { + if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: autoscalingListener.Name}, listenerRole); err != nil { if !kerrors.IsNotFound(err) { - log.Error(err, "Unable to get listener role", "namespace", autoscalingListener.Spec.AutoscalingRunnerSetNamespace, "name", scaleSetListenerRoleName(autoscalingListener)) + log.Error(err, "Unable to get listener role", "namespace", autoscalingListener.Spec.AutoscalingRunnerSetNamespace, "name", autoscalingListener.Name) return ctrl.Result{}, err } @@ -197,9 +197,9 @@ func (r *AutoscalingListenerReconciler) Reconcile(ctx context.Context, req ctrl. 
// Make sure the runner scale set listener role binding is created listenerRoleBinding := new(rbacv1.RoleBinding) - if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: scaleSetListenerRoleName(autoscalingListener)}, listenerRoleBinding); err != nil { + if err := r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: autoscalingListener.Name}, listenerRoleBinding); err != nil { if !kerrors.IsNotFound(err) { - log.Error(err, "Unable to get listener role binding", "namespace", autoscalingListener.Spec.AutoscalingRunnerSetNamespace, "name", scaleSetListenerRoleName(autoscalingListener)) + log.Error(err, "Unable to get listener role binding", "namespace", autoscalingListener.Spec.AutoscalingRunnerSetNamespace, "name", autoscalingListener.Name) return ctrl.Result{}, err } @@ -294,7 +294,7 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au err = r.Get(ctx, types.NamespacedName{Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace}, listenerPod) switch { case err == nil: - if listenerPod.ObjectMeta.DeletionTimestamp.IsZero() { + if listenerPod.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener pod") if err := r.Delete(ctx, listenerPod); err != nil { return false, fmt.Errorf("failed to delete listener pod: %w", err) @@ -312,7 +312,7 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au err = r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Namespace, Name: scaleSetListenerConfigName(autoscalingListener)}, &secret) switch { case err == nil: - if secret.ObjectMeta.DeletionTimestamp.IsZero() { + if secret.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener config secret") if err := r.Delete(ctx, &secret); err != nil { return false, fmt.Errorf("failed to delete listener config secret: %w", err) @@ -329,7 +329,7 @@ func (r 
*AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au err = r.Get(ctx, types.NamespacedName{Name: proxyListenerSecretName(autoscalingListener), Namespace: autoscalingListener.Namespace}, proxySecret) switch { case err == nil: - if proxySecret.ObjectMeta.DeletionTimestamp.IsZero() { + if proxySecret.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener proxy secret") if err := r.Delete(ctx, proxySecret); err != nil { return false, fmt.Errorf("failed to delete listener proxy secret: %w", err) @@ -343,10 +343,10 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au } listenerRoleBinding := new(rbacv1.RoleBinding) - err = r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: scaleSetListenerRoleName(autoscalingListener)}, listenerRoleBinding) + err = r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: autoscalingListener.Name}, listenerRoleBinding) switch { case err == nil: - if listenerRoleBinding.ObjectMeta.DeletionTimestamp.IsZero() { + if listenerRoleBinding.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener role binding") if err := r.Delete(ctx, listenerRoleBinding); err != nil { return false, fmt.Errorf("failed to delete listener role binding: %w", err) @@ -359,10 +359,10 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au logger.Info("Listener role binding is deleted") listenerRole := new(rbacv1.Role) - err = r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: scaleSetListenerRoleName(autoscalingListener)}, listenerRole) + err = r.Get(ctx, types.NamespacedName{Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Name: autoscalingListener.Name}, listenerRole) switch { case err == nil: - if listenerRole.ObjectMeta.DeletionTimestamp.IsZero() { + if listenerRole.DeletionTimestamp.IsZero() { 
logger.Info("Deleting the listener role") if err := r.Delete(ctx, listenerRole); err != nil { return false, fmt.Errorf("failed to delete listener role: %w", err) @@ -376,10 +376,10 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au logger.Info("Cleaning up the listener service account") listenerSa := new(corev1.ServiceAccount) - err = r.Get(ctx, types.NamespacedName{Name: scaleSetListenerServiceAccountName(autoscalingListener), Namespace: autoscalingListener.Namespace}, listenerSa) + err = r.Get(ctx, types.NamespacedName{Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace}, listenerSa) switch { case err == nil: - if listenerSa.ObjectMeta.DeletionTimestamp.IsZero() { + if listenerSa.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener service account") if err := r.Delete(ctx, listenerSa); err != nil { return false, fmt.Errorf("failed to delete listener service account: %w", err) @@ -395,7 +395,7 @@ func (r *AutoscalingListenerReconciler) cleanupResources(ctx context.Context, au } func (r *AutoscalingListenerReconciler) createServiceAccountForListener(ctx context.Context, autoscalingListener *v1alpha1.AutoscalingListener, logger logr.Logger) (ctrl.Result, error) { - newServiceAccount := r.ResourceBuilder.newScaleSetListenerServiceAccount(autoscalingListener) + newServiceAccount := r.newScaleSetListenerServiceAccount(autoscalingListener) if err := ctrl.SetControllerReference(autoscalingListener, newServiceAccount, r.Scheme); err != nil { return ctrl.Result{}, err @@ -480,7 +480,7 @@ func (r *AutoscalingListenerReconciler) createListenerPod(ctx context.Context, a logger.Info("Creating listener config secret") - podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert) + podConfig, err := r.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert) if err != nil { logger.Error(err, "Failed to build listener config secret") return 
ctrl.Result{}, err @@ -499,7 +499,7 @@ func (r *AutoscalingListenerReconciler) createListenerPod(ctx context.Context, a return ctrl.Result{Requeue: true}, nil } - newPod, err := r.ResourceBuilder.newScaleSetListenerPod(autoscalingListener, &podConfig, serviceAccount, secret, metricsConfig, envs...) + newPod, err := r.newScaleSetListenerPod(autoscalingListener, &podConfig, serviceAccount, secret, metricsConfig, envs...) if err != nil { logger.Error(err, "Failed to build listener pod") return ctrl.Result{}, err @@ -559,7 +559,7 @@ func (r *AutoscalingListenerReconciler) certificate(ctx context.Context, autosca } func (r *AutoscalingListenerReconciler) createSecretsForListener(ctx context.Context, autoscalingListener *v1alpha1.AutoscalingListener, secret *corev1.Secret, logger logr.Logger) (ctrl.Result, error) { - newListenerSecret := r.ResourceBuilder.newScaleSetListenerSecretMirror(autoscalingListener, secret) + newListenerSecret := r.newScaleSetListenerSecretMirror(autoscalingListener, secret) if err := ctrl.SetControllerReference(autoscalingListener, newListenerSecret, r.Scheme); err != nil { return ctrl.Result{}, err @@ -631,7 +631,7 @@ func (r *AutoscalingListenerReconciler) updateSecretsForListener(ctx context.Con } func (r *AutoscalingListenerReconciler) createRoleForListener(ctx context.Context, autoscalingListener *v1alpha1.AutoscalingListener, logger logr.Logger) (ctrl.Result, error) { - newRole := r.ResourceBuilder.newScaleSetListenerRole(autoscalingListener) + newRole := r.newScaleSetListenerRole(autoscalingListener) logger.Info("Creating listener role", "namespace", newRole.Namespace, "name", newRole.Name, "rules", newRole.Rules) if err := r.Create(ctx, newRole); err != nil { @@ -659,7 +659,7 @@ func (r *AutoscalingListenerReconciler) updateRoleForListener(ctx context.Contex } func (r *AutoscalingListenerReconciler) createRoleBindingForListener(ctx context.Context, autoscalingListener *v1alpha1.AutoscalingListener, listenerRole *rbacv1.Role, 
serviceAccount *corev1.ServiceAccount, logger logr.Logger) (ctrl.Result, error) { - newRoleBinding := r.ResourceBuilder.newScaleSetListenerRoleBinding(autoscalingListener, listenerRole, serviceAccount) + newRoleBinding := r.newScaleSetListenerRoleBinding(autoscalingListener, listenerRole, serviceAccount) logger.Info("Creating listener role binding", "namespace", newRoleBinding.Namespace, diff --git a/controllers/actions.github.com/autoscalinglistener_controller_test.go b/controllers/actions.github.com/autoscalinglistener_controller_test.go index 69b7978c..5ce5ee48 100644 --- a/controllers/actions.github.com/autoscalinglistener_controller_test.go +++ b/controllers/actions.github.com/autoscalinglistener_controller_test.go @@ -134,37 +134,25 @@ var _ = Describe("Test AutoScalingListener controller", func() { autoscalingListenerTestTimeout, autoscalingListenerTestInterval).Should(BeEquivalentTo(autoscalingListenerFinalizerName), "AutoScalingListener should have a finalizer") - // Check if secret is created - mirrorSecret := new(corev1.Secret) - Eventually( - func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerSecretMirrorName(autoscalingListener), Namespace: autoscalingListener.Namespace}, mirrorSecret) - if err != nil { - return "", err - } - return string(mirrorSecret.Data["github_token"]), nil - }, - autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(BeEquivalentTo(autoscalingListenerTestGitHubToken), "Mirror secret should be created") - // Check if service account is created serviceAccount := new(corev1.ServiceAccount) Eventually( func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerServiceAccountName(autoscalingListener), Namespace: autoscalingListener.Namespace}, serviceAccount) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace}, serviceAccount) if err != nil { return "", err } return 
serviceAccount.Name, nil }, autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(BeEquivalentTo(scaleSetListenerServiceAccountName(autoscalingListener)), "Service account should be created") + autoscalingListenerTestInterval, + ).Should(BeEquivalentTo(autoscalingListener.Name), "Service account should be created") // Check if role is created role := new(rbacv1.Role) Eventually( func() ([]rbacv1.PolicyRule, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerRoleName(autoscalingListener), Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) if err != nil { return nil, err } @@ -178,7 +166,7 @@ var _ = Describe("Test AutoScalingListener controller", func() { roleBinding := new(rbacv1.RoleBinding) Eventually( func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerRoleName(autoscalingListener), Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, roleBinding) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, roleBinding) if err != nil { return "", err } @@ -186,7 +174,7 @@ var _ = Describe("Test AutoScalingListener controller", func() { return roleBinding.RoleRef.Name, nil }, autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(BeEquivalentTo(scaleSetListenerRoleName(autoscalingListener)), "Rolebinding should be created") + autoscalingListenerTestInterval).Should(BeEquivalentTo(autoscalingListener.Name), "Rolebinding should be created") // Check if pod is created pod := new(corev1.Pod) @@ -248,7 +236,7 @@ var _ = Describe("Test AutoScalingListener controller", func() { Eventually( func() bool { roleBinding := new(rbacv1.RoleBinding) - err := k8sClient.Get(ctx, client.ObjectKey{Name: 
scaleSetListenerRoleName(autoscalingListener), Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, roleBinding) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, roleBinding) return kerrors.IsNotFound(err) }, autoscalingListenerTestTimeout, @@ -259,7 +247,7 @@ var _ = Describe("Test AutoScalingListener controller", func() { Eventually( func() bool { role := new(rbacv1.Role) - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerRoleName(autoscalingListener), Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) return kerrors.IsNotFound(err) }, autoscalingListenerTestTimeout, @@ -340,7 +328,7 @@ var _ = Describe("Test AutoScalingListener controller", func() { role := new(rbacv1.Role) Eventually( func() ([]rbacv1.PolicyRule, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerRoleName(autoscalingListener), Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) + err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace}, role) if err != nil { return nil, err } @@ -397,75 +385,6 @@ var _ = Describe("Test AutoScalingListener controller", func() { autoscalingListenerTestInterval, ).ShouldNot(BeEquivalentTo(oldPodUID), "Pod should be re-created") }) - - It("It should update mirror secrets to match secret used by AutoScalingRunnerSet", func() { - // Waiting for the pod is created - pod := new(corev1.Pod) - Eventually( - func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace}, pod) - if err != nil { - return "", err - } - - return pod.Name, nil - }, - 
autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(BeEquivalentTo(autoscalingListener.Name), "Pod should be created") - - // Update the secret - updatedSecret := configSecret.DeepCopy() - updatedSecret.Data["github_token"] = []byte(autoscalingListenerTestGitHubToken + "_updated") - err := k8sClient.Update(ctx, updatedSecret) - Expect(err).NotTo(HaveOccurred(), "failed to update test secret") - - updatedPod := pod.DeepCopy() - // Ignore status running and consult the container state - updatedPod.Status.Phase = corev1.PodRunning - updatedPod.Status.ContainerStatuses = []corev1.ContainerStatus{ - { - Name: autoscalingListenerContainerName, - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ - ExitCode: 1, - }, - }, - }, - } - err = k8sClient.Status().Update(ctx, updatedPod) - Expect(err).NotTo(HaveOccurred(), "failed to update test pod to failed") - - // Check if mirror secret is updated with right data - mirrorSecret := new(corev1.Secret) - Eventually( - func() (map[string][]byte, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: scaleSetListenerSecretMirrorName(autoscalingListener), Namespace: autoscalingListener.Namespace}, mirrorSecret) - if err != nil { - return nil, err - } - - return mirrorSecret.Data, nil - }, - autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(BeEquivalentTo(updatedSecret.Data), "Mirror secret should be updated") - - // Check if we re-created a new pod - Eventually( - func() error { - latestPod := new(corev1.Pod) - err := k8sClient.Get(ctx, client.ObjectKey{Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace}, latestPod) - if err != nil { - return err - } - if latestPod.UID == pod.UID { - return fmt.Errorf("Pod should be recreated") - } - - return nil - }, - autoscalingListenerTestTimeout, - autoscalingListenerTestInterval).Should(Succeed(), "Pod should be recreated") - }) }) }) diff --git 
a/controllers/actions.github.com/autoscalingrunnerset_controller.go b/controllers/actions.github.com/autoscalingrunnerset_controller.go index f6ea15f4..21740ff6 100644 --- a/controllers/actions.github.com/autoscalingrunnerset_controller.go +++ b/controllers/actions.github.com/autoscalingrunnerset_controller.go @@ -99,7 +99,7 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl return ctrl.Result{}, client.IgnoreNotFound(err) } - if !autoscalingRunnerSet.ObjectMeta.DeletionTimestamp.IsZero() { + if !autoscalingRunnerSet.DeletionTimestamp.IsZero() { if !controllerutil.ContainsFinalizer(autoscalingRunnerSet, autoscalingRunnerSetFinalizerName) { return ctrl.Result{}, nil } @@ -151,7 +151,7 @@ func (r *AutoscalingRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl return ctrl.Result{}, nil } - if autoscalingRunnerSet.Labels[LabelKeyKubernetesVersion] != build.Version { + if !v1alpha1.IsVersionAllowed(autoscalingRunnerSet.Labels[LabelKeyKubernetesVersion], build.Version) { if err := r.Delete(ctx, autoscalingRunnerSet); err != nil { log.Error(err, "Failed to delete autoscaling runner set on version mismatch", "buildVersion", build.Version, @@ -332,7 +332,7 @@ func (r *AutoscalingRunnerSetReconciler) cleanupListener(ctx context.Context, au err = r.Get(ctx, client.ObjectKey{Namespace: r.ControllerNamespace, Name: scaleSetListenerName(autoscalingRunnerSet)}, &listener) switch { case err == nil: - if listener.ObjectMeta.DeletionTimestamp.IsZero() { + if listener.DeletionTimestamp.IsZero() { logger.Info("Deleting the listener") if err := r.Delete(ctx, &listener); err != nil { return false, fmt.Errorf("failed to delete listener: %w", err) @@ -369,7 +369,7 @@ func (r *AutoscalingRunnerSetReconciler) deleteEphemeralRunnerSets(ctx context.C for i := range oldRunnerSets { rs := &oldRunnerSets[i] // already deleted but contains finalizer so it still exists - if !rs.ObjectMeta.DeletionTimestamp.IsZero() { + if !rs.DeletionTimestamp.IsZero() { 
logger.Info("Skip ephemeral runner set since it is already marked for deletion", "name", rs.Name) continue } @@ -622,7 +622,7 @@ func (r *AutoscalingRunnerSetReconciler) deleteRunnerScaleSet(ctx context.Contex } func (r *AutoscalingRunnerSetReconciler) createEphemeralRunnerSet(ctx context.Context, autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet, log logr.Logger) (ctrl.Result, error) { - desiredRunnerSet, err := r.ResourceBuilder.newEphemeralRunnerSet(autoscalingRunnerSet) + desiredRunnerSet, err := r.newEphemeralRunnerSet(autoscalingRunnerSet) if err != nil { log.Error(err, "Could not create EphemeralRunnerSet") return ctrl.Result{}, err @@ -651,7 +651,7 @@ func (r *AutoscalingRunnerSetReconciler) createAutoScalingListenerForRunnerSet(c }) } - autoscalingListener, err := r.ResourceBuilder.newAutoScalingListener(autoscalingRunnerSet, ephemeralRunnerSet, r.ControllerNamespace, r.DefaultRunnerScaleSetListenerImage, imagePullSecrets) + autoscalingListener, err := r.newAutoScalingListener(autoscalingRunnerSet, ephemeralRunnerSet, r.ControllerNamespace, r.DefaultRunnerScaleSetListenerImage, imagePullSecrets) if err != nil { log.Error(err, "Could not create AutoscalingListener spec") return ctrl.Result{}, err diff --git a/controllers/actions.github.com/autoscalingrunnerset_controller_test.go b/controllers/actions.github.com/autoscalingrunnerset_controller_test.go index 5609fe41..b3002470 100644 --- a/controllers/actions.github.com/autoscalingrunnerset_controller_test.go +++ b/controllers/actions.github.com/autoscalingrunnerset_controller_test.go @@ -280,10 +280,10 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { // This should trigger re-creation of EphemeralRunnerSet and Listener patched := autoscalingRunnerSet.DeepCopy() patched.Spec.Template.Spec.PriorityClassName = "test-priority-class" - if patched.ObjectMeta.Annotations == nil { - patched.ObjectMeta.Annotations = make(map[string]string) + if patched.Annotations == nil { + 
patched.Annotations = make(map[string]string) } - patched.ObjectMeta.Annotations[annotationKeyValuesHash] = "test-hash" + patched.Annotations[annotationKeyValuesHash] = "test-hash" err = k8sClient.Patch(ctx, patched, client.MergeFrom(autoscalingRunnerSet)) Expect(err).NotTo(HaveOccurred(), "failed to patch AutoScalingRunnerSet") autoscalingRunnerSet = patched.DeepCopy() @@ -383,7 +383,7 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { Expect(err).NotTo(HaveOccurred(), "failed to get Listener") patched = autoscalingRunnerSet.DeepCopy() - patched.ObjectMeta.Annotations[annotationKeyValuesHash] = "hash-changes" + patched.Annotations[annotationKeyValuesHash] = "hash-changes" err = k8sClient.Patch(ctx, patched, client.MergeFrom(autoscalingRunnerSet)) Expect(err).NotTo(HaveOccurred(), "failed to patch AutoScalingRunnerSet") @@ -546,10 +546,10 @@ var _ = Describe("Test AutoScalingRunnerSet controller", Ordered, func() { // Patch the AutoScalingRunnerSet image which should trigger // the recreation of the Listener and EphemeralRunnerSet patched := autoscalingRunnerSet.DeepCopy() - if patched.ObjectMeta.Annotations == nil { - patched.ObjectMeta.Annotations = make(map[string]string) + if patched.Annotations == nil { + patched.Annotations = make(map[string]string) } - patched.ObjectMeta.Annotations[annotationKeyValuesHash] = "testgroup2" + patched.Annotations[annotationKeyValuesHash] = "testgroup2" patched.Spec.Template.Spec = corev1.PodSpec{ Containers: []corev1.Container{ { @@ -875,7 +875,7 @@ var _ = Describe("Test AutoscalingController creation failures", Ordered, func() autoscalingRunnerSetTestInterval, ).Should(BeEquivalentTo(autoscalingRunnerSetFinalizerName), "AutoScalingRunnerSet should have a finalizer") - ars.ObjectMeta.Annotations = make(map[string]string) + ars.Annotations = make(map[string]string) err = k8sClient.Update(ctx, ars) Expect(err).NotTo(HaveOccurred(), "Update autoscaling runner set without annotation should be 
successful") diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 9f8caa48..5ac80df7 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -28,6 +28,7 @@ import ( "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct { ResourceBuilder } +// precompute backoff durations for failed ephemeral runners +// the len(failedRunnerBackoff) must be equal to maxFailures + 1 +var failedRunnerBackoff = []time.Duration{ + 0, + 5 * time.Second, + 10 * time.Second, + 20 * time.Second, + 40 * time.Second, + 80 * time.Second, +} + +const maxFailures = 5 + // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete @@ -70,7 +84,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, client.IgnoreNotFound(err) } - if !ephemeralRunner.ObjectMeta.DeletionTimestamp.IsZero() { + if !ephemeralRunner.DeletionTimestamp.IsZero() { if !controllerutil.ContainsFinalizer(ephemeralRunner, ephemeralRunnerFinalizerName) { return ctrl.Result{}, nil } @@ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } + if len(ephemeralRunner.Status.Failures) > maxFailures { + log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. 
Deleting ephemeral runner so it can be re-created", maxFailures)) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil + } + + now := metav1.Now() + lastFailure := ephemeralRunner.Status.LastFailure() + backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)] + nextReconciliation := lastFailure.Add(backoffDuration) + if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) { + log.Info("Backing off the next reconciliation due to failure", + "lastFailure", lastFailure, + "nextReconciliation", nextReconciliation, + "requeueAfter", nextReconciliation.Sub(now.Time), + ) + return ctrl.Result{RequeueAfter: nextReconciliation.Sub(now.Time)}, nil + } + secret := new(corev1.Secret) if err := r.Get(ctx, req.NamespacedName, secret); err != nil { if !kerrors.IsNotFound(err) { @@ -196,39 +233,28 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ pod := new(corev1.Pod) if err := r.Get(ctx, req.NamespacedName, pod); err != nil { - switch { - case !kerrors.IsNotFound(err): + if !kerrors.IsNotFound(err) { log.Error(err, "Failed to fetch the pod") return ctrl.Result{}, err + } - case len(ephemeralRunner.Status.Failures) > 5: - log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed") - errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message) - if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil { + // Pod was not found. 
Create if the pod has never been created + log.Info("Creating new EphemeralRunner pod.") + result, err := r.createPod(ctx, ephemeralRunner, secret, log) + switch { + case err == nil: + return result, nil + case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): + log.Error(err, "Failed to create a pod due to unrecoverable failure") + errMessage := fmt.Sprintf("Failed to create the pod: %v", err) + if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { log.Error(err, "Failed to set ephemeral runner to phase Failed") return ctrl.Result{}, err } return ctrl.Result{}, nil - default: - // Pod was not found. Create if the pod has never been created - log.Info("Creating new EphemeralRunner pod.") - result, err := r.createPod(ctx, ephemeralRunner, secret, log) - switch { - case err == nil: - return result, nil - case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): - log.Error(err, "Failed to create a pod due to unrecoverable failure") - errMessage := fmt.Sprintf("Failed to create the pod: %v", err) - if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { - log.Error(err, "Failed to set ephemeral runner to phase Failed") - return ctrl.Result{}, err - } - return ctrl.Result{}, nil - default: - log.Error(err, "Failed to create the pod") - return ctrl.Result{}, err - } + log.Error(err, "Failed to create the pod") + return ctrl.Result{}, err } } @@ -319,7 +345,7 @@ func (r *EphemeralRunnerReconciler) cleanupResources(ctx context.Context, epheme err := r.Get(ctx, types.NamespacedName{Namespace: ephemeralRunner.Namespace, Name: ephemeralRunner.Name}, pod) switch { case err == nil: - if pod.ObjectMeta.DeletionTimestamp.IsZero() { + if pod.DeletionTimestamp.IsZero() { log.Info("Deleting the runner pod") if err := r.Delete(ctx, pod); err != nil && !kerrors.IsNotFound(err) { return fmt.Errorf("failed to delete pod: %w", err) @@ -339,7 +365,7 @@ func (r *EphemeralRunnerReconciler) 
cleanupResources(ctx context.Context, epheme err = r.Get(ctx, types.NamespacedName{Namespace: ephemeralRunner.Namespace, Name: ephemeralRunner.Name}, secret) switch { case err == nil: - if secret.ObjectMeta.DeletionTimestamp.IsZero() { + if secret.DeletionTimestamp.IsZero() { log.Info("Deleting the jitconfig secret") if err := r.Delete(ctx, secret); err != nil && !kerrors.IsNotFound(err) { return fmt.Errorf("failed to delete secret: %w", err) @@ -393,7 +419,7 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedPods(ctx context.Context, var errs []error for i := range runnerLinkedPodList.Items { linkedPod := &runnerLinkedPodList.Items[i] - if !linkedPod.ObjectMeta.DeletionTimestamp.IsZero() { + if !linkedPod.DeletionTimestamp.IsZero() { continue } @@ -409,7 +435,7 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedPods(ctx context.Context, func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { runnerLinkedLabels := client.MatchingLabels( map[string]string{ - "runner-pod": ephemeralRunner.ObjectMeta.Name, + "runner-pod": ephemeralRunner.Name, }, ) var runnerLinkedSecretList corev1.SecretList @@ -427,7 +453,7 @@ func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Conte var errs []error for i := range runnerLinkedSecretList.Items { s := &runnerLinkedSecretList.Items[i] - if !s.ObjectMeta.DeletionTimestamp.IsZero() { + if !s.DeletionTimestamp.IsZero() { continue } @@ -474,7 +500,7 @@ func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemera // deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count. // It should not be responsible for setting the status to Failed. 
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { - if pod.ObjectMeta.DeletionTimestamp.IsZero() { + if pod.DeletionTimestamp.IsZero() { log.Info("Deleting the ephemeral runner pod", "podId", pod.UID) if err := r.Delete(ctx, pod); err != nil && !kerrors.IsNotFound(err) { return fmt.Errorf("failed to delete pod with status failed: %w", err) @@ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem log.Info("Updating ephemeral runner status to track the failure count") if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { if obj.Status.Failures == nil { - obj.Status.Failures = make(map[string]bool) + obj.Status.Failures = make(map[string]metav1.Time) } - obj.Status.Failures[string(pod.UID)] = true + obj.Status.Failures[string(pod.UID)] = metav1.Now() obj.Status.Ready = false obj.Status.Reason = pod.Status.Reason obj.Status.Message = pod.Status.Message @@ -640,7 +666,7 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp } log.Info("Creating new pod for ephemeral runner") - newPod := r.ResourceBuilder.newEphemeralRunnerPod(ctx, runner, secret, envs...) + newPod := r.newEphemeralRunnerPod(ctx, runner, secret, envs...) 
if err := ctrl.SetControllerReference(runner, newPod, r.Scheme); err != nil { log.Error(err, "Failed to set controller reference to a new pod") @@ -665,7 +691,7 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) { log.Info("Creating new secret for ephemeral runner") - jitSecret := r.ResourceBuilder.newEphemeralRunnerJitSecret(runner) + jitSecret := r.newEphemeralRunnerJitSecret(runner) if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil { return &ctrl.Result{}, fmt.Errorf("failed to set controller reference: %w", err) diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 1305bfca..0b842d5c 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -30,7 +30,7 @@ import ( const ( ephemeralRunnerTimeout = time.Second * 20 - ephemeralRunnerInterval = time.Millisecond * 250 + ephemeralRunnerInterval = time.Millisecond * 10 runnerImage = "ghcr.io/actions/actions-runner:latest" ) @@ -528,44 +528,26 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo("")) }) - It("It should not re-create pod indefinitely", func() { + It("It should eventually delete ephemeral runner after consecutive failures", func() { updated := new(v1alpha1.EphemeralRunner) - pod := new(corev1.Pod) Eventually( - func() (bool, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) - if err != nil { - return false, err - } - - err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err != nil { - if kerrors.IsNotFound(err) && len(updated.Status.Failures) > 5 { 
- return true, nil - } - - return false, err - } - - pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ - Name: v1alpha1.EphemeralRunnerContainerName, - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ - ExitCode: 1, - }, - }, - }) - err = k8sClient.Status().Update(ctx, pod) - Expect(err).To(BeNil(), "Failed to update pod status") - return false, fmt.Errorf("pod haven't failed for 5 times.") + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) }, ephemeralRunnerTimeout, ephemeralRunnerInterval, - ).Should(BeEquivalentTo(true), "we should stop creating pod after 5 failures") + ).Should(Succeed(), "failed to get ephemeral runner") + + failEphemeralRunnerPod := func() *corev1.Pod { + pod := new(corev1.Pod) + Eventually( + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: updated.Name, Namespace: updated.Namespace}, pod) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get ephemeral runner pod") - // In case we still have pod created due to controller-runtime cache delay, mark the container as exited - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err == nil { pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ Name: v1alpha1.EphemeralRunnerContainerName, State: corev1.ContainerState{ @@ -576,25 +558,70 @@ var _ = Describe("EphemeralRunner", func() { }) err := k8sClient.Status().Update(ctx, pod) Expect(err).To(BeNil(), "Failed to update pod status") + + return pod } - // EphemeralRunner should failed with reason TooManyPodFailures - Eventually(func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) - if err != nil { - return "", err - } - return 
updated.Status.Reason, nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("TooManyPodFailures"), "Reason should be TooManyPodFailures") + for i := range 5 { + pod := failEphemeralRunnerPod() - // EphemeralRunner should not have any pod - Eventually(func() (bool, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err == nil { - return false, nil - } - return kerrors.IsNotFound(err), nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + Eventually( + func() (int, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(i + 1)) + + Eventually( + func() error { + nextPod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: pod.Namespace}, nextPod) + if err != nil { + return err + } + if nextPod.UID != pod.UID { + return nil + } + return fmt.Errorf("pod not recreated") + }, + ).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(Succeed(), "pod should be recreated") + + Eventually( + func() (bool, error) { + pod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + if err != nil { + return false, err + } + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == v1alpha1.EphemeralRunnerContainerName { + return cs.State.Terminated == nil, nil + } + } + + return true, nil + }, + ).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(BeEquivalentTo(true), "pod should not be terminated") + } + + failEphemeralRunnerPod() + + Eventually( + func() (bool, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: 
ephemeralRunner.Namespace}, updated) + if kerrors.IsNotFound(err) { + return true, nil + } + return false, err + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") }) It("It should re-create pod on eviction", func() { diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller.go b/controllers/actions.github.com/ephemeralrunnerset_controller.go index 472a646a..773e1286 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller.go @@ -83,7 +83,7 @@ func (r *EphemeralRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.R } // Requested deletion does not need reconciled. - if !ephemeralRunnerSet.ObjectMeta.DeletionTimestamp.IsZero() { + if !ephemeralRunnerSet.DeletionTimestamp.IsZero() { if !controllerutil.ContainsFinalizer(ephemeralRunnerSet, ephemeralRunnerSetFinalizerName) { return ctrl.Result{}, nil } @@ -360,7 +360,7 @@ func (r *EphemeralRunnerSetReconciler) createEphemeralRunners(ctx context.Contex // Track multiple errors at once and return the bundle. 
errs := make([]error, 0) for i := 0; i < count; i++ { - ephemeralRunner := r.ResourceBuilder.newEphemeralRunner(runnerSet) + ephemeralRunner := r.newEphemeralRunner(runnerSet) if runnerSet.Spec.EphemeralRunnerSpec.Proxy != nil { ephemeralRunner.Spec.ProxySecretRef = proxyEphemeralRunnerSetSecretName(runnerSet) } @@ -641,7 +641,7 @@ func newEphemeralRunnerState(ephemeralRunnerList *v1alpha1.EphemeralRunnerList) if err == nil && patchID > ephemeralRunnerState.latestPatchID { ephemeralRunnerState.latestPatchID = patchID } - if !r.ObjectMeta.DeletionTimestamp.IsZero() { + if !r.DeletionTimestamp.IsZero() { ephemeralRunnerState.deleting = append(ephemeralRunnerState.deleting, r) continue } diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go index 0ea2027d..665279e8 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "strings" + "testing" "time" corev1 "k8s.io/api/core/v1" @@ -21,6 +22,7 @@ import ( "github.com/go-logr/logr" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -35,6 +37,10 @@ const ( ephemeralRunnerSetTestGitHubToken = "gh_token" ) +func TestPrecomputedConstants(t *testing.T) { + require.Equal(t, len(failedRunnerBackoff), maxFailures+1) +} + var _ = Describe("Test EphemeralRunnerSet controller", func() { var ctx context.Context var mgr ctrl.Manager diff --git a/controllers/actions.github.com/resourcebuilder.go b/controllers/actions.github.com/resourcebuilder.go index 2b7c9030..7fe8febf 100644 --- a/controllers/actions.github.com/resourcebuilder.go +++ b/controllers/actions.github.com/resourcebuilder.go @@ -5,6 +5,7 @@ import ( "context" "encoding/json" "fmt" + "maps" "math" "net" "strconv" @@ -169,15 +170,6 @@ func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha metricsEndpoint = metricsConfig.endpoint } - var appID int64 - if id, ok := secret.Data["github_app_id"]; ok { - var err error - appID, err = strconv.ParseInt(string(id), 10, 64) - if err != nil { - return nil, fmt.Errorf("failed to convert github_app_id to int: %v", err) - } - } - var appInstallationID int64 if id, ok := secret.Data["github_app_installation_id"]; ok { var err error @@ -189,7 +181,7 @@ func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha config := listenerconfig.Config{ ConfigureUrl: autoscalingListener.Spec.GitHubConfigUrl, - AppID: appID, + AppID: string(secret.Data["github_app_id"]), AppInstallationID: appInstallationID, AppPrivateKey: string(secret.Data["github_app_private_key"]), Token: string(secret.Data["github_token"]), @@ -207,6 +199,10 @@ func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha Metrics: autoscalingListener.Spec.Metrics, } + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("invalid listener config: %w", err) + } + var buf bytes.Buffer 
if err := json.NewEncoder(&buf).Encode(config); err != nil { return nil, fmt.Errorf("failed to encode config: %w", err) @@ -278,9 +274,7 @@ func (b *ResourceBuilder) newScaleSetListenerPod(autoscalingListener *v1alpha1.A } labels := make(map[string]string, len(autoscalingListener.Labels)) - for key, val := range autoscalingListener.Labels { - labels[key] = val - } + maps.Copy(labels, autoscalingListener.Labels) newRunnerScaleSetListenerPod := &corev1.Pod{ TypeMeta: metav1.TypeMeta{ @@ -429,7 +423,7 @@ func mergeListenerContainer(base, from *corev1.Container) { func (b *ResourceBuilder) newScaleSetListenerServiceAccount(autoscalingListener *v1alpha1.AutoscalingListener) *corev1.ServiceAccount { return &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ - Name: scaleSetListenerServiceAccountName(autoscalingListener), + Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace, Labels: b.mergeLabels(autoscalingListener.Labels, map[string]string{ LabelKeyGitHubScaleSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, @@ -444,7 +438,7 @@ func (b *ResourceBuilder) newScaleSetListenerRole(autoscalingListener *v1alpha1. 
rulesHash := hash.ComputeTemplateHash(&rules) newRole := &rbacv1.Role{ ObjectMeta: metav1.ObjectMeta{ - Name: scaleSetListenerRoleName(autoscalingListener), + Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Labels: b.mergeLabels(autoscalingListener.Labels, map[string]string{ LabelKeyGitHubScaleSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, @@ -478,7 +472,7 @@ func (b *ResourceBuilder) newScaleSetListenerRoleBinding(autoscalingListener *v1 newRoleBinding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: scaleSetListenerRoleName(autoscalingListener), + Name: autoscalingListener.Name, Namespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, Labels: b.mergeLabels(autoscalingListener.Labels, map[string]string{ LabelKeyGitHubScaleSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, @@ -501,7 +495,7 @@ func (b *ResourceBuilder) newScaleSetListenerSecretMirror(autoscalingListener *v newListenerSecret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ - Name: scaleSetListenerSecretMirrorName(autoscalingListener), + Name: autoscalingListener.Name, Namespace: autoscalingListener.Namespace, Labels: b.mergeLabels(autoscalingListener.Labels, map[string]string{ LabelKeyGitHubScaleSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, @@ -543,8 +537,8 @@ func (b *ResourceBuilder) newEphemeralRunnerSet(autoscalingRunnerSet *v1alpha1.A newEphemeralRunnerSet := &v1alpha1.EphemeralRunnerSet{ TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{ - GenerateName: autoscalingRunnerSet.ObjectMeta.Name + "-", - Namespace: autoscalingRunnerSet.ObjectMeta.Namespace, + GenerateName: autoscalingRunnerSet.Name + "-", + Namespace: autoscalingRunnerSet.Namespace, Labels: labels, Annotations: newAnnotations, OwnerReferences: []metav1.OwnerReference{ @@ -617,18 +611,18 @@ func (b *ResourceBuilder) newEphemeralRunnerPod(ctx context.Context, runner *v1a labels := 
map[string]string{} annotations := map[string]string{} - for k, v := range runner.ObjectMeta.Labels { + for k, v := range runner.Labels { labels[k] = v } - for k, v := range runner.Spec.PodTemplateSpec.Labels { + for k, v := range runner.Spec.Labels { labels[k] = v } labels["actions-ephemeral-runner"] = string(corev1.ConditionTrue) - for k, v := range runner.ObjectMeta.Annotations { + for k, v := range runner.Annotations { annotations[k] = v } - for k, v := range runner.Spec.PodTemplateSpec.Annotations { + for k, v := range runner.Spec.Annotations { annotations[k] = v } @@ -640,8 +634,8 @@ func (b *ResourceBuilder) newEphemeralRunnerPod(ctx context.Context, runner *v1a ) objectMeta := metav1.ObjectMeta{ - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, Labels: labels, Annotations: annotations, OwnerReferences: []metav1.OwnerReference{ @@ -657,10 +651,10 @@ func (b *ResourceBuilder) newEphemeralRunnerPod(ctx context.Context, runner *v1a } newPod.ObjectMeta = objectMeta - newPod.Spec = runner.Spec.PodTemplateSpec.Spec - newPod.Spec.Containers = make([]corev1.Container, 0, len(runner.Spec.PodTemplateSpec.Spec.Containers)) + newPod.Spec = runner.Spec.Spec + newPod.Spec.Containers = make([]corev1.Container, 0, len(runner.Spec.Spec.Containers)) - for _, c := range runner.Spec.PodTemplateSpec.Spec.Containers { + for _, c := range runner.Spec.Spec.Containers { if c.Name == v1alpha1.EphemeralRunnerContainerName { c.Env = append( c.Env, @@ -713,30 +707,6 @@ func scaleSetListenerName(autoscalingRunnerSet *v1alpha1.AutoscalingRunnerSet) s return fmt.Sprintf("%v-%v-listener", autoscalingRunnerSet.Name, namespaceHash) } -func scaleSetListenerServiceAccountName(autoscalingListener *v1alpha1.AutoscalingListener) string { - namespaceHash := hash.FNVHashString(autoscalingListener.Spec.AutoscalingRunnerSetNamespace) - if len(namespaceHash) > 8 { - namespaceHash = namespaceHash[:8] - } - return 
fmt.Sprintf("%v-%v-listener", autoscalingListener.Spec.AutoscalingRunnerSetName, namespaceHash) -} - -func scaleSetListenerRoleName(autoscalingListener *v1alpha1.AutoscalingListener) string { - namespaceHash := hash.FNVHashString(autoscalingListener.Spec.AutoscalingRunnerSetNamespace) - if len(namespaceHash) > 8 { - namespaceHash = namespaceHash[:8] - } - return fmt.Sprintf("%v-%v-listener", autoscalingListener.Spec.AutoscalingRunnerSetName, namespaceHash) -} - -func scaleSetListenerSecretMirrorName(autoscalingListener *v1alpha1.AutoscalingListener) string { - namespaceHash := hash.FNVHashString(autoscalingListener.Spec.AutoscalingRunnerSetNamespace) - if len(namespaceHash) > 8 { - namespaceHash = namespaceHash[:8] - } - return fmt.Sprintf("%v-%v-listener", autoscalingListener.Spec.AutoscalingRunnerSetName, namespaceHash) -} - func proxyListenerSecretName(autoscalingListener *v1alpha1.AutoscalingListener) string { namespaceHash := hash.FNVHashString(autoscalingListener.Spec.AutoscalingRunnerSetNamespace) if len(namespaceHash) > 8 { diff --git a/controllers/actions.github.com/suite_test.go b/controllers/actions.github.com/suite_test.go index 80fb4196..46b97eb7 100644 --- a/controllers/actions.github.com/suite_test.go +++ b/controllers/actions.github.com/suite_test.go @@ -20,6 +20,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/onsi/ginkgo/config" @@ -79,6 +80,15 @@ var _ = BeforeSuite(func() { k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) + + failedRunnerBackoff = []time.Duration{ + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + } }) var _ = AfterSuite(func() { diff --git a/controllers/actions.summerwind.net/autoscaling.go b/controllers/actions.summerwind.net/autoscaling.go index ea21f953..677041c7 100644 --- 
a/controllers/actions.summerwind.net/autoscaling.go +++ b/controllers/actions.summerwind.net/autoscaling.go @@ -345,7 +345,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner } var runnerPodList corev1.PodList - if err := r.Client.List(ctx, &runnerPodList, client.InNamespace(hra.Namespace), client.MatchingLabels(map[string]string{ + if err := r.List(ctx, &runnerPodList, client.InNamespace(hra.Namespace), client.MatchingLabels(map[string]string{ kindLabel: hra.Spec.ScaleTargetRef.Name, })); err != nil { return nil, err diff --git a/controllers/actions.summerwind.net/autoscaling_test.go b/controllers/actions.summerwind.net/autoscaling_test.go index ee42f9a4..4fde432d 100644 --- a/controllers/actions.summerwind.net/autoscaling_test.go +++ b/controllers/actions.summerwind.net/autoscaling_test.go @@ -29,7 +29,7 @@ func newGithubClient(server *httptest.Server) *github.Client { if err != nil { panic(err) } - client.Client.BaseURL = baseURL + client.BaseURL = baseURL return client } diff --git a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_batch_scale.go b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_batch_scale.go index 8f537005..d74bf05c 100644 --- a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_batch_scale.go +++ b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_batch_scale.go @@ -82,8 +82,8 @@ func (s *batchScaler) Add(st *ScaleTarget) { break batch case st := <-s.queue: nsName := types.NamespacedName{ - Namespace: st.HorizontalRunnerAutoscaler.Namespace, - Name: st.HorizontalRunnerAutoscaler.Name, + Namespace: st.Namespace, + Name: st.Name, } b, ok := batches[nsName] if !ok { @@ -208,7 +208,7 @@ func (s *batchScaler) planBatchScale(ctx context.Context, batch batchScaleOperat // // In other words, updating HRA.spec.scaleTriggers[].duration does not result in delaying capacity reservations expiration any longer // than the "intended" duration, which is the duration 
of the trigger when the reservation was created. - duration := copy.Spec.CapacityReservations[i].ExpirationTime.Time.Sub(copy.Spec.CapacityReservations[i].EffectiveTime.Time) + duration := copy.Spec.CapacityReservations[i].ExpirationTime.Sub(copy.Spec.CapacityReservations[i].EffectiveTime.Time) copy.Spec.CapacityReservations[i].EffectiveTime = metav1.Time{Time: now} copy.Spec.CapacityReservations[i].ExpirationTime = metav1.Time{Time: now.Add(duration)} } diff --git a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go index 85c4bc48..0f37d0d3 100644 --- a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go +++ b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go @@ -503,13 +503,13 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getManagedRunnerGroup switch kind { case "RunnerSet": var rs v1alpha1.RunnerSet - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { return groups, err } o, e, g = rs.Spec.Organization, rs.Spec.Enterprise, rs.Spec.Group case "RunnerDeployment", "": var rd v1alpha1.RunnerDeployment - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { return groups, err } o, e, g = rd.Spec.Template.Spec.Organization, rd.Spec.Template.Spec.Enterprise, rd.Spec.Template.Spec.Group @@ -562,7 +562,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getJobScaleTarget(ctx HRA: for _, hra := 
range hras { - if !hra.ObjectMeta.DeletionTimestamp.IsZero() { + if !hra.DeletionTimestamp.IsZero() { continue } @@ -603,7 +603,7 @@ HRA: case "RunnerSet": var rs v1alpha1.RunnerSet - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { return nil, err } @@ -634,7 +634,7 @@ HRA: case "RunnerDeployment", "": var rd v1alpha1.RunnerDeployment - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { return nil, err } @@ -676,7 +676,7 @@ func getValidCapacityReservations(autoscaler *v1alpha1.HorizontalRunnerAutoscale now := time.Now() for _, reservation := range autoscaler.Spec.CapacityReservations { - if reservation.ExpirationTime.Time.After(now) { + if reservation.ExpirationTime.After(now) { capacityReservations = append(capacityReservations, reservation) } } @@ -713,7 +713,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) indexer(rawObj client switch hra.Spec.ScaleTargetRef.Kind { case "", "RunnerDeployment": var rd v1alpha1.RunnerDeployment - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil { autoscaler.Log.V(1).Info(fmt.Sprintf("RunnerDeployment not found with scale target ref name %s for hra %s", hra.Spec.ScaleTargetRef.Name, hra.Name)) return nil } @@ -740,7 +740,7 @@ func (autoscaler 
*HorizontalRunnerAutoscalerGitHubWebhook) indexer(rawObj client return keys case "RunnerSet": var rs v1alpha1.RunnerSet - if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { + if err := autoscaler.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil { autoscaler.Log.V(1).Info(fmt.Sprintf("RunnerSet not found with scale target ref name %s for hra %s", hra.Spec.ScaleTargetRef.Name, hra.Name)) return nil } diff --git a/controllers/actions.summerwind.net/horizontalrunnerautoscaler_controller.go b/controllers/actions.summerwind.net/horizontalrunnerautoscaler_controller.go index 0aa5a7b6..fffa0347 100644 --- a/controllers/actions.summerwind.net/horizontalrunnerautoscaler_controller.go +++ b/controllers/actions.summerwind.net/horizontalrunnerautoscaler_controller.go @@ -71,7 +71,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return ctrl.Result{}, client.IgnoreNotFound(err) } - if !hra.ObjectMeta.DeletionTimestamp.IsZero() { + if !hra.DeletionTimestamp.IsZero() { r.GitHubClient.DeinitForHRA(&hra) return ctrl.Result{}, nil @@ -91,7 +91,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return ctrl.Result{}, client.IgnoreNotFound(err) } - if !rd.ObjectMeta.DeletionTimestamp.IsZero() { + if !rd.DeletionTimestamp.IsZero() { return ctrl.Result{}, nil } @@ -120,14 +120,14 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} } - if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { + if err := r.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) } } else if ephemeral && effectiveTime != nil { copy := rd.DeepCopy() 
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} - if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { + if err := r.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) } } @@ -142,7 +142,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return ctrl.Result{}, client.IgnoreNotFound(err) } - if !rs.ObjectMeta.DeletionTimestamp.IsZero() { + if !rs.DeletionTimestamp.IsZero() { return ctrl.Result{}, nil } @@ -160,7 +160,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re org: rs.Spec.Organization, repo: rs.Spec.Repository, replicas: replicas, - labels: rs.Spec.RunnerConfig.Labels, + labels: rs.Spec.Labels, getRunnerMap: func() (map[string]struct{}, error) { // return the list of runners in namespace. Horizontal Runner Autoscaler should only be responsible for scaling resources in its own ns. var runnerPodList corev1.PodList @@ -224,14 +224,14 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} } - if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil { + if err := r.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil { return fmt.Errorf("patching runnerset to have %d replicas: %w", newDesiredReplicas, err) } } else if ephemeral && effectiveTime != nil { copy := rs.DeepCopy() copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} - if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil { + if err := r.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil { return fmt.Errorf("patching runnerset to have %d replicas: %w", newDesiredReplicas, err) } } @@ -253,7 +253,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) scaleTargetFromRD(ctx context.Con org: rd.Spec.Template.Spec.Organization, repo: rd.Spec.Template.Spec.Repository, replicas: 
rd.Spec.Replicas, - labels: rd.Spec.Template.Spec.RunnerConfig.Labels, + labels: rd.Spec.Template.Spec.Labels, getRunnerMap: func() (map[string]struct{}, error) { // return the list of runners in namespace. Horizontal Runner Autoscaler should only be responsible for scaling resources in its own ns. var runnerList v1alpha1.RunnerList @@ -484,7 +484,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(ghc *arc var reserved int for _, reservation := range hra.Spec.CapacityReservations { - if reservation.ExpirationTime.Time.After(now) { + if reservation.ExpirationTime.After(now) { reserved += reservation.Replicas } } diff --git a/controllers/actions.summerwind.net/runner_controller.go b/controllers/actions.summerwind.net/runner_controller.go index 476e5c54..714fb7c0 100644 --- a/controllers/actions.summerwind.net/runner_controller.go +++ b/controllers/actions.summerwind.net/runner_controller.go @@ -20,12 +20,13 @@ import ( "context" "errors" "fmt" - "k8s.io/apimachinery/pkg/api/resource" "reflect" "strconv" "strings" "time" + "k8s.io/apimachinery/pkg/api/resource" + "github.com/actions/actions-runner-controller/build" "github.com/actions/actions-runner-controller/hash" "github.com/go-logr/logr" @@ -107,12 +108,12 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, client.IgnoreNotFound(err) } - if runner.ObjectMeta.DeletionTimestamp.IsZero() { - finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName) + if runner.DeletionTimestamp.IsZero() { + finalizers, added := addFinalizer(runner.Finalizers, finalizerName) if added { newRunner := runner.DeepCopy() - newRunner.ObjectMeta.Finalizers = finalizers + newRunner.Finalizers = finalizers if err := r.Update(ctx, newRunner); err != nil { log.Error(err, "Failed to update runner") @@ -271,11 +272,11 @@ func ephemeralRunnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus { } func (r *RunnerReconciler) 
processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) { - finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) + finalizers, removed := removeFinalizer(runner.Finalizers, finalizerName) if removed { newRunner := runner.DeepCopy() - newRunner.ObjectMeta.Finalizers = finalizers + newRunner.Finalizers = finalizers if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil { log.Error(err, "Unable to remove finalizer") @@ -305,8 +306,8 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a if needsServiceAccount { serviceAccount := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, }, } if res := r.createObject(ctx, serviceAccount, serviceAccount.ObjectMeta, &runner, log); res != nil { @@ -321,7 +322,7 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a APIGroups: []string{"actions.summerwind.dev"}, Resources: []string{"runners/status"}, Verbs: []string{"get", "update", "patch"}, - ResourceNames: []string{runner.ObjectMeta.Name}, + ResourceNames: []string{runner.Name}, }, }...) 
} @@ -359,8 +360,8 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a role := &rbacv1.Role{ ObjectMeta: metav1.ObjectMeta{ - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, }, Rules: rules, } @@ -370,19 +371,19 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a roleBinding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, }, RoleRef: rbacv1.RoleRef{ APIGroup: "rbac.authorization.k8s.io", Kind: "Role", - Name: runner.ObjectMeta.Name, + Name: runner.Name, }, Subjects: []rbacv1.Subject{ { Kind: "ServiceAccount", - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, }, }, } @@ -482,7 +483,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { labels := map[string]string{} - for k, v := range runner.ObjectMeta.Labels { + for k, v := range runner.Labels { labels[k] = v } @@ -511,8 +512,8 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { // // See https://github.com/actions/actions-runner-controller/issues/143 for more context. labels[LabelKeyPodTemplateHash] = hash.FNVHashStringObjects( - filterLabels(runner.ObjectMeta.Labels, LabelKeyRunnerTemplateHash), - runner.ObjectMeta.Annotations, + filterLabels(runner.Labels, LabelKeyRunnerTemplateHash), + runner.Annotations, runner.Spec, ghc.GithubBaseURL, // Token change should trigger replacement. 
@@ -523,10 +524,10 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { ) objectMeta := metav1.ObjectMeta{ - Name: runner.ObjectMeta.Name, - Namespace: runner.ObjectMeta.Namespace, + Name: runner.Name, + Namespace: runner.Namespace, Labels: labels, - Annotations: runner.ObjectMeta.Annotations, + Annotations: runner.Annotations, } template.ObjectMeta = objectMeta @@ -649,7 +650,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { if runnerSpec.ServiceAccountName != "" { pod.Spec.ServiceAccountName = runnerSpec.ServiceAccountName } else if r.RunnerPodDefaults.UseRunnerStatusUpdateHook || runner.Spec.ContainerMode == "kubernetes" { - pod.Spec.ServiceAccountName = runner.ObjectMeta.Name + pod.Spec.ServiceAccountName = runner.Name } if runnerSpec.AutomountServiceAccountToken != nil { @@ -704,7 +705,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { pod.Spec.RuntimeClassName = runnerSpec.RuntimeClassName } - pod.ObjectMeta.Name = runner.ObjectMeta.Name + pod.Name = runner.Name // Inject the registration token and the runner name updated := mutatePod(&pod, runner.Status.Registration.Token) @@ -720,7 +721,7 @@ func mutatePod(pod *corev1.Pod, token string) *corev1.Pod { updated := pod.DeepCopy() if getRunnerEnv(pod, EnvVarRunnerName) == "" { - setRunnerEnv(updated, EnvVarRunnerName, pod.ObjectMeta.Name) + setRunnerEnv(updated, EnvVarRunnerName, pod.Name) } if getRunnerEnv(pod, EnvVarRunnerToken) == "" { @@ -770,11 +771,11 @@ func runnerHookEnvs(pod *corev1.Pod) ([]corev1.EnvVar, error) { func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, githubBaseURL string, d RunnerPodDefaults) (corev1.Pod, error) { var ( - privileged bool = true - dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer - dockerEnabled bool = runnerSpec.DockerEnabled == nil || 
*runnerSpec.DockerEnabled - ephemeral bool = runnerSpec.Ephemeral == nil || *runnerSpec.Ephemeral - dockerdInRunnerPrivileged bool = dockerdInRunner + privileged = true + dockerdInRunner = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer + dockerEnabled = runnerSpec.DockerEnabled == nil || *runnerSpec.DockerEnabled + ephemeral = runnerSpec.Ephemeral == nil || *runnerSpec.Ephemeral + dockerdInRunnerPrivileged = dockerdInRunner defaultRunnerImage = d.RunnerImage defaultRunnerImagePullSecrets = d.RunnerImagePullSecrets @@ -797,10 +798,10 @@ func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, ru template = *template.DeepCopy() // This label selector is used by default when rd.Spec.Selector is empty. - template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunner, "") - template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation) + template.Labels = CloneAndAddLabel(template.Labels, LabelKeyRunner, "") + template.Labels = CloneAndAddLabel(template.Labels, LabelKeyPodMutation, LabelValuePodMutation) if runnerSpec.GitHubAPICredentialsFrom != nil { - template.ObjectMeta.Annotations = CloneAndAddLabel(template.ObjectMeta.Annotations, annotationKeyGitHubAPICredsSecret, runnerSpec.GitHubAPICredentialsFrom.SecretRef.Name) + template.Annotations = CloneAndAddLabel(template.Annotations, annotationKeyGitHubAPICredsSecret, runnerSpec.GitHubAPICredentialsFrom.SecretRef.Name) } workDir := runnerSpec.WorkDir @@ -887,10 +888,11 @@ func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, ru for i := range template.Spec.Containers { c := template.Spec.Containers[i] - if c.Name == containerName { + switch c.Name { + case containerName: runnerContainerIndex = i runnerContainer = &c - } else if c.Name == "docker" { + case "docker": dockerdContainerIndex = i dockerdContainer = &c } @@ -1364,7 +1366,7 @@ func 
applyWorkVolumeClaimTemplateToPod(pod *corev1.Pod, workVolumeClaimTemplate } for i := range pod.Spec.Volumes { if pod.Spec.Volumes[i].Name == "work" { - return fmt.Errorf("Work volume should not be specified in container mode kubernetes. workVolumeClaimTemplate field should be used instead.") + return fmt.Errorf("work volume should not be specified in container mode kubernetes. workVolumeClaimTemplate field should be used instead") } } pod.Spec.Volumes = append(pod.Spec.Volumes, workVolumeClaimTemplate.V1Volume()) diff --git a/controllers/actions.summerwind.net/runner_pod_controller.go b/controllers/actions.summerwind.net/runner_pod_controller.go index 02aeb66a..22aa1718 100644 --- a/controllers/actions.summerwind.net/runner_pod_controller.go +++ b/controllers/actions.summerwind.net/runner_pod_controller.go @@ -79,7 +79,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } if len(envvars) == 0 { - return ctrl.Result{}, errors.New("Could not determine env vars for runner Pod") + return ctrl.Result{}, errors.New("could not determine env vars for runner Pod") } var enterprise, org, repo string @@ -103,8 +103,8 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } - if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() { - finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) + if runnerPod.DeletionTimestamp.IsZero() { + finalizers, added := addFinalizer(runnerPod.Finalizers, runnerPodFinalizerName) var cleanupFinalizersAdded bool if isContainerMode { @@ -113,7 +113,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if added || cleanupFinalizersAdded { newRunner := runnerPod.DeepCopy() - newRunner.ObjectMeta.Finalizers = finalizers + newRunner.Finalizers = finalizers if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil { log.Error(err, "Failed to update runner") @@ -142,7 +142,7 @@ func 
(r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } - if finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerLinkedResourcesFinalizerName); removed { + if finalizers, removed := removeFinalizer(runnerPod.Finalizers, runnerLinkedResourcesFinalizerName); removed { if err := r.cleanupRunnerLinkedPods(ctx, &runnerPod, log); err != nil { log.Info("Runner-linked pods clean up that has failed due to an error. If this persists, please manually remove the runner-linked pods to unblock ARC", "err", err.Error()) return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil @@ -152,7 +152,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil } patchedPod := runnerPod.DeepCopy() - patchedPod.ObjectMeta.Finalizers = finalizers + patchedPod.Finalizers = finalizers if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil { log.Error(err, "Failed to update runner for finalizer linked resources removal") @@ -163,7 +163,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( runnerPod = *patchedPod } - finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) + finalizers, removed := removeFinalizer(runnerPod.Finalizers, runnerPodFinalizerName) if removed { // In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner to be gracefully stopped before the deletion timestamp is set. @@ -175,7 +175,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } patchedPod := updatedPod.DeepCopy() - patchedPod.ObjectMeta.Finalizers = finalizers + patchedPod.Finalizers = finalizers // We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster. 
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil { @@ -284,7 +284,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod * var runnerLinkedPodList corev1.PodList if err := r.List(ctx, &runnerLinkedPodList, client.InNamespace(pod.Namespace), client.MatchingLabels( map[string]string{ - "runner-pod": pod.ObjectMeta.Name, + "runner-pod": pod.Name, }, )); err != nil { return fmt.Errorf("failed to list runner-linked pods: %w", err) @@ -295,7 +295,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod * errs []error ) for _, p := range runnerLinkedPodList.Items { - if !p.ObjectMeta.DeletionTimestamp.IsZero() { + if !p.DeletionTimestamp.IsZero() { continue } @@ -307,7 +307,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod * if kerrors.IsNotFound(err) || kerrors.IsGone(err) { return } - errs = append(errs, fmt.Errorf("delete pod %q error: %v", p.ObjectMeta.Name, err)) + errs = append(errs, fmt.Errorf("delete pod %q error: %v", p.Name, err)) } }() } @@ -330,7 +330,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, po var runnerLinkedSecretList corev1.SecretList if err := r.List(ctx, &runnerLinkedSecretList, client.InNamespace(pod.Namespace), client.MatchingLabels( map[string]string{ - "runner-pod": pod.ObjectMeta.Name, + "runner-pod": pod.Name, }, )); err != nil { return fmt.Errorf("failed to list runner-linked secrets: %w", err) @@ -341,7 +341,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, po errs []error ) for _, s := range runnerLinkedSecretList.Items { - if !s.ObjectMeta.DeletionTimestamp.IsZero() { + if !s.DeletionTimestamp.IsZero() { continue } @@ -353,7 +353,7 @@ func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, po if kerrors.IsNotFound(err) || kerrors.IsGone(err) { return } - errs = append(errs, fmt.Errorf("delete secret %q error: %v", s.ObjectMeta.Name, 
err)) + errs = append(errs, fmt.Errorf("delete secret %q error: %v", s.Name, err)) } }() } diff --git a/controllers/actions.summerwind.net/runner_pod_owner.go b/controllers/actions.summerwind.net/runner_pod_owner.go index 77cd8e3b..570a1402 100644 --- a/controllers/actions.summerwind.net/runner_pod_owner.go +++ b/controllers/actions.summerwind.net/runner_pod_owner.go @@ -90,7 +90,7 @@ var _ owner = (*ownerStatefulSet)(nil) func (s *ownerStatefulSet) pods(ctx context.Context, c client.Client) ([]corev1.Pod, error) { var podList corev1.PodList - if err := c.List(ctx, &podList, client.MatchingLabels(s.StatefulSet.Spec.Template.ObjectMeta.Labels)); err != nil { + if err := c.List(ctx, &podList, client.MatchingLabels(s.StatefulSet.Spec.Template.Labels)); err != nil { s.Log.Error(err, "Failed to list pods managed by statefulset") return nil, err } diff --git a/controllers/actions.summerwind.net/runnerdeployment_controller.go b/controllers/actions.summerwind.net/runnerdeployment_controller.go index 7753b640..e0075829 100644 --- a/controllers/actions.summerwind.net/runnerdeployment_controller.go +++ b/controllers/actions.summerwind.net/runnerdeployment_controller.go @@ -73,7 +73,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, client.IgnoreNotFound(err) } - if !rd.ObjectMeta.DeletionTimestamp.IsZero() { + if !rd.DeletionTimestamp.IsZero() { return ctrl.Result{}, nil } @@ -112,7 +112,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req } if newestSet == nil { - if err := r.Client.Create(ctx, desiredRS); err != nil { + if err := r.Create(ctx, desiredRS); err != nil { log.Error(err, "Failed to create runnerreplicaset resource") return ctrl.Result{}, err @@ -138,7 +138,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req } if newestTemplateHash != desiredTemplateHash { - if err := r.Client.Create(ctx, desiredRS); err != nil { + if err := r.Create(ctx, 
desiredRS); err != nil { log.Error(err, "Failed to create runnerreplicaset resource") return ctrl.Result{}, err @@ -159,7 +159,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req // but we still need to update the existing replicaset with it. // Otherwise selector-based runner query will never work on replicasets created before the controller v0.17.0 // See https://github.com/actions/actions-runner-controller/pull/355#discussion_r585379259 - if err := r.Client.Update(ctx, updateSet); err != nil { + if err := r.Update(ctx, updateSet); err != nil { log.Error(err, "Failed to update runnerreplicaset resource") return ctrl.Result{}, err @@ -195,7 +195,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req newestSet.Spec.Replicas = &newDesiredReplicas newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime - if err := r.Client.Update(ctx, newestSet); err != nil { + if err := r.Update(ctx, newestSet); err != nil { log.Error(err, "Failed to update runnerreplicaset resource") return ctrl.Result{}, err @@ -257,7 +257,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req updated := rs.DeepCopy() zero := 0 updated.Spec.Replicas = &zero - if err := r.Client.Update(ctx, updated); err != nil { + if err := r.Update(ctx, updated); err != nil { rslog.Error(err, "Failed to scale runnerreplicaset to zero") return ctrl.Result{}, err @@ -268,7 +268,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req continue } - if err := r.Client.Delete(ctx, &rs); err != nil { + if err := r.Delete(ctx, &rs); err != nil { rslog.Error(err, "Failed to delete runnerreplicaset resource") return ctrl.Result{}, err @@ -445,10 +445,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str templateHash := ComputeHash(&newRSTemplate) // Add template hash label to selector. 
- newRSTemplate.ObjectMeta.Labels = CloneAndAddLabel(newRSTemplate.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) + newRSTemplate.Labels = CloneAndAddLabel(newRSTemplate.Labels, LabelKeyRunnerTemplateHash, templateHash) // This label selector is used by default when rd.Spec.Selector is empty. - newRSTemplate.ObjectMeta.Labels = CloneAndAddLabel(newRSTemplate.ObjectMeta.Labels, LabelKeyRunnerDeploymentName, rd.Name) + newRSTemplate.Labels = CloneAndAddLabel(newRSTemplate.Labels, LabelKeyRunnerDeploymentName, rd.Name) selector := getSelector(rd) @@ -457,9 +457,9 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str rs := v1alpha1.RunnerReplicaSet{ TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{ - GenerateName: rd.ObjectMeta.Name + "-", - Namespace: rd.ObjectMeta.Namespace, - Labels: newRSTemplate.ObjectMeta.Labels, + GenerateName: rd.Name + "-", + Namespace: rd.Namespace, + Labels: newRSTemplate.Labels, }, Spec: v1alpha1.RunnerReplicaSetSpec{ Replicas: rd.Spec.Replicas, diff --git a/controllers/actions.summerwind.net/runnerreplicaset_controller.go b/controllers/actions.summerwind.net/runnerreplicaset_controller.go index f86d80fb..945affbb 100644 --- a/controllers/actions.summerwind.net/runnerreplicaset_controller.go +++ b/controllers/actions.summerwind.net/runnerreplicaset_controller.go @@ -62,7 +62,7 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, client.IgnoreNotFound(err) } - if !rs.ObjectMeta.DeletionTimestamp.IsZero() { + if !rs.DeletionTimestamp.IsZero() { // RunnerReplicaSet cannot be gracefuly removed. // That means any runner that is running a job can be prematurely terminated. 
// To gracefully remove a RunnerReplicaSet, scale it down to zero first, observe RunnerReplicaSet's status replicas, @@ -70,14 +70,14 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, nil } - if rs.ObjectMeta.Labels == nil { - rs.ObjectMeta.Labels = map[string]string{} + if rs.Labels == nil { + rs.Labels = map[string]string{} } // Template hash is usually set by the upstream controller(RunnerDeplloyment controller) on authoring // RunerReplicaset resource, but it may be missing when the user directly created RunnerReplicaSet. // As a template hash is required by by the runner replica management, we dynamically add it here without ever persisting it. - if rs.ObjectMeta.Labels[LabelKeyRunnerTemplateHash] == "" { + if rs.Labels[LabelKeyRunnerTemplateHash] == "" { template := rs.Spec.DeepCopy() template.Replicas = nil template.EffectiveTime = nil @@ -85,8 +85,8 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req log.Info("Using auto-generated template hash", "value", templateHash) - rs.ObjectMeta.Labels = CloneAndAddLabel(rs.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) - rs.Spec.Template.ObjectMeta.Labels = CloneAndAddLabel(rs.Spec.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) + rs.Labels = CloneAndAddLabel(rs.Labels, LabelKeyRunnerTemplateHash, templateHash) + rs.Spec.Template.Labels = CloneAndAddLabel(rs.Spec.Template.Labels, LabelKeyRunnerTemplateHash, templateHash) } selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector) @@ -169,8 +169,8 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1 // the "runner template hash" label to the template.meta which is necessary to make this controller work correctly objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy() - objectMeta.GenerateName = rs.ObjectMeta.Name + "-" - objectMeta.Namespace = rs.ObjectMeta.Namespace + objectMeta.GenerateName = rs.Name + "-" + 
objectMeta.Namespace = rs.Namespace if objectMeta.Annotations == nil { objectMeta.Annotations = map[string]string{} } diff --git a/controllers/actions.summerwind.net/runnerset_controller.go b/controllers/actions.summerwind.net/runnerset_controller.go index 5fd825a2..92919c0b 100644 --- a/controllers/actions.summerwind.net/runnerset_controller.go +++ b/controllers/actions.summerwind.net/runnerset_controller.go @@ -77,7 +77,7 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } - if !runnerSet.ObjectMeta.DeletionTimestamp.IsZero() { + if !runnerSet.DeletionTimestamp.IsZero() { r.GitHubClient.DeinitForRunnerSet(runnerSet) return ctrl.Result{}, nil @@ -191,11 +191,11 @@ func (r *RunnerSetReconciler) newStatefulSet(ctx context.Context, runnerSet *v1a runnerSetWithOverrides.Labels = append(runnerSetWithOverrides.Labels, r.CommonRunnerLabels...) template := corev1.Pod{ - ObjectMeta: runnerSetWithOverrides.StatefulSetSpec.Template.ObjectMeta, - Spec: runnerSetWithOverrides.StatefulSetSpec.Template.Spec, + ObjectMeta: runnerSetWithOverrides.Template.ObjectMeta, + Spec: runnerSetWithOverrides.Template.Spec, } - if runnerSet.Spec.RunnerConfig.ContainerMode == "kubernetes" { + if runnerSet.Spec.ContainerMode == "kubernetes" { found := false for i := range template.Spec.Containers { if template.Spec.Containers[i].Name == containerName { @@ -208,7 +208,7 @@ func (r *RunnerSetReconciler) newStatefulSet(ctx context.Context, runnerSet *v1a }) } - workDir := runnerSet.Spec.RunnerConfig.WorkDir + workDir := runnerSet.Spec.WorkDir if workDir == "" { workDir = "/runner/_work" } @@ -219,7 +219,7 @@ func (r *RunnerSetReconciler) newStatefulSet(ctx context.Context, runnerSet *v1a template.Spec.ServiceAccountName = runnerSet.Spec.ServiceAccountName } - template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerSet.Name) + template.Labels = CloneAndAddLabel(template.Labels, 
LabelKeyRunnerSetName, runnerSet.Name) ghc, err := r.GitHubClient.InitForRunnerSet(ctx, runnerSet) if err != nil { @@ -228,38 +228,38 @@ func (r *RunnerSetReconciler) newStatefulSet(ctx context.Context, runnerSet *v1a githubBaseURL := ghc.GithubBaseURL - pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.RunnerConfig.ContainerMode, template, runnerSet.Spec.RunnerConfig, githubBaseURL, r.RunnerPodDefaults) + pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.ContainerMode, template, runnerSet.Spec.RunnerConfig, githubBaseURL, r.RunnerPodDefaults) if err != nil { return nil, err } - runnerSetWithOverrides.StatefulSetSpec.Template.ObjectMeta = pod.ObjectMeta - runnerSetWithOverrides.StatefulSetSpec.Template.Spec = pod.Spec + runnerSetWithOverrides.Template.ObjectMeta = pod.ObjectMeta + runnerSetWithOverrides.Template.Spec = pod.Spec // NOTE: Seems like the only supported restart policy for statefulset is "Always"? // I got errosr like the below when tried to use "OnFailure": // StatefulSet.apps \"example-runnersetpg9rx\" is invalid: [spec.template.metadata.labels: Invalid value: map[string]string{\"runner-template-hash\" // :\"85d7578bd6\", \"runnerset-name\":\"example-runnerset\"}: `selector` does not match template `labels`, spec. // template.spec.restartPolicy: Unsupported value: \"OnFailure\": supported values: \"Always\"] - runnerSetWithOverrides.StatefulSetSpec.Template.Spec.RestartPolicy = corev1.RestartPolicyAlways + runnerSetWithOverrides.Template.Spec.RestartPolicy = corev1.RestartPolicyAlways templateHash := ComputeHash(pod.Spec) // Add template hash label to selector. 
- runnerSetWithOverrides.Template.ObjectMeta.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) + runnerSetWithOverrides.Template.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.Labels, LabelKeyRunnerTemplateHash, templateHash) selector := getRunnerSetSelector(runnerSet) selector = CloneSelectorAndAddLabel(selector, LabelKeyRunnerTemplateHash, templateHash) selector = CloneSelectorAndAddLabel(selector, LabelKeyRunnerSetName, runnerSet.Name) selector = CloneSelectorAndAddLabel(selector, LabelKeyPodMutation, LabelValuePodMutation) - runnerSetWithOverrides.StatefulSetSpec.Selector = selector + runnerSetWithOverrides.Selector = selector rs := appsv1.StatefulSet{ TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{ - GenerateName: runnerSet.ObjectMeta.Name + "-", - Namespace: runnerSet.ObjectMeta.Namespace, - Labels: CloneAndAddLabel(runnerSet.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash), + GenerateName: runnerSet.Name + "-", + Namespace: runnerSet.Namespace, + Labels: CloneAndAddLabel(runnerSet.Labels, LabelKeyRunnerTemplateHash, templateHash), Annotations: map[string]string{ SyncTimeAnnotationKey: time.Now().Format(time.RFC3339), }, diff --git a/controllers/actions.summerwind.net/sync_volumes.go b/controllers/actions.summerwind.net/sync_volumes.go index a8cbae0f..2b3247e2 100644 --- a/controllers/actions.summerwind.net/sync_volumes.go +++ b/controllers/actions.summerwind.net/sync_volumes.go @@ -23,7 +23,7 @@ const ( func syncVolumes(ctx context.Context, c client.Client, log logr.Logger, ns string, runnerSet *v1alpha1.RunnerSet, statefulsets []appsv1.StatefulSet) (*ctrl.Result, error) { log = log.WithValues("ns", ns) - for _, t := range runnerSet.Spec.StatefulSetSpec.VolumeClaimTemplates { + for _, t := range runnerSet.Spec.VolumeClaimTemplates { for _, sts := range statefulsets { pvcName := fmt.Sprintf("%s-%s-0", t.Name, sts.Name) diff --git 
a/controllers/actions.summerwind.net/testresourcereader.go b/controllers/actions.summerwind.net/testresourcereader.go index 30112473..8f0e7012 100644 --- a/controllers/actions.summerwind.net/testresourcereader.go +++ b/controllers/actions.summerwind.net/testresourcereader.go @@ -16,7 +16,7 @@ type testResourceReader struct { } func (r *testResourceReader) Get(_ context.Context, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error { - nsName := types.NamespacedName{Namespace: key.Namespace, Name: key.Name} + nsName := types.NamespacedName(key) ret, ok := r.objects[nsName] if !ok { return &kerrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonNotFound}} diff --git a/controllers/actions.summerwind.net/utils_test.go b/controllers/actions.summerwind.net/utils_test.go index 22917343..2f2234e6 100644 --- a/controllers/actions.summerwind.net/utils_test.go +++ b/controllers/actions.summerwind.net/utils_test.go @@ -64,22 +64,22 @@ func Test_workVolumeClaimTemplateVolumeV1VolumeTransformation(t *testing.T) { t.Errorf("want name %q, got %q\n", want.Name, got.Name) } - if got.VolumeSource.Ephemeral == nil { + if got.Ephemeral == nil { t.Fatal("work volume claim template should transform itself into Ephemeral volume source\n") } - if got.VolumeSource.Ephemeral.VolumeClaimTemplate == nil { + if got.Ephemeral.VolumeClaimTemplate == nil { t.Fatal("work volume claim template should have ephemeral volume claim template set\n") } - gotClassName := *got.VolumeSource.Ephemeral.VolumeClaimTemplate.Spec.StorageClassName - wantClassName := *want.VolumeSource.Ephemeral.VolumeClaimTemplate.Spec.StorageClassName + gotClassName := *got.Ephemeral.VolumeClaimTemplate.Spec.StorageClassName + wantClassName := *want.Ephemeral.VolumeClaimTemplate.Spec.StorageClassName if gotClassName != wantClassName { t.Errorf("expected storage class name %q, got %q\n", wantClassName, gotClassName) } - gotAccessModes := 
got.VolumeSource.Ephemeral.VolumeClaimTemplate.Spec.AccessModes - wantAccessModes := want.VolumeSource.Ephemeral.VolumeClaimTemplate.Spec.AccessModes + gotAccessModes := got.Ephemeral.VolumeClaimTemplate.Spec.AccessModes + wantAccessModes := want.Ephemeral.VolumeClaimTemplate.Spec.AccessModes if len(gotAccessModes) != len(wantAccessModes) { t.Fatalf("access modes lengths missmatch: got %v, expected %v\n", gotAccessModes, wantAccessModes) } diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json new file mode 100644 index 00000000..43f53e01 --- /dev/null +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json @@ -0,0 +1,2177 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.5.2" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 15, + "panels": [], + "title": "Runner Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heat map showing the typical time before a job starts and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 1 + }, + "id": 7, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Turbo", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Wait Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_startup_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": 
"Startup Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heat map showing the typical time to complete a job and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 7, + "y": 1 + }, + "id": 6, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_execution_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Job Execution", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of jobs assigned to the scale set. 
The threshold is triggered when the number of assigned jobs exceeds the number of desired runners. This indicates that not all jobs will have an available runner.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_assigned_jobs{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners" + } + ], + "title": "Assigned Jobs", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\", 
actions_github_com_scale_set_name=~\"(arc-runner-set|dind-runner-set|k8s-runner-set)\"}) + 1", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runners desired by the scale set", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Desired Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of registered runners that do not have assigned jobs.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + 
"w": 3, + "x": 6, + "y": 8 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_idle_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Idle Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of workflow jobs currently executing", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 10, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum (gha_running_jobs{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Running Jobs", + 
"type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of runners in a failed state. These runners are typically misconfigured and count against the scale set's maximum limit.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 26, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_controller_failed_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Failed Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of active scale set listeners", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": true, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 13 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + 
"fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_controller_running_listeners{namespace=~\"$SystemNamespace\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Listeners", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runner pods that are waiting to be created. When this number exceeds the number of pods Kubernetes reports as Waiting, it indicate cluster performance issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 13 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_controller_pending_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": 
"sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "Waiting" + } + ], + "title": "Pending Runners", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "Waiting", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(kube_pod_container_status_waiting{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\"}) != 0 or vector(0)", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of runners registered for processing queued jobs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 13 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_registered_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Registered Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runner pods in a running state", + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 13 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "max(gha_controller_running_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of containers that are reporting that they were terminated by an out-of-memory condition (OOMK.iller)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "No issues detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 13 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + 
"textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$RunnerNamespace\"}) by (namespace)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Out of Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The peak memory used by a container in a given scale set's namespace.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-green", + "mode": "shades" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 12, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max(container_memory_working_set_bytes{namespace=~\"$RunnerNamespace\"}) by (namespace)", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Peak Container Memory", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The sum of the reads and writes occurring within the runner namespace.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 54, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 5, + "y": 18 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_writes_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "instant": false, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_reads_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Container I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The Kubernetes-reported pod status.", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "noValue": "No active pods", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": null + }, + { + "color": "green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 11, + "y": 18 + }, + "id": 11, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_status_ready{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "format": "time_series", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "Ready", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Waiting", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Completed\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Completed", + "range": true, + "refId": 
"D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Error\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Error", + "range": true, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners)+1", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners", + "useBackend": false + } + ], + "title": "Container Pod Status", + "transformations": [ + { + "id": "configFromData", + "options": { + "applyTo": { + "id": "byName", + "options": "Ready" + }, + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners) + 1", + "handlerKey": "threshold1" + }, + { + "fieldName": "sum(gha_desired_runners) -5", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 16, + "panels": [], + "title": "Controller Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The average time required for a reconciliation request to be processed. 
This reflects the time required for the controller to process a single request to modify a Kubernetes resource.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 25 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_time_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Reconcile Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The average time a queued reconciliation request spends waiting to be processed.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 25 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(workqueue_queue_duration_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Workqueue Queue Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. 
An increasing number can indicate resource contention or delays processing API server requests.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 25 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_errors_total{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Reconciliation Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of reconcile requests that are waiting to be processed by the controller. A growing queue depth can indicate that the Kubernetes API Server or the controller does not have enough resources. 
This can lead to pods taking longer to be deleted or started. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 33 + }, + "id": 20, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum (workqueue_depth{namespace=\"$SystemNamespace\"}) by (name)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Queue Depth", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of workers that are currently being used to process reconcile requests. 
Increasing this number can reduce the work queue duration, but each new worker adds a small amount of time due to context switching.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 33 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (controller) (controller_runtime_active_workers)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Active Workers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of calls to the API server", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 33 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (method, code) (rate(rest_client_requests_total{namespace=\"$SystemNamespace\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "API Calls", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 25, + "panels": [], + "title": "Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The time required by Prometheus to read and process metrics. Long scrape times can delay metrics updates or lead to metrics loss. 
Increasing time often indicates issues with metrics cardinality or cluster resources.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 18, + "x": 0, + "y": 42 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scrape_duration_seconds", + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Scrape Duration", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_controller_running_listeners,namespace)", + "description": "The ARC system namespace", + "includeAll": true, 
+ "label": "ARC System Namespace", + "multi": true, + "name": "SystemNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_controller_running_listeners,namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "description": "The name of the runner scale set", + "includeAll": true, + "label": "Scale Set", + "multi": true, + "name": "Scaleset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "description": "Namespace containing the runners", + "includeAll": true, + "label": "Runner Namespace", + "multi": true, + "name": "RunnerNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d", + "7d" + ] + }, + "timezone": "", + "title": "ARC Autoscaling Runner Set Monitoring", + "uid": "af21e938-2151-4bf2-b798-8cf9232f947a", + "version": 1, + "weekStart": "" +} diff --git 
a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json deleted file mode 100644 index ed997340..00000000 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json +++ /dev/null @@ -1,1248 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Starter dashboard to monitor the behavior of the autoscaling runner set mode of actions-runner-controller", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 4, - "panels": [], - "title": "Runtime", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of active listener pods (in a running state)", - "fieldConfig": { - "defaults": { - "color": { - 
"mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 0, - "y": 1 - }, - "id": 14, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_running_listeners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Active listeners", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of registered and running runners across namespaces", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 3, - "x": 2, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_registered_runners)", - "hide": false, - "instant": false, - "legendFormat": "Registered", - "range": true, - "refId": "Registered" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_running_ephemeral_runners)", - "hide": false, - 
"instant": false, - "interval": "", - "legendFormat": "Running", - "range": true, - "refId": "Running" - } - ], - "title": "Runners States", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of failed runners across namespaces", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1 - }, - { - "color": "dark-red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 5, - "y": 1 - }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "sum by(namespace) (gha_controller_failed_ephemeral_runners)", - "instant": false, - "interval": "", - "legendFormat": "__auto", - "range": true, - "refId": "Failed" - } - ], - "title": "Failed (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of pending runners across namespaces.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 7, - "y": 1 - }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - 
"fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_pending_ephemeral_runners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Pending (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of registered runners that are not currently running a job.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 9, - "y": 1 - }, - "id": 17, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_idle_runners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Idle (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Jobs that are assigned to the runner scale set but that are not yet accepted but it", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - 
"lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 11, - "y": 1 - }, - "id": 1, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_assigned_jobs)", - "instant": false, - "interval": "1m", - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Total assigned jobs per listener", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of jobs that are assigned to the runner scale set but that are not yet accepted vs the number of accepted and running jobs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - 
"mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 17, - "y": 1 - }, - "id": 3, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_assigned_jobs)", - "instant": false, - "legendFormat": "assigned job - {{job}}", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_running_jobs)", - "hide": false, - "instant": false, - "legendFormat": "running_jobs - {{job}}", - "range": true, - "refId": "B" - } - ], - "title": "Assigned vs running jobs", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - 
"y": 9 - }, - "id": 10, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg by(job) (gha_job_startup_duration_seconds_sum)", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Average startup duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "id": 9, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg by(job) (gha_job_execution_duration_seconds_sum)", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - 
"refId": "A" - } - ], - "title": "Average execution duration (seconds)", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 17 - }, - "id": 6, - "panels": [], - "title": "Controllers", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 0, - "y": 18 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "rate(controller_runtime_reconcile_errors_total{job=\"arc-controller-service\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{controller}}", - "range": true, - "refId": "A" - } - ], - "title": "Reconciliation errors", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - 
"axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 9, - "x": 7, - "y": 18 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "rate(controller_runtime_reconcile_time_seconds_sum{job=\"arc-controller-service\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{controller}}", - "range": true, - "refId": "A" - } - ], - "title": "Reconciliation time (seconds)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": 
{ - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 13, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "workqueue_depth{job=\"arc-controller-service\"}", - "instant": false, - "legendFormat": "{{name}}", - "range": true, - "refId": "A" - } - ], - "title": "Workqueue depth", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 26 - }, - "id": 12, - "panels": [], - "title": "Prometheus", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 27 - }, - "id": 11, - 
"options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "scrape_duration_seconds", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Scrape Duration (seconds)", - "type": "timeseries" - } - ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "utc", - "title": "ARC Autoscaling Runner Set Monitoring", - "uid": "afe41561-2151-4bf2-b798-79aa6c03412c", - "version": 29, - "weekStart": "" -} \ No newline at end of file diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md index aa869a73..c72961a7 100644 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md @@ -1,6 +1,11 @@ # Visualizing Autoscaling Runner Scale Set metrics with Grafana -With metrics introduced in [gha-runner-scale-set-0.5.0](https://github.com/actions/actions-runner-controller/releases/tag/gha-runner-scale-set-0.5.0), you can now visualize the autoscaling behavior of your runner scale set with your tool of choice. This sample shows how to visualize the metrics with [Grafana](https://grafana.com/). 
+With the metrics support introduced in [gha-runner-scale-set-0.5.0](https://github.com/actions/actions-runner-controller/releases/tag/gha-runner-scale-set-0.5.0), you can visualize the autoscaling behavior of your runner scale set with your tool of choice. + +This sample dashboard shows how to visualize the metrics with [Grafana](https://grafana.com/). + +> [!NOTE] +> We do not intend to provide a supported ARC dashboard. This is simply a reference and a demonstration for how you could leverage the metrics emitted by the controller-manager and listeners to visualize the autoscaling behavior of your runner scale set. We offer no promises of future upgrades to this sample. ## Demo @@ -8,12 +13,43 @@ With metrics introduced in [gha-runner-scale-set-0.5.0](https://github.com/actio ## Setup -We do not intend to provide a supported ARC dashboard. This is simply a reference and a demonstration for how you could leverage the metrics emitted by the controller-manager and listeners to visualize the autoscaling behavior of your runner scale set. We offer no promises of future upgrades to this sample. - 1. Make sure to have [Grafana](https://grafana.com/docs/grafana/latest/installation/) and [Prometheus](https://prometheus.io/docs/prometheus/latest/installation/) running in your cluster. 2. Make sure that Prometheus is properly scraping the metrics endpoints of the controller-manager and listeners. 3. Import the [dashboard](ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json) into Grafana. +## Required metrics + +This sample relies on the suggested listener metrics configuration in the scale set [values.yaml](https://github.com/actions/actions-runner-controller/blob/ea27448da51385470b1ce67150aa695cfa45fd3f/charts/gha-runner-scale-set/values.yaml#L129-L270). 
+ +The following metrics are required to be scraped by Prometheus in order to populate the dashboard: + +| Metric | Required labels | Source | +| ------ | ----------- | -----| +| container_fs_writes_bytes_total | namespace | cAdvisor +| container_fs_reads_bytes_total | namespace | cAdvisor +| container_memory_working_set_bytes | namespace | cAdvisor +| controller_runtime_active_workers | controller | ARC Controller +| controller_runtime_reconcile_time_seconds_sum | namespace | ARC Controller +| controller_runtime_reconcile_errors_total | namespace | ARC Controller +| gha_assigned_jobs | actions_github_com_scale_set_name, namespace | ARC Controller +| gha_controller_failed_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_pending_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_running_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_running_listeners | namespace | ARC Controller +| gha_desired_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_idle_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_job_execution_duration_seconds_bucket | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| gha_job_startup_duration_seconds_bucket | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| gha_registered_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_running_jobs | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| kube_pod_container_status_ready | namespace | kube-state-metrics +| kube_pod_container_status_terminated_reason | namespace, reason | kube-state-metrics +| kube_pod_container_status_waiting | namespace | kube-state-metrics +| rest_client_requests_total | code, method, namespace | ARC Controller +| scrape_duration_seconds | | prometheus +| workqueue_depth | name, namespace | ARC Controller +| 
workqueue_queue_duration_seconds_sum | namespace | ARC Controller + ## Details This dashboard demonstrates some of the metrics provided by ARC and the underlying Kubernetes runtime. It provides a sample visualization of the behavior of the runner scale set, the ARC controllers, and the listeners. This should not be considered a comprehensive dashboard; it is a starting point that can be used with other metrics and logs to understand the health of the cluster. Review the [GitHub documentation detailing the Actions Runner Controller metrics and how to enable them](https://docs.github.com/en/enterprise-server@3.10/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/deploying-runner-scale-sets-with-actions-runner-controller#enabling-metrics). @@ -22,16 +58,25 @@ The dashboard includes the following metrics: | Label | Description | | -------------------------------- | ----------------------------------------------------| -| Active listeners | The number of listeners currently running and attempting to manage jobs for the scale set. This should match the number of scale sets deployed. | -| Runner States | Displays the number of runners in a given state. The finished and deleted states are not included in this panel. | -| Failed (total) | The total number of ephemeral runners that have failed to properly start. This may require reviewing the custom resource and logs to identify and resolve the root causes. Common causes include resource issues and failure to pull the required image. | -| Pending (total) | The total number of ephemeral runners that ARC has requested and is waiting for Kubernetes to provide in a running state. If the Kubernetes API server is responsive, this will typically match the number of runner pods that are in a pending state. This number includes requests for runner pods that have not yet been scheduled. 
When this number is higher than the number of runner pods in a pending state, it can indicate performance issues with the API server and resource contention. | -| Idle (total) | The total number of ephemeral runners that are available to accept jobs across all scale sets. Keeping a pool of idle runners can enable a faster start time under load, but excessive idle runners will consume resources and can prevent nodes from scaling down. | -| Total assigned jobs per listener | The number of workflow jobs acquired and assigned to the listener. The listener must provide supporting runners to complete these jobs. Once jobs are assigned, they cannot be delegated to other listeners and must be processed by the scale set or cancelled. | -| Assigned vs running jobs | Compares the number of jobs assigned against the number of runners that are currently processing jobs. When running jobs is less than assigned jobs, it can indicate that ARC is waiting on Kubernetes to provide and start additional runners. | -| Average startup duration | The average time in seconds between when jobs are assigned and when a runner accepts the job and begins processing. An increasing duration can indicate that the cluster has resource contention or a lack of available nodes for scheduling jobs | -| Average execution duration | The average time in seconds that runners are taking to complete a job. Changes in this value reflect the efficiency of workflow jobs and the pod configuration. If the value is decreasing without changes to the job, it can indicate resource contention or CPU throttling. | +| Startup Duration | Heat map of the wait time before a job starts, with the colors indicating the increase in the number of jobs in that time bucket. An increasing time can indicate that the cluster is resource constrained and may need additional nodes or resources to handle the load. 
| +| Execution Duration | Heat map of the execution time for a job, with the colors indicating the increase in the number of jobs in that time bucket. Time can be affected by the number of steps in the job, the allocated CPU, and whether there is resource contention on the node that is impacting performance | +| Assigned Jobs | The number of jobs that have been assigned to the listener. This is the number of jobs that the listener is responsible for providing a runner to process. | +| Desired Runners | The number of runners that the listener is requesting from the controller. This is the number of runners required to process the assigned jobs and provide idle runners. It is limited by the configured maximum runner count for the scale set. | +| Idle Runners | The total number of ephemeral runners that are available to accept jobs across all selected scale sets. Keeping a pool of idle runners can enable a faster start time under load, but excessive idle runners will consume resources and can prevent nodes from scaling down. | +| Running Jobs | The number of runners that are currently processing jobs. | +| Failed Runners | The total number of ephemeral runners that have failed to properly start. This may require reviewing the custom resource and logs to identify and resolve the root causes. Common causes include resource issues and failure to pull the required image. | +| Listeners | The number of listeners currently running and attempting to manage jobs for the scale set. This should match the number of scale sets deployed. | +| Pending Runners | The total number of ephemeral runners that ARC has requested and is waiting for Kubernetes to provide in a running state. If the Kubernetes API server is responsive, this will typically match the number of runner pods that are in a pending state. This number includes requests for runner pods that have not yet been scheduled. 
When this number is higher than the number of runner pods in a pending state, it can indicate performance issues. | +| Registered Runners | The total number of ephemeral runners that have been successfully registered. | +| Active Runners | The total number of runners that are active and either available or processing jobs. | +| Out of Memory | The number of containers that have been terminated by the OOMKiller. This can indicate that the requests/limits for one or more pods on the node were configured improperly, allowing pods to request more memory than the node had available. | +| Peak Container Memory | The maximum amount of memory used by any container in a given namespace during the selected time. This can be used for tuning the memory limits for the pods and for alerts as containers get close to their limits. | +| Container I/O | Shows the number of bytes read and written to the container filesystem. This can be used to identify if the container is reading or writing a large amount of data to the filesystem, which can impact performance. | +| Container Pod Status | Shows the number of containers in each status (waiting, running, terminated, ready). This can be used to identify if there are a large number of containers that are failing to start or are in a waiting state. | +| Reconcile time | The time to perform a single reconciliation task from a controller's work queue. This metric reflects the time it takes for ARC to complete each step in the processing of creating, managing, and cleaning up runners. As this increases, it can indicate resource contention, processing delays, or delays from the API server. | +| Workqueue Queue Duration | The time items spent in the work queue for a controller before being processed. This is often related to the work queue depth; as the number of items increases, it can take an increasing amount of time for an item to be processed. 
| | Reconciliation errors | Reconciliation is the process of a controller ensuring the desired state and actual state of the resources match. Each time an event occurs on a resource watched by the controller, the controller is required to indicate if the new state matches the desired state. Kubernetes adds a task to the work queue for the controller to perform this reconciliation. Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. An increasing number can indicate resource contention or delays processing API server requests. This reflects Kubernetes resources that ARC is waiting to be provided or in the necessary state. As a concrete example, ARC will request the creation of a secret prior to creating the pod. If the response indicates the secret is not immediately ready, ARC will requeue the reconciliation task with the error details, incrementing this count. | -| Reconciliation time | A histogram reflecting the time in seconds to perform a single reconciliation task from the controller's work queue. A histogram counts the number of requests that are processed within a given bucket of time. This metric reflects the time it takes for ARC to complete each step in the processing of creating, managing, and cleaning up runners. As this increases, it can indicate resource contention or processing delays within Kubernetes or the API server. This displays shows an average, which may hide larger or smaller times that are occurring in the processing. | -| Workqueue depth | The number of tasks that Kubernetes queued for the ARC controllers to process. This includes reconciliation requests and tasks from ARC. ARC sequentially processes a work queue of single, small task to avoid concurrency issues. Managing a runner requires multiple steps to prepare, create, update, and delete the runner, its resources, and the ARC custom resources. 
As each step is completed (or trigger reconciliation), new tasks are queued for processing. As the depth increases, it indicates more tasks awaiting time from the controller. Growth indicates increasing work and may indicate Kubernetes resource contention or processing latencies. Each request for a new runner will result in multiple tasks being added to the work queue to prepare and create the runner and the related ARC custom resources. | +| Workqueue depth | The number of tasks that Kubernetes has queued for the ARC controllers to process. This includes reconciliation requests and tasks initiated by the controller. Managing a runner requires multiple steps to prepare, create, update, and delete the runner, its resources, and the ARC custom resources. As each step is completed (or triggers reconciliation), new tasks are queued for processing. The controller will then use one or more workers to process these tasks in the order they were queued. As the depth increases, it indicates more tasks awaiting time from the controller. Growth indicates increasing work and may reflect Kubernetes resource contention or processing latencies. Each request for a new runner will result in multiple tasks being added to the work queue to prepare and create the runner and the related ARC custom resources. | +| Active Workers | The number of workers that are actively processing tasks in the work queue. If the queue is empty, then there may be no workers required to process the tasks. The number of workers for the ephemeral runner is configurable in the scale set values file. | +| API Calls | Shows the number of calls to the API server by status code and HTTP method. The method indicates the type of activity being performed, while the status code indicates the result of the activity. Error codes of 500 and above often indicate a Kubernetes issue. | | Scrape Duration (seconds) | The amount of time required for Prometheus to read the configured metrics from components in the cluster. 
An increasing number may indicate a lack of resources for Prometheus and a risk of the process exceeding the configured timeout, leading to lost metrics data. | diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png index fd8f69cb..3860c4f0 100644 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b871862ef58b3480017edfa168d54f8269c8f5c542eb27e9da3e6fcb72294ecb -size 606907 +oid sha256:9bf448c6e9dad0e9e615f82e17883cf34b09b14f5461189167b798df40106c27 +size 351602 diff --git a/github/actions/client.go b/github/actions/client.go index 607551c0..9f6f8886 100644 --- a/github/actions/client.go +++ b/github/actions/client.go @@ -1212,7 +1212,7 @@ func createJWTForGitHubApp(appAuth *GitHubAppAuth) (string, error) { claims := &jwt.RegisteredClaims{ IssuedAt: jwt.NewNumericDate(issuedAt), ExpiresAt: jwt.NewNumericDate(expiresAt), - Issuer: strconv.FormatInt(appAuth.AppID, 10), + Issuer: appAuth.AppID, } token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims) diff --git a/github/actions/client_job_acquisition_test.go b/github/actions/client_job_acquisition_test.go index 38c81e05..d155df09 100644 --- a/github/actions/client_job_acquisition_test.go +++ b/github/actions/client_job_acquisition_test.go @@ -54,7 +54,7 @@ func TestAcquireJobs(t *testing.T) { RunnerScaleSet: &actions.RunnerScaleSet{Id: 1}, MessageQueueAccessToken: "abc", } - var requestIDs []int64 = []int64{1} + var requestIDs = []int64{1} retryMax := 1 actualRetry := 0 diff --git a/github/actions/client_runner_test.go b/github/actions/client_runner_test.go index 1ad4947e..40525bde 100644 --- a/github/actions/client_runner_test.go +++ b/github/actions/client_runner_test.go @@ -67,7 +67,7 @@ func 
TestGetRunnerByName(t *testing.T) { t.Run("Get Runner by Name", func(t *testing.T) { var runnerID int64 = 1 - var runnerName string = "self-hosted-ubuntu" + var runnerName = "self-hosted-ubuntu" want := &actions.RunnerReference{ Id: int(runnerID), Name: runnerName, @@ -87,7 +87,7 @@ func TestGetRunnerByName(t *testing.T) { }) t.Run("Get Runner by name with not exist runner", func(t *testing.T) { - var runnerName string = "self-hosted-ubuntu" + var runnerName = "self-hosted-ubuntu" response := []byte(`{"count": 0, "value": []}`) server := newActionsServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -103,7 +103,7 @@ func TestGetRunnerByName(t *testing.T) { }) t.Run("Default retries on server error", func(t *testing.T) { - var runnerName string = "self-hosted-ubuntu" + var runnerName = "self-hosted-ubuntu" retryWaitMax := 1 * time.Millisecond retryMax := 1 @@ -181,7 +181,7 @@ func TestGetRunnerGroupByName(t *testing.T) { t.Run("Get RunnerGroup by Name", func(t *testing.T) { var runnerGroupID int64 = 1 - var runnerGroupName string = "test-runner-group" + var runnerGroupName = "test-runner-group" want := &actions.RunnerGroup{ ID: runnerGroupID, Name: runnerGroupName, @@ -201,7 +201,7 @@ func TestGetRunnerGroupByName(t *testing.T) { }) t.Run("Get RunnerGroup by name with not exist runner group", func(t *testing.T) { - var runnerGroupName string = "test-runner-group" + var runnerGroupName = "test-runner-group" response := []byte(`{"count": 0, "value": []}`) server := newActionsServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/github/actions/identifier_test.go b/github/actions/identifier_test.go index 5604d894..528e0521 100644 --- a/github/actions/identifier_test.go +++ b/github/actions/identifier_test.go @@ -57,7 +57,7 @@ func TestClient_Identifier(t *testing.T) { } defaultAppCreds := &actions.ActionsAuth{ AppCreds: &actions.GitHubAppAuth{ - AppID: 123, + AppID: "123", AppInstallationID: 123, AppPrivateKey: 
"private key", }, @@ -90,7 +90,7 @@ func TestClient_Identifier(t *testing.T) { old: defaultAppCreds, new: &actions.ActionsAuth{ AppCreds: &actions.GitHubAppAuth{ - AppID: 456, + AppID: "456", AppInstallationID: 456, AppPrivateKey: "new private key", }, diff --git a/github/actions/multi_client.go b/github/actions/multi_client.go index 01cb7abf..f8e16071 100644 --- a/github/actions/multi_client.go +++ b/github/actions/multi_client.go @@ -23,7 +23,8 @@ type multiClient struct { } type GitHubAppAuth struct { - AppID int64 + // AppID is the ID or the Client ID of the application + AppID string AppInstallationID int64 AppPrivateKey string } @@ -124,16 +125,11 @@ func (m *multiClient) GetClientFromSecret(ctx context.Context, githubConfigURL, return m.GetClientFor(ctx, githubConfigURL, auth, namespace, options...) } - parsedAppID, err := strconv.ParseInt(appID, 10, 64) - if err != nil { - return nil, err - } - parsedAppInstallationID, err := strconv.ParseInt(appInstallationID, 10, 64) if err != nil { return nil, err } - auth.AppCreds = &GitHubAppAuth{AppID: parsedAppID, AppInstallationID: parsedAppInstallationID, AppPrivateKey: appPrivateKey} + auth.AppCreds = &GitHubAppAuth{AppID: appID, AppInstallationID: parsedAppInstallationID, AppPrivateKey: appPrivateKey} return m.GetClientFor(ctx, githubConfigURL, auth, namespace, options...) 
} diff --git a/github/actions/multi_client_test.go b/github/actions/multi_client_test.go index 665df7ad..57589857 100644 --- a/github/actions/multi_client_test.go +++ b/github/actions/multi_client_test.go @@ -137,7 +137,7 @@ etFcaQuTHEZyRhhJ4BU= -----END PRIVATE KEY-----` auth := &GitHubAppAuth{ - AppID: 123, + AppID: "123", AppPrivateKey: key, } jwt, err := createJWTForGitHubApp(auth) diff --git a/github/fake/fake.go b/github/fake/fake.go index 411b2ece..abeb5642 100644 --- a/github/fake/fake.go +++ b/github/fake/fake.go @@ -127,7 +127,7 @@ func NewServer(opts ...Option) *httptest.Server { }, // For ListRunners - "/repos/test/valid/actions/runners": config.FixedResponses.ListRunners, + "/repos/test/valid/actions/runners": config.ListRunners, "/repos/test/invalid/actions/runners": &Handler{ Status: http.StatusNoContent, Body: "", @@ -204,10 +204,10 @@ func NewServer(opts ...Option) *httptest.Server { }, // For auto-scaling based on the number of queued(pending) workflow runs - "/repos/test/valid/actions/runs": config.FixedResponses.ListRepositoryWorkflowRuns, + "/repos/test/valid/actions/runs": config.ListRepositoryWorkflowRuns, // For auto-scaling based on the number of queued(pending) workflow jobs - "/repos/test/valid/actions/runs/": config.FixedResponses.ListWorkflowJobs, + "/repos/test/valid/actions/runs/": config.ListWorkflowJobs, } mux := http.NewServeMux() diff --git a/github/fake/options.go b/github/fake/options.go index 475c7560..33d1f2d6 100644 --- a/github/fake/options.go +++ b/github/fake/options.go @@ -12,7 +12,7 @@ type Option func(*ServerConfig) func WithListRepositoryWorkflowRunsResponse(status int, body, queued, in_progress string) Option { return func(c *ServerConfig) { - c.FixedResponses.ListRepositoryWorkflowRuns = &Handler{ + c.ListRepositoryWorkflowRuns = &Handler{ Status: status, Body: body, Statuses: map[string]string{ @@ -25,7 +25,7 @@ func WithListRepositoryWorkflowRunsResponse(status int, body, queued, in_progres func 
WithListWorkflowJobsResponse(status int, bodies map[int]string) Option { return func(c *ServerConfig) { - c.FixedResponses.ListWorkflowJobs = &MapHandler{ + c.ListWorkflowJobs = &MapHandler{ Status: status, Bodies: bodies, } @@ -34,7 +34,7 @@ func WithListWorkflowJobsResponse(status int, bodies map[int]string) Option { func WithListRunnersResponse(status int, body string) Option { return func(c *ServerConfig) { - c.FixedResponses.ListRunners = &ListRunnersHandler{ + c.ListRunners = &ListRunnersHandler{ Status: status, Body: body, } diff --git a/github/github.go b/github/github.go index 73c617fc..eaab748b 100644 --- a/github/github.go +++ b/github/github.go @@ -290,7 +290,7 @@ func (c *Client) ListRunnerGroupRepositoryAccesses(ctx context.Context, org stri opts := github.ListOptions{PerPage: 100} for { - list, res, err := c.Client.Actions.ListRepositoryAccessRunnerGroup(ctx, org, runnerGroupId, &opts) + list, res, err := c.Actions.ListRepositoryAccessRunnerGroup(ctx, org, runnerGroupId, &opts) if err != nil { return nil, fmt.Errorf("failed to list repository access for runner group: %w", err) } @@ -323,32 +323,32 @@ func (c *Client) cleanup() { func (c *Client) createRegistrationToken(ctx context.Context, enterprise, org, repo string) (*github.RegistrationToken, *github.Response, error) { if len(repo) > 0 { - return c.Client.Actions.CreateRegistrationToken(ctx, org, repo) + return c.Actions.CreateRegistrationToken(ctx, org, repo) } if len(org) > 0 { - return c.Client.Actions.CreateOrganizationRegistrationToken(ctx, org) + return c.Actions.CreateOrganizationRegistrationToken(ctx, org) } - return c.Client.Enterprise.CreateRegistrationToken(ctx, enterprise) + return c.Enterprise.CreateRegistrationToken(ctx, enterprise) } func (c *Client) removeRunner(ctx context.Context, enterprise, org, repo string, runnerID int64) (*github.Response, error) { if len(repo) > 0 { - return c.Client.Actions.RemoveRunner(ctx, org, repo, runnerID) + return c.Actions.RemoveRunner(ctx, org, 
repo, runnerID) } if len(org) > 0 { - return c.Client.Actions.RemoveOrganizationRunner(ctx, org, runnerID) + return c.Actions.RemoveOrganizationRunner(ctx, org, runnerID) } - return c.Client.Enterprise.RemoveRunner(ctx, enterprise, runnerID) + return c.Enterprise.RemoveRunner(ctx, enterprise, runnerID) } func (c *Client) listRunners(ctx context.Context, enterprise, org, repo string, opts *github.ListOptions) (*github.Runners, *github.Response, error) { if len(repo) > 0 { - return c.Client.Actions.ListRunners(ctx, org, repo, opts) + return c.Actions.ListRunners(ctx, org, repo, opts) } if len(org) > 0 { - return c.Client.Actions.ListOrganizationRunners(ctx, org, opts) + return c.Actions.ListOrganizationRunners(ctx, org, opts) } - return c.Client.Enterprise.ListRunners(ctx, enterprise, opts) + return c.Enterprise.ListRunners(ctx, enterprise, opts) } func (c *Client) ListRepositoryWorkflowRuns(ctx context.Context, user string, repoName string) ([]*github.WorkflowRun, error) { @@ -381,7 +381,7 @@ func (c *Client) listRepositoryWorkflowRuns(ctx context.Context, user string, re } for { - list, res, err := c.Client.Actions.ListRepositoryWorkflowRuns(ctx, user, repoName, &opts) + list, res, err := c.Actions.ListRepositoryWorkflowRuns(ctx, user, repoName, &opts) if err != nil { return workflowRuns, fmt.Errorf("failed to list workflow runs: %v", err) diff --git a/github/github_test.go b/github/github_test.go index a581b45e..21c8d626 100644 --- a/github/github_test.go +++ b/github/github_test.go @@ -26,7 +26,7 @@ func newTestClient() *Client { if err != nil { panic(err) } - client.Client.BaseURL = baseURL + client.BaseURL = baseURL return client } diff --git a/go.mod b/go.mod index 7ca02e5f..12f60832 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,7 @@ module github.com/actions/actions-runner-controller -go 1.24.0 +go 1.24.3 + require ( github.com/bradleyfalzon/ghinstallation/v2 v2.14.0 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc diff --git a/runner/Makefile 
b/runner/Makefile index f9388f89..6a0b7ac5 100644 --- a/runner/Makefile +++ b/runner/Makefile @@ -6,8 +6,8 @@ DIND_ROOTLESS_RUNNER_NAME ?= ${DOCKER_USER}/actions-runner-dind-rootless OS_IMAGE ?= ubuntu-22.04 TARGETPLATFORM ?= $(shell arch) -RUNNER_VERSION ?= 2.323.0 -RUNNER_CONTAINER_HOOKS_VERSION ?= 0.6.2 +RUNNER_VERSION ?= 2.325.0 +RUNNER_CONTAINER_HOOKS_VERSION ?= 0.7.0 DOCKER_VERSION ?= 24.0.7 # default list of platforms for which multiarch image is built diff --git a/runner/VERSION b/runner/VERSION index 9b74807c..63a843f1 100644 --- a/runner/VERSION +++ b/runner/VERSION @@ -1,2 +1,2 @@ -RUNNER_VERSION=2.323.0 -RUNNER_CONTAINER_HOOKS_VERSION=0.6.2 \ No newline at end of file +RUNNER_VERSION=2.325.0 +RUNNER_CONTAINER_HOOKS_VERSION=0.7.0 \ No newline at end of file diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 207cc84a..7b23d7c6 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -36,8 +36,8 @@ var ( testResultCMNamePrefix = "test-result-" - RunnerVersion = "2.323.0" - RunnerContainerHooksVersion = "0.6.2" + RunnerVersion = "2.325.0" + RunnerContainerHooksVersion = "0.7.0" ) // If you're willing to run this test via VS Code "run test" or "debug test", @@ -598,9 +598,9 @@ func initTestEnv(t *testing.T, k8sMinorVer string, vars vars) *env { } e.Kind = testing.StartKind(t, k8sMinorVer, testing.Preload(images...)) - e.Env.Kubeconfig = e.Kind.Kubeconfig() + e.Kubeconfig = e.Kind.Kubeconfig() } else { - e.Env.Kubeconfig = e.remoteKubeconfig + e.Kubeconfig = e.remoteKubeconfig // Kind automatically installs https://github.com/rancher/local-path-provisioner for PVs. 
// But assuming the remote cluster isn't a kind Kubernetes cluster, @@ -1106,7 +1106,7 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam testing.Step{ Uses: "actions/setup-go@v3", With: &testing.With{ - GoVersion: "1.24.0", + GoVersion: "1.24.3", }, }, ) @@ -1181,7 +1181,7 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam steps = append(steps, testing.Step{ Name: "Set up Docker Buildx", - Uses: "docker/setup-buildx-action@v1", + Uses: "docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2", With: setupBuildXActionWith, }, testing.Step{ @@ -1193,7 +1193,7 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam Run: "docker run --rm test1", }, testing.Step{ - Uses: "addnab/docker-run-action@v3", + Uses: "addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185", With: &testing.With{ Image: "test1", Run: "hello", @@ -1234,7 +1234,7 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam steps = append(steps, testing.Step{ - Uses: "azure/setup-kubectl@v1", + Uses: "azure/setup-kubectl@3e0aec4d80787158d308d7b364cb1b702e7feb7f", With: &testing.With{ Version: "v1.24.0", },