From 9bd4025e9c8e933eb64dc310658f1553dac053ee Mon Sep 17 00:00:00 2001 From: Nuru Date: Wed, 26 Apr 2023 21:15:23 -0700 Subject: [PATCH] Stricter filtering of check run completion events (#2520) I observed that 100% of canceled jobs in my runner pool were not causing scale down events. This PR fixes that. The problem was caused by #2119. #2119 ignores certain webhook events in order to fix #2118. However, #2119 overdoes it and filters out valid job cancellation events. This PR uses stricter filtering and add visibility for future troubleshooting.
Example cancellation event This is the redacted top portion of a valid cancellation event my runner pool received and ignored. ```json { "action": "completed", "workflow_job": { "id": 12848997134, "run_id": 4738060033, "workflow_name": "slack-notifier", "head_branch": "auto-update/slack-notifier-0.5.1", "run_url": "https://api.github.com/repos/nuru//actions/runs/4738060033", "run_attempt": 1, "node_id": "CR_kwDOB8Xtbc8AAAAC_dwjDg", "head_sha": "55bada8f3d0d3e12a510a1bf34d0c3e169b65f89", "url": "https://api.github.com/repos/nuru//actions/jobs/12848997134", "html_url": "https://github.com/nuru//actions/runs/4738060033/jobs/8411515430", "status": "completed", "conclusion": "cancelled", "created_at": "2023-04-19T00:03:12Z", "started_at": "2023-04-19T00:03:42Z", "completed_at": "2023-04-19T00:03:42Z", "name": "build (arm64)", "steps": [ ], "check_run_url": "https://api.github.com/repos/nuru//check-runs/12848997134", "labels": [ "self-hosted", "arm64" ], "runner_id": 0, "runner_name": "", "runner_group_id": 0, "runner_group_name": "" }, ```
--- .../horizontal_runner_autoscaler_webhook.go | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go index 59f70469..a377fb9f 100644 --- a/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go +++ b/controllers/actions.summerwind.net/horizontal_runner_autoscaler_webhook.go @@ -210,13 +210,23 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons if e.GetAction() == "queued" { target.Amount = 1 break - } else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" && e.GetWorkflowJob().GetRunnerID() > 0 { - // A negative amount is processed in the tryScale func as a scale-down request, - // that erases the oldest CapacityReservation with the same amount. - // If the first CapacityReservation was with Replicas=1, this negative scale target erases that, - // so that the resulting desired replicas decreases by 1. - target.Amount = -1 - break + } else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" { + // We want to filter out "completed" events sent by check runs. + // See https://github.com/actions/actions-runner-controller/issues/2118 + // and https://github.com/actions/actions-runner-controller/pull/2119 + // But canceled events have runner_id == 0 and GetRunnerID() returns 0 when RunnerID == nil, + // so we need to be more specific in filtering out the check runs. + // See example check run completion at https://gist.github.com/nathanklick/268fea6496a4d7b14cecb2999747ef84 + if e.GetWorkflowJob().GetConclusion() == "success" && e.GetWorkflowJob().RunnerID == nil { + log.V(1).Info("Ignoring workflow_job event because it does not relate to a self-hosted runner") + } else { + // A negative amount is processed in the tryScale func as a scale-down request, + // that erases the oldest CapacityReservation with the same amount. + // If the first CapacityReservation was with Replicas=1, this negative scale target erases that, + // so that the resulting desired replicas decreases by 1. + target.Amount = -1 + break + } } // If the conclusion is "skipped", we will ignore it and fallthrough to the default case. fallthrough