Stricter filtering of check run completion events (#2520)
I observed that 100% of canceled jobs in my runner pool were not causing scale down events. This PR fixes that. The problem was caused by #2119. #2119 ignores certain webhook events in order to fix #2118. However, #2119 overdoes it and filters out valid job cancellation events. This PR uses stricter filtering and add visibility for future troubleshooting. <details><summary>Example cancellation event</summary> This is the redacted top portion of a valid cancellation event my runner pool received and ignored. ```json { "action": "completed", "workflow_job": { "id": 12848997134, "run_id": 4738060033, "workflow_name": "slack-notifier", "head_branch": "auto-update/slack-notifier-0.5.1", "run_url": "https://api.github.com/repos/nuru/<redacted>/actions/runs/4738060033", "run_attempt": 1, "node_id": "CR_kwDOB8Xtbc8AAAAC_dwjDg", "head_sha": "55bada8f3d0d3e12a510a1bf34d0c3e169b65f89", "url": "https://api.github.com/repos/nuru/<redacted>/actions/jobs/12848997134", "html_url": "https://github.com/nuru/<redacted>/actions/runs/4738060033/jobs/8411515430", "status": "completed", "conclusion": "cancelled", "created_at": "2023-04-19T00:03:12Z", "started_at": "2023-04-19T00:03:42Z", "completed_at": "2023-04-19T00:03:42Z", "name": "build (arm64)", "steps": [ ], "check_run_url": "https://api.github.com/repos/nuru/<redacted>/check-runs/12848997134", "labels": [ "self-hosted", "arm64" ], "runner_id": 0, "runner_name": "", "runner_group_id": 0, "runner_group_name": "" }, ``` </details>
This commit is contained in:
parent
94c089c407
commit
9bd4025e9c
|
|
@ -210,13 +210,23 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
|
|||
if e.GetAction() == "queued" {
|
||||
target.Amount = 1
|
||||
break
|
||||
} else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" && e.GetWorkflowJob().GetRunnerID() > 0 {
|
||||
// A negative amount is processed in the tryScale func as a scale-down request,
|
||||
// that erases the oldest CapacityReservation with the same amount.
|
||||
// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
|
||||
// so that the resulting desired replicas decreases by 1.
|
||||
target.Amount = -1
|
||||
break
|
||||
} else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" {
|
||||
// We want to filter out "completed" events sent by check runs.
|
||||
// See https://github.com/actions/actions-runner-controller/issues/2118
|
||||
// and https://github.com/actions/actions-runner-controller/pull/2119
|
||||
// But canceled events have runner_id == 0 and GetRunnerID() returns 0 when RunnerID == nil,
|
||||
// so we need to be more specific in filtering out the check runs.
|
||||
// See example check run completion at https://gist.github.com/nathanklick/268fea6496a4d7b14cecb2999747ef84
|
||||
if e.GetWorkflowJob().GetConclusion() == "success" && e.GetWorkflowJob().RunnerID == nil {
|
||||
log.V(1).Info("Ignoring workflow_job event because it does not relate to a self-hosted runner")
|
||||
} else {
|
||||
// A negative amount is processed in the tryScale func as a scale-down request,
|
||||
// that erases the oldest CapacityReservation with the same amount.
|
||||
// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
|
||||
// so that the resulting desired replicas decreases by 1.
|
||||
target.Amount = -1
|
||||
break
|
||||
}
|
||||
}
|
||||
// If the conclusion is "skipped", we will ignore it and fallthrough to the default case.
|
||||
fallthrough
|
||||
|
|
|
|||
Loading…
Reference in New Issue