Stricter filtering of check run completion events (#2520)
I observed that 100% of canceled jobs in my runner pool were not causing scale down events. This PR fixes that. The problem was caused by #2119. #2119 ignores certain webhook events in order to fix #2118. However, #2119 overdoes it and filters out valid job cancellation events. This PR uses stricter filtering and add visibility for future troubleshooting. <details><summary>Example cancellation event</summary> This is the redacted top portion of a valid cancellation event my runner pool received and ignored. ```json { "action": "completed", "workflow_job": { "id": 12848997134, "run_id": 4738060033, "workflow_name": "slack-notifier", "head_branch": "auto-update/slack-notifier-0.5.1", "run_url": "https://api.github.com/repos/nuru/<redacted>/actions/runs/4738060033", "run_attempt": 1, "node_id": "CR_kwDOB8Xtbc8AAAAC_dwjDg", "head_sha": "55bada8f3d0d3e12a510a1bf34d0c3e169b65f89", "url": "https://api.github.com/repos/nuru/<redacted>/actions/jobs/12848997134", "html_url": "https://github.com/nuru/<redacted>/actions/runs/4738060033/jobs/8411515430", "status": "completed", "conclusion": "cancelled", "created_at": "2023-04-19T00:03:12Z", "started_at": "2023-04-19T00:03:42Z", "completed_at": "2023-04-19T00:03:42Z", "name": "build (arm64)", "steps": [ ], "check_run_url": "https://api.github.com/repos/nuru/<redacted>/check-runs/12848997134", "labels": [ "self-hosted", "arm64" ], "runner_id": 0, "runner_name": "", "runner_group_id": 0, "runner_group_name": "" }, ``` </details>
This commit is contained in:
		
							parent
							
								
									94c089c407
								
							
						
					
					
						commit
						9bd4025e9c
					
				|  | @ -210,7 +210,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons | ||||||
| 			if e.GetAction() == "queued" { | 			if e.GetAction() == "queued" { | ||||||
| 				target.Amount = 1 | 				target.Amount = 1 | ||||||
| 				break | 				break | ||||||
| 			} else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" && e.GetWorkflowJob().GetRunnerID() > 0 { | 			} else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" { | ||||||
|  | 				// We want to filter out "completed" events sent by check runs.
 | ||||||
|  | 				// See https://github.com/actions/actions-runner-controller/issues/2118
 | ||||||
|  | 				// and https://github.com/actions/actions-runner-controller/pull/2119
 | ||||||
|  | 				// But canceled events have runner_id == 0 and GetRunnerID() returns 0 when RunnerID == nil,
 | ||||||
|  | 				// so we need to be more specific in filtering out the check runs.
 | ||||||
|  | 				// See example check run completion at https://gist.github.com/nathanklick/268fea6496a4d7b14cecb2999747ef84
 | ||||||
|  | 				if e.GetWorkflowJob().GetConclusion() == "success" && e.GetWorkflowJob().RunnerID == nil { | ||||||
|  | 					log.V(1).Info("Ignoring workflow_job event because it does not relate to a self-hosted runner") | ||||||
|  | 				} else { | ||||||
| 					// A negative amount is processed in the tryScale func as a scale-down request,
 | 					// A negative amount is processed in the tryScale func as a scale-down request,
 | ||||||
| 					// that erases the oldest CapacityReservation with the same amount.
 | 					// that erases the oldest CapacityReservation with the same amount.
 | ||||||
| 					// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
 | 					// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
 | ||||||
|  | @ -218,6 +227,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons | ||||||
| 					target.Amount = -1 | 					target.Amount = -1 | ||||||
| 					break | 					break | ||||||
| 				} | 				} | ||||||
|  | 			} | ||||||
| 			// If the conclusion is "skipped", we will ignore it and fallthrough to the default case.
 | 			// If the conclusion is "skipped", we will ignore it and fallthrough to the default case.
 | ||||||
| 			fallthrough | 			fallthrough | ||||||
| 		default: | 		default: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue