Enhance ArgoCD health checks for ARC resources
Improves and expands the health check Lua scripts for the Runner, EphemeralRunner, and AutoScalingRunnerSet custom resources, adding more detailed phase handling and status messages. Adds a custom health check for core/v1 Pod objects that applies only to ARC runner pods, providing granular container-level diagnostics and falling back to the default checks for non-runner pods.
This commit is contained in:
		
							parent
							
								
									f6b8ccfbc6
								
							
						
					
					
						commit
						c73ef99ed4
					
				|  | @ -4,23 +4,33 @@ metadata: | ||||||
|   name: argocd-cm |   name: argocd-cm | ||||||
|   namespace: argocd |   namespace: argocd | ||||||
| data: | data: | ||||||
|   # Health check for legacy Runner |   # Health check for actions.summerwind.dev/v1alpha1 Runner (Legacy API) | ||||||
|   resource.customizations.health.actions.summerwind.dev_Runner: | |   resource.customizations.health.actions.summerwind.dev_Runner: | | ||||||
|     hs = {} |     hs = {} | ||||||
|     if obj.status ~= nil then |     if obj.status then | ||||||
|       local phase = obj.status.phase |       local phase = obj.status.phase | ||||||
|       if obj.status.ready and phase == "Running" then |       if obj.status.ready and phase == "Running" then | ||||||
|         hs.status = "Healthy" |         hs.status = "Healthy" | ||||||
|         hs.message = "Runner is ready and running" |         hs.message = "Runner is ready and running" | ||||||
|       elseif phase == "Pending" or phase == "Created" then |       elseif phase == "Running" then | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = "Runner is starting up" |         hs.message = "Runner is running but not ready" | ||||||
|       elseif phase == "Failed" then |       elseif phase == "Pending" or phase == "Created" or phase == "Initializing" then | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = "Runner is starting up: " .. phase | ||||||
|  |       elseif phase == "Failed" or phase == "Error" then | ||||||
|         hs.status = "Degraded" |         hs.status = "Degraded" | ||||||
|         hs.message = obj.status.message or "Runner has failed" |         hs.message = obj.status.message or "Runner has failed" | ||||||
|  |       elseif phase == "Terminating" then | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = "Runner is terminating" | ||||||
|       else |       else | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = "Runner status: " .. (phase or "Unknown") |         hs.message = "Runner status: " .. (phase or "(nil)") | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       if obj.status.podName then | ||||||
|  |         hs.message = hs.message .. " (Pod: " .. obj.status.podName .. ")" | ||||||
|       end |       end | ||||||
|     else |     else | ||||||
|       hs.status = "Progressing" |       hs.status = "Progressing" | ||||||
|  | @ -28,25 +38,35 @@ data: | ||||||
|     end |     end | ||||||
|     return hs |     return hs | ||||||
| 
 | 
 | ||||||
|   # Health check for EphemeralRunner |   # Health check for actions.github.com/v1alpha1 EphemeralRunner (New API) | ||||||
|   resource.customizations.health.actions.github.com_EphemeralRunner: | |   resource.customizations.health.actions.github.com_EphemeralRunner: | | ||||||
|     hs = {} |     hs = {} | ||||||
|     if obj.status ~= nil then |     if obj.status then | ||||||
|       if obj.status.phase == "Running" then |       local phase = obj.status.phase | ||||||
|  |       if phase == "Running" or phase == "Succeeded" or phase == "Finished" then | ||||||
|  |         if obj.status.jobRequestId then | ||||||
|           hs.status = "Healthy" |           hs.status = "Healthy" | ||||||
|         hs.message = "EphemeralRunner is running" |           hs.message = "EphemeralRunner is running job: " .. tostring(obj.status.jobRequestId) | ||||||
|       elseif obj.status.phase == "Pending" then |         else | ||||||
|  |           hs.status = "Healthy" | ||||||
|  |           hs.message = "EphemeralRunner is running and ready for jobs" | ||||||
|  |         end | ||||||
|  |       elseif phase == "Pending" or phase == "PodCreated" then | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = "EphemeralRunner is pending" |         hs.message = "EphemeralRunner is starting up" | ||||||
|       elseif obj.status.phase == "Failed" then |       elseif phase == "Failed" then | ||||||
|         hs.status = "Degraded" |         hs.status = "Degraded" | ||||||
|         hs.message = obj.status.message or "EphemeralRunner has failed" |         hs.message = obj.status.message or "EphemeralRunner has failed" | ||||||
|       elseif obj.status.phase == "Finished" then |       elseif phase == "Deleting" or phase == "Terminating" then | ||||||
|         hs.status = "Healthy" |         hs.status = "Progressing" | ||||||
|         hs.message = "EphemeralRunner has finished" |         hs.message = "EphemeralRunner is cleaning up" | ||||||
|       else |       else | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = "EphemeralRunner status: " .. (obj.status.phase or "Unknown") |         hs.message = "EphemeralRunner status: " .. (phase or "(nil)") | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       if obj.status.runnerId then | ||||||
|  |         hs.message = hs.message .. " (ID: " .. tostring(obj.status.runnerId) .. ")" | ||||||
|       end |       end | ||||||
|     else |     else | ||||||
|       hs.status = "Progressing" |       hs.status = "Progressing" | ||||||
|  | @ -57,23 +77,113 @@ data: | ||||||
|   # Health check for actions.github.com/v1alpha1 AutoScalingRunnerSet |   # Health check for actions.github.com/v1alpha1 AutoScalingRunnerSet | ||||||
|   resource.customizations.health.actions.github.com_AutoScalingRunnerSet: | |   resource.customizations.health.actions.github.com_AutoScalingRunnerSet: | | ||||||
|     hs = {} |     hs = {} | ||||||
|     if obj.status ~= nil then |     if obj.status then | ||||||
|       local desired = obj.status.desiredReplicas or 0 |       local desired = obj.status.desiredReplicas or obj.status.replicas or 0 | ||||||
|       local ready = obj.status.readyReplicas or 0 |       local ready = obj.status.readyReplicas or 0 | ||||||
|       local current = obj.status.currentReplicas or 0 |       local current = obj.status.currentReplicas or 0 | ||||||
|  |       local pending = obj.status.pendingEphemeralRunners or 0 | ||||||
|  |       local terminating = obj.status.terminatingEphemeralRunners or 0 | ||||||
|  |       local running = obj.status.runningEphemeralRunners or 0 | ||||||
| 
 | 
 | ||||||
|       if desired > 0 and ready == desired then |       if desired == 0 and current == 0 then | ||||||
|         hs.status = "Healthy" |         hs.status = "Healthy" | ||||||
|         hs.message = string.format("Ready runners: %d/%d", ready, desired) |         hs.message = "AutoScaler scaled down to zero" | ||||||
|       elseif desired > 0 then |       elseif desired > 0 and ready == desired and pending == 0 and terminating == 0 then | ||||||
|  |         hs.status = "Healthy" | ||||||
|  |         hs.message = string.format("All runners ready: %d/%d (Running: %d)", ready, desired, running) | ||||||
|  |       elseif terminating > 0 then | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = string.format("Runners: %d/%d ready, %d current", ready, desired, current) |         hs.message = string.format("Scaling down: %d terminating, %d/%d ready", terminating, ready, desired) | ||||||
|  |       elseif pending > 0 or ready < desired then | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = string.format("Scaling up: %d/%d ready, %d pending", ready, desired, pending) | ||||||
|       else |       else | ||||||
|         hs.status = "Progressing" |         hs.status = "Progressing" | ||||||
|         hs.message = "No desired replicas set" |         hs.message = string.format("Runners: %d current, %d desired", current, desired) | ||||||
|       end |       end | ||||||
|     else |     else | ||||||
|       hs.status = "Progressing" |       hs.status = "Progressing" | ||||||
|       hs.message = "Waiting for AutoScalingRunnerSet status" |       hs.message = "Waiting for AutoScalingRunnerSet status" | ||||||
|     end |     end | ||||||
|     return hs |     return hs | ||||||
|  | 
 | ||||||
|  |   # Health check for core/v1 Pod - specifically for Runner pods | ||||||
|  |   resource.customizations.health.core_Pod: | | ||||||
|  |     hs = {} | ||||||
|  | 
 | ||||||
|  |     -- Detect if this is an ARC runner pod by label | ||||||
|  |     local labels = (obj.metadata or {}).labels or {} | ||||||
|  |     local isRunnerPod = | ||||||
|  |         labels["actions.github.com/runner-pod"] or | ||||||
|  |         labels["runner-deployment-name"] or | ||||||
|  |         labels["actions-runner-controller/inject-registration-token"] or | ||||||
|  |         labels["app.kubernetes.io/component"] == "runner" or | ||||||
|  |         labels["app.kubernetes.io/part-of"] == "actions-runner-controller" | ||||||
|  | 
 | ||||||
|  |     if not isRunnerPod then | ||||||
|  |       return nil -- let ArgoCD fallback to default Pod checker | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     if obj.status then | ||||||
|  |       local phase = obj.status.phase | ||||||
|  |       local containerStatuses = obj.status.containerStatuses or {} | ||||||
|  |       local allReady, anyRunning, failedCnt = true, false, 0 | ||||||
|  |       local msgs = {} | ||||||
|  | 
 | ||||||
|  |       for _, cs in ipairs(containerStatuses) do | ||||||
|  |         if not cs.ready then allReady = false end | ||||||
|  |         if cs.state and cs.state.running then anyRunning = true end | ||||||
|  | 
 | ||||||
|  |         if cs.state and cs.state.terminated and cs.state.terminated.exitCode ~= 0 then | ||||||
|  |           failedCnt = failedCnt + 1 | ||||||
|  |           table.insert(msgs, | ||||||
|  |             string.format("Container %s exited %d (%s)", | ||||||
|  |               cs.name, | ||||||
|  |               cs.state.terminated.exitCode, | ||||||
|  |               cs.state.terminated.reason or "")) | ||||||
|  |         end | ||||||
|  | 
 | ||||||
|  |         if cs.state and cs.state.waiting then | ||||||
|  |           local reason = cs.state.waiting.reason or "" | ||||||
|  |           if reason == "CrashLoopBackOff" or reason == "ErrImagePull" or reason == "ImagePullBackOff" then | ||||||
|  |             failedCnt = failedCnt + 1 | ||||||
|  |             table.insert(msgs, | ||||||
|  |               string.format("Container %s waiting: %s", cs.name, reason)) | ||||||
|  |           end | ||||||
|  |         end | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       if phase == "Running" and allReady and anyRunning then | ||||||
|  |         hs.status = "Healthy" | ||||||
|  |         hs.message = "Runner pod is running and all containers are ready" | ||||||
|  |       elseif phase == "Succeeded" then | ||||||
|  |         hs.status = "Healthy" | ||||||
|  |         hs.message = "Runner pod completed successfully" | ||||||
|  |       elseif phase == "Failed" or failedCnt > 0 then | ||||||
|  |         hs.status = "Degraded" | ||||||
|  |         local concatenatedMsgs = (#msgs > 0) and table.concat(msgs, "; ") or nil | ||||||
|  |         local fallbackMsg = obj.status.message or "Pod has failed" | ||||||
|  |         hs.message = concatenatedMsgs or fallbackMsg | ||||||
|  |       elseif phase == "Pending" then | ||||||
|  |         local msg = "Pod is pending" | ||||||
|  |         for _, cond in ipairs(obj.status.conditions or {}) do | ||||||
|  |           if cond.type == "PodScheduled" and cond.status == "False" then | ||||||
|  |             msg = "Pod unschedulable: " .. (cond.reason or "") | ||||||
|  |             break | ||||||
|  |           end | ||||||
|  |         end | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = msg | ||||||
|  |       elseif phase == "Running" and not allReady then | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = "Pod is running but containers not ready" | ||||||
|  |       else | ||||||
|  |         hs.status = "Progressing" | ||||||
|  |         hs.message = "Pod status: " .. (phase or "(nil)") | ||||||
|  |       end | ||||||
|  |     else | ||||||
|  |       hs.status = "Progressing" | ||||||
|  |       hs.message = "Waiting for pod status" | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     return hs | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue