Enhance ArgoCD health checks for ARC resources
Improves and expands health check Lua scripts for Runner, EphemeralRunner, and AutoScalingRunnerSet custom resources, adding more detailed phase handling and status messages. Adds a custom health check for core/v1 Pod objects specific to ARC runner pods, providing granular container-level diagnostics and fallback to default checks for non-runner pods.
This commit is contained in:
parent
f6b8ccfbc6
commit
c73ef99ed4
|
|
@ -4,23 +4,33 @@ metadata:
|
|||
name: argocd-cm
|
||||
namespace: argocd
|
||||
data:
|
||||
# Health check for legacy Runner
|
||||
# Health check for actions.summerwind.dev/v1alpha1 Runner (Legacy API)
|
||||
resource.customizations.health.actions.summerwind.dev_Runner: |
|
||||
hs = {}
|
||||
if obj.status ~= nil then
|
||||
if obj.status then
|
||||
local phase = obj.status.phase
|
||||
if obj.status.ready and phase == "Running" then
|
||||
hs.status = "Healthy"
|
||||
hs.message = "Runner is ready and running"
|
||||
elseif phase == "Pending" or phase == "Created" then
|
||||
elseif phase == "Running" then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Runner is starting up"
|
||||
elseif phase == "Failed" then
|
||||
hs.message = "Runner is running but not ready"
|
||||
elseif phase == "Pending" or phase == "Created" or phase == "Initializing" then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Runner is starting up: " .. phase
|
||||
elseif phase == "Failed" or phase == "Error" then
|
||||
hs.status = "Degraded"
|
||||
hs.message = obj.status.message or "Runner has failed"
|
||||
elseif phase == "Terminating" then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Runner is terminating"
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Runner status: " .. (phase or "Unknown")
|
||||
hs.message = "Runner status: " .. (phase or "(nil)")
|
||||
end
|
||||
|
||||
if obj.status.podName then
|
||||
hs.message = hs.message .. " (Pod: " .. obj.status.podName .. ")"
|
||||
end
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
|
|
@ -28,25 +38,35 @@ data:
|
|||
end
|
||||
return hs
|
||||
|
||||
# Health check for EphemeralRunner
|
||||
# Health check for actions.github.com/v1alpha1 EphemeralRunner (New API)
|
||||
resource.customizations.health.actions.github.com_EphemeralRunner: |
|
||||
hs = {}
|
||||
if obj.status ~= nil then
|
||||
if obj.status.phase == "Running" then
|
||||
if obj.status then
|
||||
local phase = obj.status.phase
|
||||
if phase == "Running" or phase == "Succeeded" or phase == "Finished" then
|
||||
if obj.status.jobRequestId then
|
||||
hs.status = "Healthy"
|
||||
hs.message = "EphemeralRunner is running"
|
||||
elseif obj.status.phase == "Pending" then
|
||||
hs.message = "EphemeralRunner is running job: " .. tostring(obj.status.jobRequestId)
|
||||
else
|
||||
hs.status = "Healthy"
|
||||
hs.message = "EphemeralRunner is running and ready for jobs"
|
||||
end
|
||||
elseif phase == "Pending" or phase == "PodCreated" then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "EphemeralRunner is pending"
|
||||
elseif obj.status.phase == "Failed" then
|
||||
hs.message = "EphemeralRunner is starting up"
|
||||
elseif phase == "Failed" then
|
||||
hs.status = "Degraded"
|
||||
hs.message = obj.status.message or "EphemeralRunner has failed"
|
||||
elseif obj.status.phase == "Finished" then
|
||||
hs.status = "Healthy"
|
||||
hs.message = "EphemeralRunner has finished"
|
||||
elseif phase == "Deleting" or phase == "Terminating" then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "EphemeralRunner is cleaning up"
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "EphemeralRunner status: " .. (obj.status.phase or "Unknown")
|
||||
hs.message = "EphemeralRunner status: " .. (phase or "(nil)")
|
||||
end
|
||||
|
||||
if obj.status.runnerId then
|
||||
hs.message = hs.message .. " (ID: " .. tostring(obj.status.runnerId) .. ")"
|
||||
end
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
|
|
@ -57,23 +77,113 @@ data:
|
|||
# Health check for actions.github.com/v1alpha1 AutoScalingRunnerSet
|
||||
resource.customizations.health.actions.github.com_AutoScalingRunnerSet: |
|
||||
hs = {}
|
||||
if obj.status ~= nil then
|
||||
local desired = obj.status.desiredReplicas or 0
|
||||
if obj.status then
|
||||
local desired = obj.status.desiredReplicas or obj.status.replicas or 0
|
||||
local ready = obj.status.readyReplicas or 0
|
||||
local current = obj.status.currentReplicas or 0
|
||||
local pending = obj.status.pendingEphemeralRunners or 0
|
||||
local terminating = obj.status.terminatingEphemeralRunners or 0
|
||||
local running = obj.status.runningEphemeralRunners or 0
|
||||
|
||||
if desired > 0 and ready == desired then
|
||||
if desired == 0 and current == 0 then
|
||||
hs.status = "Healthy"
|
||||
hs.message = string.format("Ready runners: %d/%d", ready, desired)
|
||||
elseif desired > 0 then
|
||||
hs.message = "AutoScaler scaled down to zero"
|
||||
elseif desired > 0 and ready == desired and pending == 0 and terminating == 0 then
|
||||
hs.status = "Healthy"
|
||||
hs.message = string.format("All runners ready: %d/%d (Running: %d)", ready, desired, running)
|
||||
elseif terminating > 0 then
|
||||
hs.status = "Progressing"
|
||||
hs.message = string.format("Runners: %d/%d ready, %d current", ready, desired, current)
|
||||
hs.message = string.format("Scaling down: %d terminating, %d/%d ready", terminating, ready, desired)
|
||||
elseif pending > 0 or ready < desired then
|
||||
hs.status = "Progressing"
|
||||
hs.message = string.format("Scaling up: %d/%d ready, %d pending", ready, desired, pending)
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "No desired replicas set"
|
||||
hs.message = string.format("Runners: %d current, %d desired", current, desired)
|
||||
end
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Waiting for AutoScalingRunnerSet status"
|
||||
end
|
||||
return hs
|
||||
|
||||
# Health check for core/v1 Pod - specifically for Runner pods
|
||||
resource.customizations.health.core_Pod: |
|
||||
hs = {}
|
||||
|
||||
-- Detect if this is an ARC runner pod by label
|
||||
local labels = (obj.metadata or {}).labels or {}
|
||||
local isRunnerPod =
|
||||
labels["actions.github.com/runner-pod"] or
|
||||
labels["runner-deployment-name"] or
|
||||
labels["actions-runner-controller/inject-registration-token"] or
|
||||
labels["app.kubernetes.io/component"] == "runner" or
|
||||
labels["app.kubernetes.io/part-of"] == "actions-runner-controller"
|
||||
|
||||
if not isRunnerPod then
|
||||
return nil -- let ArgoCD fallback to default Pod checker
|
||||
end
|
||||
|
||||
if obj.status then
|
||||
local phase = obj.status.phase
|
||||
local containerStatuses = obj.status.containerStatuses or {}
|
||||
local allReady, anyRunning, failedCnt = true, false, 0
|
||||
local msgs = {}
|
||||
|
||||
for _, cs in ipairs(containerStatuses) do
|
||||
if not cs.ready then allReady = false end
|
||||
if cs.state and cs.state.running then anyRunning = true end
|
||||
|
||||
if cs.state and cs.state.terminated and cs.state.terminated.exitCode ~= 0 then
|
||||
failedCnt = failedCnt + 1
|
||||
table.insert(msgs,
|
||||
string.format("Container %s exited %d (%s)",
|
||||
cs.name,
|
||||
cs.state.terminated.exitCode,
|
||||
cs.state.terminated.reason or ""))
|
||||
end
|
||||
|
||||
if cs.state and cs.state.waiting then
|
||||
local reason = cs.state.waiting.reason or ""
|
||||
if reason == "CrashLoopBackOff" or reason == "ErrImagePull" or reason == "ImagePullBackOff" then
|
||||
failedCnt = failedCnt + 1
|
||||
table.insert(msgs,
|
||||
string.format("Container %s waiting: %s", cs.name, reason))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if phase == "Running" and allReady and anyRunning then
|
||||
hs.status = "Healthy"
|
||||
hs.message = "Runner pod is running and all containers are ready"
|
||||
elseif phase == "Succeeded" then
|
||||
hs.status = "Healthy"
|
||||
hs.message = "Runner pod completed successfully"
|
||||
elseif phase == "Failed" or failedCnt > 0 then
|
||||
hs.status = "Degraded"
|
||||
local concatenatedMsgs = (#msgs > 0) and table.concat(msgs, "; ") or nil
|
||||
local fallbackMsg = obj.status.message or "Pod has failed"
|
||||
hs.message = concatenatedMsgs or fallbackMsg
|
||||
elseif phase == "Pending" then
|
||||
local msg = "Pod is pending"
|
||||
for _, cond in ipairs(obj.status.conditions or {}) do
|
||||
if cond.type == "PodScheduled" and cond.status == "False" then
|
||||
msg = "Pod unschedulable: " .. (cond.reason or "")
|
||||
break
|
||||
end
|
||||
end
|
||||
hs.status = "Progressing"
|
||||
hs.message = msg
|
||||
elseif phase == "Running" and not allReady then
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Pod is running but containers not ready"
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Pod status: " .. (phase or "(nil)")
|
||||
end
|
||||
else
|
||||
hs.status = "Progressing"
|
||||
hs.message = "Waiting for pod status"
|
||||
end
|
||||
|
||||
return hs
|
||||
|
|
|
|||
Loading…
Reference in New Issue