{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__elements": {}, "__requires": [ { "type": "panel", "id": "bargauge", "name": "Bar gauge", "version": "" }, { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "11.5.2" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "stat", "name": "Stat", "version": "" }, { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "links": [], "liveNow": true, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 15, "panels": [], "title": "Runner Performance", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Heat map showing the typical time before a job starts and whether the number of jobs in that time bucket are increasing or decreasing.", "fieldConfig": { "defaults": { "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "scaleDistribution": { "type": "linear" } } }, "overrides": [] }, "gridPos": { "h": 7, "w": 7, "x": 0, "y": 1 }, "id": 7, "options": { "calculate": false, "cellGap": 1, "color": { "exponent": 0.5, "fill": "dark-orange", "mode": "scheme", "reverse": false, "scale": "exponential", "scheme": "Turbo", "steps": 64 }, "exemplars": { "color": "rgba(255,0,255,0.7)" }, "filterValues": { "le": 1e-9 }, "legend": { "show": true }, "rowsFrame": { "layout": 
"auto" }, "tooltip": { "mode": "single", "showColorScale": false, "yHistogram": false }, "yAxis": { "axisLabel": "Wait Time", "axisPlacement": "left", "reverse": false, "unit": "s" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum by(le) (increase(gha_job_startup_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", "format": "heatmap", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "interval": "", "legendFormat": "{{le}}", "range": true, "refId": "A", "useBackend": false } ], "title": "Startup Duration", "type": "heatmap" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Heat map showing the typical time to complete a job and whether the number of jobs in that time bucket are increasing or decreasing.", "fieldConfig": { "defaults": { "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "scaleDistribution": { "type": "linear" } } }, "overrides": [] }, "gridPos": { "h": 7, "w": 8, "x": 7, "y": 1 }, "id": 6, "options": { "calculate": false, "cellGap": 1, "color": { "exponent": 0.5, "fill": "dark-orange", "mode": "scheme", "reverse": false, "scale": "exponential", "scheme": "Spectral", "steps": 64 }, "exemplars": { "color": "rgba(255,0,255,0.7)" }, "filterValues": { "le": 1e-9 }, "legend": { "show": true }, "rowsFrame": { "layout": "auto" }, "tooltip": { "mode": "single", "showColorScale": false, "yHistogram": false }, "yAxis": { "axisLabel": "Time", "axisPlacement": "left", "reverse": false, "unit": "s" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum by(le) 
(increase(gha_job_execution_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", "format": "heatmap", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, "legendFormat": "{{le}}", "range": true, "refId": "A", "useBackend": false } ], "title": "Job Execution", "type": "heatmap" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of jobs assigned to the scale set. The threshold is triggered when the number of assigned jobs exceeds the number of desired runners. This indicates that not all jobs will have an available runner.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "fieldMinMax": false, "mappings": [], "min": 0, "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 0, "y": 8 }, "id": 9, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(gha_assigned_jobs{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "DesiredRunners" } 
], "title": "Assigned Jobs", "transformations": [ { "id": "configFromData", "options": { "configRefId": "DesiredRunners", "mappings": [ { "fieldName": "Time", "handlerKey": "__ignore" }, { "fieldName": "sum(gha_desired_runners{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\", actions_github_com_scale_set_name=~\"(arc-runner-set|dind-runner-set|k8s-runner-set)\"}) + 1", "handlerKey": "threshold1" } ] } } ], "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Number of runners desired by the scale set", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "fieldMinMax": false, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 3, "y": 8 }, "id": 4, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false } ], "title": "Desired Runners", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Number of registered runners that do not have assigned jobs.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "fieldMinMax": false, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", 
"value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 6, "y": 8 }, "id": 2, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(gha_idle_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false } ], "title": "Idle Runners", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of workflow jobs currently executing", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 9, "y": 8 }, "id": 10, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum (gha_running_jobs{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"})", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Running Jobs", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of runners in a failed state. 
These runners are typically misconfigured and count against the scale set's maximum limit.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 8 }, "id": 26, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(gha_controller_failed_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Failed Runners", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of active scale set listeners", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "fieldMinMax": true, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 0, "y": 13 }, "id": 5, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(gha_controller_running_listeners{namespace=~\"$SystemNamespace\"})", "hide": false, "instant": false, "legendFormat": "__auto", "range": 
true, "refId": "A" } ], "title": "Listeners", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Number of runner pods that are waiting to be created. When this number exceeds the number of pods Kubernetes reports as Waiting, it indicate cluster performance issues.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "fieldMinMax": false, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 3, "y": 13 }, "id": 3, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(gha_controller_pending_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "Waiting" } ], "title": "Pending Runners", "transformations": [ { "id": "configFromData", "options": { "configRefId": "Waiting", "mappings": [ { "fieldName": "Time", "handlerKey": "__ignore" }, { "fieldName": "sum(kube_pod_container_status_waiting{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\"}) != 0 or vector(0)", "handlerKey": "threshold1" } ] } } ], "type": "gauge" }, { "datasource": { "type": 
"prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of runners registered for processing queued jobs", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 6, "y": 13 }, "id": 8, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(gha_registered_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Registered Runners", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Number of runner pods in a running state", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "fieldMinMax": false, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 9, "y": 13 }, "id": 1, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": true, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "max(gha_controller_running_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", 
"fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false } ], "title": "Active Runners", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of containers that are reporting that they were terminated by an out-of-memory condition (OOMK.iller)", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "noValue": "No issues detected", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "semi-dark-red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 13 }, "id": 23, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "sum(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$RunnerNamespace\"}) by (namespace)", "legendFormat": "__auto", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Out of Memory", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The peak memory used by a container in a given scale set's namespace.", "fieldConfig": { "defaults": { "color": { "fixedColor": "semi-dark-green", "mode": "shades" }, "fieldMinMax": false, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 5, "x": 0, "y": 18 }, "id": 12, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ 
"lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": false, "sizing": "auto" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "max(container_memory_working_set_bytes{namespace=~\"$RunnerNamespace\"}) by (namespace)", "format": "time_series", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Peak Container Memory", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The sum of the reads and writes occurring within the runner namespace.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "Bytes", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 54, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "log": 2, "type": "log" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 6, "x": 5, "y": 18 }, "id": 13, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(rate(container_fs_writes_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", "instant": 
false, "legendFormat": "Write", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(rate(container_fs_reads_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", "hide": false, "instant": false, "legendFormat": "Read", "range": true, "refId": "B" } ], "title": "Container I/O", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The Kubernetes-reported pod status.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "min": 0, "noValue": "No active pods", "thresholds": { "mode": "absolute", "steps": [ { "color": "yellow", "value": null }, { "color": "green", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 4, "x": 11, "y": 18 }, "id": 11, "options": { "displayMode": "lcd", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "maxVizHeight": 300, "minVizHeight": 10, "minVizWidth": 0, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum(kube_pod_container_status_ready{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", "format": "time_series", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "interval": "", "legendFormat": "Ready", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", "hide": false, "instant": false, "legendFormat": "Waiting", "range": true, "refId": 
"C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Completed\"}) != 0 or vector(0)", "hide": false, "instant": false, "interval": "", "legendFormat": "Completed", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Error\"}) != 0 or vector(0)", "hide": false, "instant": false, "legendFormat": "Error", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(gha_desired_runners)+1", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "DesiredRunners", "useBackend": false } ], "title": "Container Pod Status", "transformations": [ { "id": "configFromData", "options": { "applyTo": { "id": "byName", "options": "Ready" }, "configRefId": "DesiredRunners", "mappings": [ { "fieldName": "Time", "handlerKey": "__ignore" }, { "fieldName": "sum(gha_desired_runners) + 1", "handlerKey": "threshold1" }, { "fieldName": "sum(gha_desired_runners) -5", "handlerKey": "threshold1" } ] } } ], "type": "bargauge" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, "id": 16, "panels": [], "title": "Controller Performance", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The average time required for a reconciliation request to be processed. 
This reflects the time required for the controller to process a single request to modify a Kubernetes resource.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 33, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 0, "y": 25 }, "id": 17, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "rate(controller_runtime_reconcile_time_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", "interval": "", "legendFormat": "{{controller}}", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Reconcile Time", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The average time a queued reconciliation request spends waiting to be processed.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 27, "gradientMode": 
"none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 6, "y": 25 }, "id": 18, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "rate(workqueue_queue_duration_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", "legendFormat": "{{controller}}", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Workqueue Queue Duration", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. 
An increasing number can indicate resource contention or delays processing API server requests.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 33, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 12, "y": 25 }, "id": 27, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "rate(controller_runtime_reconcile_errors_total{namespace=\"$SystemNamespace\"}[$__rate_interval])", "interval": "", "legendFormat": "{{controller}}", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Reconciliation Errors", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of reconcile requests that are waiting to be processed by the controller. A growing queue depth can indicate that the Kubernetes API Server or the controller does not have enough resources. This can lead to pods taking longer to be deleted or started. 
", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "fieldMinMax": false, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 100 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 0, "y": 33 }, "id": 20, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "max" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "sum (workqueue_depth{namespace=\"$SystemNamespace\"}) by (name)", "legendFormat": "__auto", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Queue Depth", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of workers that are currently being used to process reconcile requests. 
Increasing this number can reduce the work queue duration, but each new worker adds a small amount of time due to context switching.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 6, "y": 33 }, "id": 21, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "max" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "sum by (controller) (controller_runtime_active_workers{namespace=\"$SystemNamespace\"})", "legendFormat": "__auto", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Active Workers", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The number of calls to the API server", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 27, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 6, "x": 12, "y": 33 }, 
"id": 19, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "expr": "sum by (method, code) (rate(rest_client_requests_total{namespace=\"$SystemNamespace\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "__auto", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "API Calls", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, "id": 25, "panels": [], "title": "Metrics", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The time required by Prometheus to read and process metrics. Long scrape times can delay metrics updates or lead to metrics loss. Increasing time often indicates issues with metrics cardinality or cluster resources.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineStyle": { "fill": "solid" }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 10, "w": 18, "x": 0, "y": 42 }, "id": 24, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true 
}, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "editorMode": "code", "exemplar": false, "expr": "scrape_duration_seconds", "instant": false, "legendFormat": "{{job}}", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" } } ], "title": "Scrape Duration", "type": "timeseries" } ], "refresh": "5s", "schemaVersion": 40, "tags": [], "templating": { "list": [ { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(gha_controller_running_listeners,namespace)", "description": "The ARC system namespace", "includeAll": true, "label": "ARC System Namespace", "multi": true, "name": "SystemNamespace", "options": [], "query": { "qryType": 1, "query": "label_values(gha_controller_running_listeners,namespace)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", "description": "The name of the runner scale set", "includeAll": true, "label": "Scale Set", "multi": true, "name": "Scaleset", "options": [], "query": { "qryType": 1, "query": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, "regex": "", "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},actions_github_com_scale_set_namespace)", "description": "Namespace containing the runners", "includeAll": true, "label": "Runner Namespace", "multi": true, "name": "RunnerNamespace", "options": [], "query": { "qryType": 1, "query": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},actions_github_com_scale_set_namespace)", "refId": 
"PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, "regex": "", "type": "query" } ] }, "time": { "from": "now-15m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d", "7d" ] }, "timezone": "", "title": "ARC Autoscaling Runner Set Monitoring", "uid": "af21e938-2151-4bf2-b798-8cf9232f947a", "version": 1, "weekStart": "" }