From 46ee5cf9a26b615e22c5ff5eb240375b8b58911e Mon Sep 17 00:00:00 2001 From: Ken Muse Date: Wed, 23 Apr 2025 05:36:05 -0400 Subject: [PATCH] Revised dashboard (#4022) --- ...ARC-Autoscaling-Runner-Set-Monitoring.json | 2177 +++++++++++++++++ ...g-Runner-Set-Monitoring_1692627561838.json | 1248 ---------- .../samples/grafana-dashboard/README.md | 73 +- .../grafana-dashboard/grafana-sample.png | 4 +- 4 files changed, 2238 insertions(+), 1264 deletions(-) create mode 100644 docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json delete mode 100644 docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json new file mode 100644 index 00000000..43f53e01 --- /dev/null +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring.json @@ -0,0 +1,2177 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.5.2" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 15, + "panels": [], + "title": "Runner Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heat map showing the typical time before a job starts and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 1 + }, + "id": 7, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Turbo", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Wait Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_startup_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Startup Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heat map showing the typical time to complete a job and whether the number of jobs in that time bucket are increasing or decreasing.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 7, + "y": 1 + }, + "id": 6, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Time", + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(le) (increase(gha_job_execution_duration_seconds_bucket{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Job Execution", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of jobs assigned to the scale set. The threshold is triggered with the number of assigned jobs exceeds the number of desired runners. This indicates that not all jobs will have an available runner.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_assigned_jobs{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_desired_runners{namespace=~\"$RunnerNamespace\", actions_github_com_scale_set_name=~\"$Scaleset\"}) + 1", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners" + } + ], + "title": "Assigned Jobs", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\", actions_github_com_scale_set_name=~\"(arc-runner-set|dind-runner-set|k8s-runner-set)\"}) + 1", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runners desired by the scale set", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Desired Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of registered runners that do not have assigned jobs.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 8 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_idle_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Idle Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of workflow jobs currently executing", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 10, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum (gha_running_jobs{actions_github_com_scale_set_name=~\"$Scaleset\", actions_github_com_scale_set_namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Running Jobs", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of runners in a failed state. These runners are typically misconfigured and count against the scale set's maximum limit.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 26, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_controller_failed_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Failed Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of active scale set listeners", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": true, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 13 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_controller_running_listeners{namespace=~\"$SystemNamespace\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Listeners", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runner pods that are waiting to be created. When this number exceeds the number of pods Kubernetes reports as Waiting, it indicate cluster performance issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 13 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_controller_pending_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "Waiting" + } + ], + "title": "Pending Runners", + "transformations": [ + { + "id": "configFromData", + "options": { + "configRefId": "Waiting", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(kube_pod_container_status_waiting{namespace=~\"(arc-runners|arc-runners-dind|arc-runners-k8s)\"}) != 0 or vector(0)", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of runners registered for processing queued jobs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 13 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(gha_registered_runners{actions_github_com_scale_set_name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Registered Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of runner pods in a running state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 13 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "max(gha_controller_running_ephemeral_runners{name=~\"$Scaleset\", namespace=~\"$RunnerNamespace\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Runners", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of containers that are reporting that they were terminated by an out-of-memory condition (OOMK.iller)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "No issues detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 13 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$RunnerNamespace\"}) by (namespace)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Out of Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The peak memory used by a container in a given scale set's namespace.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "semi-dark-green", + "mode": "shades" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 12, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max(container_memory_working_set_bytes{namespace=~\"$RunnerNamespace\"}) by (namespace)", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Peak Container Memory", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The sum of the reads and writes occurring within the runner namespace.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 54, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 5, + "y": 18 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_writes_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "instant": false, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_reads_bytes_total{namespace=~\"$RunnerNamespace\"}[$__rate_interval])) > 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Container I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The Kubernetes-reported pod status.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "noValue": "No active pods", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": null + }, + { + "color": "green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 11, + "y": 18 + }, + "id": 11, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_status_ready{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "format": "time_series", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "Ready", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$RunnerNamespace\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Waiting", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Completed\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Completed", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_terminated_reason{namespace=~\"$RunnerNamespace\", reason=\"Error\"}) != 0 or vector(0)", + "hide": false, + "instant": false, + "legendFormat": "Error", + "range": true, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(gha_desired_runners)+1", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "DesiredRunners", + "useBackend": false + } + ], + "title": "Container Pod Status", + "transformations": [ + { + "id": "configFromData", + "options": { + "applyTo": { + "id": "byName", + "options": "Ready" + }, + "configRefId": "DesiredRunners", + "mappings": [ + { + "fieldName": "Time", + "handlerKey": "__ignore" + }, + { + "fieldName": "sum(gha_desired_runners) + 1", + "handlerKey": "threshold1" + }, + { + "fieldName": "sum(gha_desired_runners) -5", + "handlerKey": "threshold1" + } + ] + } + } + ], + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 16, + "panels": [], + "title": "Controller Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The average time required for a reconciliation request to be processed. This reflects the time required for the controller to process a single request to modify a Kubernetes resource.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 25 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_time_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Reconcile Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The average time a queued reconciliation request spends waiting to be processed.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 25 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(workqueue_queue_duration_seconds_sum{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Workqueue Queue Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. An increasing number can indicate resource contention or delays processing API server requests.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 33, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 25 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_errors_total{namespace=\"$SystemNamespace\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Reconciliation Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of reconcile requests that are waiting to be processed by the controller. A growing queue depth can indicate that the Kubernetes API Server or the controller does not have enough resources. This can lead to pods taking longer to be deleted or started. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 33 + }, + "id": 20, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum (workqueue_depth{namespace=\"$SystemNamespace\"}) by (name)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Queue Depth", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of workers that are currently being used to process reconcile requests. Increasing this number can reduce the work queue duration, but each new worker adds a small amount of time due to context switching.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 33 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (controller) (controller_runtime_active_workers)", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Active Workers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of calls to the API server", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 27, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 33 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (method, code) (rate(rest_client_requests_total{namespace=\"$SystemNamespace\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "API Calls", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 25, + "panels": [], + "title": "Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The time required by Prometheus to read and process metrics. Long scrape times can delay metrics updates or lead to metrics loss. Increasing time often indicates issues with metrics cardinality or cluster resources.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 18, + "x": 0, + "y": 42 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scrape_duration_seconds", + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "title": "Scrape Duration", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_controller_running_listeners,namespace)", + "description": "The ARC system namespace", + "includeAll": true, + "label": "ARC System Namespace", + "multi": true, + "name": "SystemNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_controller_running_listeners,namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "description": "The name of the runner scale set", + "includeAll": true, + "label": "Scale Set", + "multi": true, + "name": "Scaleset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners,actions_github_com_scale_set_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "description": "Namespace containing the runners", + "includeAll": true, + "label": "Runner Namespace", + "multi": true, + "name": "RunnerNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gha_desired_runners{actions_github_com_scale_set_name=~\"$Scaleset\"},namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d", + "7d" + ] + }, + "timezone": "", + "title": "ARC Autoscaling Runner Set Monitoring", + "uid": "af21e938-2151-4bf2-b798-8cf9232f947a", + "version": 1, + "weekStart": "" +} diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json deleted file mode 100644 index ed997340..00000000 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json +++ /dev/null @@ -1,1248 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Starter dashboard to monitor the behavior of the autoscaling runner set mode of actions-runner-controller", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 4, - "panels": [], - "title": "Runtime", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of active listener pods (in a running state)", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 0, - "y": 1 - }, - "id": 14, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_running_listeners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Active listeners", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of registered and running runners across namespaces", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 3, - "x": 2, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_registered_runners)", - "hide": false, - "instant": false, - "legendFormat": "Registered", - "range": true, - "refId": "Registered" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_running_ephemeral_runners)", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "Running", - "range": true, - "refId": "Running" - } - ], - "title": "Runners States", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of failed runners across namespaces", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1 - }, - { - "color": "dark-red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 5, - "y": 1 - }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "sum by(namespace) (gha_controller_failed_ephemeral_runners)", - "instant": false, - "interval": "", - "legendFormat": "__auto", - "range": true, - "refId": "Failed" - } - ], - "title": "Failed (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of pending runners across namespaces.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 7, - "y": 1 - }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_controller_pending_ephemeral_runners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Pending (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of registered runners that are not currently running a job.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 2, - "x": 9, - "y": 1 - }, - "id": 17, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(namespace) (gha_idle_runners)", - "instant": false, - "range": true, - "refId": "A" - } - ], - "title": "Idle (total)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Jobs that are assigned to the runner scale set but that are not yet accepted but it", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 11, - "y": 1 - }, - "id": 1, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_assigned_jobs)", - "instant": false, - "interval": "1m", - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Total assigned jobs per listener", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total number of jobs that are assigned to the runner scale set but that are not yet accepted vs the number of accepted and running jobs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 17, - "y": 1 - }, - "id": 3, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "10.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_assigned_jobs)", - "instant": false, - "legendFormat": "assigned job - {{job}}", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum by(job) (gha_running_jobs)", - "hide": false, - "instant": false, - "legendFormat": "running_jobs - {{job}}", - "range": true, - "refId": "B" - } - ], - "title": "Assigned vs running jobs", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "id": 10, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg by(job) (gha_job_startup_duration_seconds_sum)", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Average startup duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "id": 9, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg by(job) (gha_job_execution_duration_seconds_sum)", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Average execution duration (seconds)", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 17 - }, - "id": 6, - "panels": [], - "title": "Controllers", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 0, - "y": 18 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "rate(controller_runtime_reconcile_errors_total{job=\"arc-controller-service\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{controller}}", - "range": true, - "refId": "A" - } - ], - "title": "Reconciliation errors", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 9, - "x": 7, - "y": 18 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "rate(controller_runtime_reconcile_time_seconds_sum{job=\"arc-controller-service\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{controller}}", - "range": true, - "refId": "A" - } - ], - "title": "Reconciliation time (seconds)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 13, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "workqueue_depth{job=\"arc-controller-service\"}", - "instant": false, - "legendFormat": "{{name}}", - "range": true, - "refId": "A" - } - ], - "title": "Workqueue depth", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 26 - }, - "id": 12, - "panels": [], - "title": "Prometheus", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 27 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "scrape_duration_seconds", - "instant": false, - "legendFormat": "{{job}}", - "range": true, - "refId": "A" - } - ], - "title": "Scrape Duration (seconds)", - "type": "timeseries" - } - ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "utc", - "title": "ARC Autoscaling Runner Set Monitoring", - "uid": "afe41561-2151-4bf2-b798-79aa6c03412c", - "version": 29, - "weekStart": "" -} \ No newline at end of file diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md index aa869a73..c72961a7 100644 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/README.md @@ -1,6 +1,11 @@ # Visualizing Autoscaling Runner Scale Set metrics with Grafana -With metrics introduced in [gha-runner-scale-set-0.5.0](https://github.com/actions/actions-runner-controller/releases/tag/gha-runner-scale-set-0.5.0), you can now visualize the autoscaling behavior of your runner scale set with your tool of choice. This sample shows how to visualize the metrics with [Grafana](https://grafana.com/). +With the metrics support introduced in [gha-runner-scale-set-0.5.0](https://github.com/actions/actions-runner-controller/releases/tag/gha-runner-scale-set-0.5.0), you can visualize the autoscaling behavior of your runner scale set with your tool of choice. + +This sample dashboard shows how to visualize the metrics with [Grafana](https://grafana.com/). + +> [!NOTE] +> We do not intend to provide a supported ARC dashboard. This is simply a reference and a demonstration for how you could leverage the metrics emitted by the controller-manager and listeners to visualize the autoscaling behavior of your runner scale set. We offer no promises of future upgrades to this sample. ## Demo @@ -8,12 +13,43 @@ With metrics introduced in [gha-runner-scale-set-0.5.0](https://github.com/actio ## Setup -We do not intend to provide a supported ARC dashboard. This is simply a reference and a demonstration for how you could leverage the metrics emitted by the controller-manager and listeners to visualize the autoscaling behavior of your runner scale set. We offer no promises of future upgrades to this sample. - 1. Make sure to have [Grafana](https://grafana.com/docs/grafana/latest/installation/) and [Prometheus](https://prometheus.io/docs/prometheus/latest/installation/) running in your cluster. 2. Make sure that Prometheus is properly scraping the metrics endpoints of the controller-manager and listeners. 3. Import the [dashboard](ARC-Autoscaling-Runner-Set-Monitoring_1692627561838.json) into Grafana. +## Required metrics + +This sample relies on the suggestion listener metrics configuration in the scale set [values.yaml](https://github.com/actions/actions-runner-controller/blob/ea27448da51385470b1ce67150aa695cfa45fd3f/charts/gha-runner-scale-set/values.yaml#L129-L270). + +The following metrics are required to be scraped by Prometheus in order to populate the dashboard: + +| Metric | Required labels | Source | +| ------ | ----------- | -----| +| container_fs_writes_bytes_total | namespace | cAdvisor +| container_fs_reads_bytes_total | namespace | cAdvisor +| container_memory_working_set_bytes | namespace | cAdvisor +| controller_runtime_active_workers | controller | ARC Controller +| controller_runtime_reconcile_time_seconds_sum | namespace | ARC Controller +| controller_runtime_reconcile_errors_total | namespace | ARC Controller +| gha_assigned_jobs | actions_github_com_scale_set_name, namespace | ARC Controller +| gha_controller_failed_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_pending_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_running_ephemeral_runners | name, namespace | ARC Controller +| gha_controller_running_listeners | namespace | ARC Controller +| gha_desired_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_idle_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_job_execution_duration_seconds_bucket | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| gha_job_startup_duration_seconds_bucket | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| gha_registered_runners | actions_github_com_scale_set_name, namespace | ARC Listener +| gha_running_jobs | actions_github_com_scale_set_name, actions_github_com_scale_set_namespace | ARC Listener +| kube_pod_container_status_ready | namespace | kube-state-metrics +| kube_pod_container_status_terminated_reason | namespace, reason | kube-state-metrics +| kube_pod_container_status_waiting | namespace | kube-state-metrics +| rest_client_requests_total | code, method, namespace | ARC Controller +| scrape_duration_seconds | | prometheus +| workqueue_depth | name, namespace | ARC Controller +| workqueue_queue_duration_seconds_sum | namespace | ARC Controller + ## Details This dashboard demonstrates some of the metrics provided by ARC and the underlying Kubernetes runtime. It provides a sample visualization of the behavior of the runner scale set, the ARC controllers, and the listeners. This should not be considered a comprehensive dashboard; it is a starting point that can be used with other metrics and logs to understand the health of the cluster. Review the [GitHub documentation detailing the Actions Runner Controller metrics and how to enable them](https://docs.github.com/en/enterprise-server@3.10/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/deploying-runner-scale-sets-with-actions-runner-controller#enabling-metrics). @@ -22,16 +58,25 @@ The dashboard includes the following metrics: | Label | Description | | -------------------------------- | ----------------------------------------------------| -| Active listeners | The number of listeners currently running and attempting to manage jobs for the scale set. This should match the number of scale sets deployed. | -| Runner States | Displays the number of runners in a given state. The finished and deleted states are not included in this panel. | -| Failed (total) | The total number of ephemeral runners that have failed to properly start. This may require reviewing the custom resource and logs to identify and resolve the root causes. Common causes include resource issues and failure to pull the required image. | -| Pending (total) | The total number of ephemeral runners that ARC has requested and is waiting for Kubernetes to provide in a running state. If the Kubernetes API server is responsive, this will typically match the number of runner pods that are in a pending state. This number includes requests for runner pods that have not yet been scheduled. When this number is higher than the number of runner pods in a pending state, it can indicate performance issues with the API server and resource contention. | -| Idle (total) | The total number of ephemeral runners that are available to accept jobs across all scale sets. Keeping a pool of idle runners can enable a faster start time under load, but excessive idle runners will consume resources and can prevent nodes from scaling down. | -| Total assigned jobs per listener | The number of workflow jobs acquired and assigned to the listener. The listener must provide supporting runners to complete these jobs. Once jobs are assigned, they cannot be delegated to other listeners and must be processed by the scale set or cancelled. | -| Assigned vs running jobs | Compares the number of jobs assigned against the number of runners that are currently processing jobs. When running jobs is less than assigned jobs, it can indicate that ARC is waiting on Kubernetes to provide and start additional runners. | -| Average startup duration | The average time in seconds between when jobs are assigned and when a runner accepts the job and begins processing. An increasing duration can indicate that the cluster has resource contention or a lack of available nodes for scheduling jobs | -| Average execution duration | The average time in seconds that runners are taking to complete a job. Changes in this value reflect the efficiency of workflow jobs and the pod configuration. If the value is decreasing without changes to the job, it can indicate resource contention or CPU throttling. | +| Startup Duration | Heat map of the wait time before a job starts, with the colors indicating the increase in the number of jobs in that time bucket. An increasing time can indicate that the cluster is resource constrained and may need additional nodes or resources to handle the load. | +| Execution Duration | Heat map of the execution time for a job, with the colors indicating the increase in the number of jobs in that time bucket. Time can be affected by the number of steps in the job, the allocated CPU, and whether there is resource contention on the node that is impacting performance | +| Assigned Jobs | The number of jobs that have been assigned to the listener. This is the number of jobs that the listener is responsible for providing a runner to process. | +| Desired Runners | The number of runners that the listener is requesting from the controller. This is the number of runners required to process the assigned jobs and provide idle runners. It is limited by the configured maximum runner count for the scale set. | +| Idle Runners | The total number of ephemeral runners that are available to accept jobs across all selected scale sets. Keeping a pool of idle runners can enable a faster start time under load, but excessive idle runners will consume resources and can prevent nodes from scaling down. | +| Running Jobs | The number of runners that are currently processing jobs. | +| Failed Runners | The total number of ephemeral runners that have failed to properly start. This may require reviewing the custom resource and logs to identify and resolve the root causes. Common causes include resource issues and failure to pull the required image. | +| Listeners | The number of listeners currently running and attempting to manage jobs for the scale set. This should match the number of scale sets deployed. | +| Pending Runners | The total number of ephemeral runners that ARC has requested and is waiting for Kubernetes to provide in a running state. If the Kubernetes API server is responsive, this will typically match the number of runner pods that are in a pending state. This number includes requests for runner pods that have not yet been scheduled. When this number is higher than the number of runner pods in a pending state, it can indicate performance issues. | +| Registered Runners | The total number of ephemeral runners that have been successfully registered. | +| Active Runners | The total number of runners that are active and either available or processing jobs. | +| Out of Memory | The number of containers that have been terminated by the OOMKiller. This can indicate that the requests/ limits for one or more pods on the node were configured improperly, allowing pods to request more memory than the node had available. | +| Peak Container Memory | The maximum amount of memory used by any container in a given namespace during the selected time. This can be used for tuning the memory limits for the pods and for alerts as containers get close to their limits. +| Container I/O | Shows the number of bytes read and written to the container filesystem. This can be used to identify if the container is reading or writing a large amount of data to the filesystem, which can impact performance. | +| Container Pod Status | Shows the number of containers in each status (waiting, running, terminated, ready). This can be used to identify if there are a large number of containers that are failing to start or are in a waiting state. | +| Reconcile time | The time to perform a single reconciliation task from a controller's work queue. This metric reflects the time it takes for ARC to complete each step in the processing of creating, managing, and cleaning up runners. As this increases, it can indicate resource contention, processing delays, or delays from the API server. | +| Workqueue Queue Duration | The time items spent in the work queue for a controller before being processed. This is often related to the work queue depth; as the number of items increases, it can take an increasing amount of time for an item to be processed. | | Reconciliation errors | Reconciliation is the process of a controller ensuring the desired state and actual state of the resources match. Each time an event occurs on a resource watched by the controller, the controller is required to indicate if the new state matches the desired state. Kubernetes adds a task to the work queue for the controller to perform this reconciliation. Errors indicate that controller has not achieved a desired state and is requesting Kubernetes to queue another request for reconciliation. Ideally, this number remains close to zero. An increasing number can indicate resource contention or delays processing API server requests. This reflects Kubernetes resources that ARC is waiting to be provided or in the necessary state. As a concrete example, ARC will request the creation of a secret prior to creating the pod. If the response indicates the secret is not immediately ready, ARC will requeue the reconciliation task with the error details, incrementing this count. | -| Reconciliation time | A histogram reflecting the time in seconds to perform a single reconciliation task from the controller's work queue. A histogram counts the number of requests that are processed within a given bucket of time. This metric reflects the time it takes for ARC to complete each step in the processing of creating, managing, and cleaning up runners. As this increases, it can indicate resource contention or processing delays within Kubernetes or the API server. This displays shows an average, which may hide larger or smaller times that are occurring in the processing. | -| Workqueue depth | The number of tasks that Kubernetes queued for the ARC controllers to process. This includes reconciliation requests and tasks from ARC. ARC sequentially processes a work queue of single, small task to avoid concurrency issues. Managing a runner requires multiple steps to prepare, create, update, and delete the runner, its resources, and the ARC custom resources. As each step is completed (or trigger reconciliation), new tasks are queued for processing. As the depth increases, it indicates more tasks awaiting time from the controller. Growth indicates increasing work and may indicate Kubernetes resource contention or processing latencies. Each request for a new runner will result in multiple tasks being added to the work queue to prepare and create the runner and the related ARC custom resources. | +| Workqueue depth | The number of tasks that Kubernetes has queued for the ARC controllers to process. This includes reconciliation requests and tasks initiated by the controller. Managing a runner requires multiple steps to prepare, create, update, and delete the runner, its resources, and the ARC custom resources. As each step is completed (or trigger reconciliation), new tasks are queued for processing. The controller will then use one or more workers to process these tasks in the order they were queued. As the depth increases, it indicates more tasks awaiting time from the controller. Growth indicates increasing work and may reflect Kubernetes resource contention or processing latencies. Each request for a new runner will result in multiple tasks being added to the work queue to prepare and create the runner and the related ARC custom resources. | +| Active Workers | The number of workers that are actively processing tasks in the work queue. If the queue is empty, then there may be no workers required to process the tasks. The number of workers for the ephemeral runner is configurable in the scale set values file. | +| API Calls | Shows the number of calls to the API server by status code and HTTP method. The method indicates the type of activity being performed, while the status code indicates the result of the activity. Error codes of 500 and above often indicate a Kubernetes issue. | | Scrape Duration (seconds) | The amount of time required for Prometheus to read the configured metrics from components in the cluster. An increasing number may indicate a lack of resources for Prometheus and a risk of the process exceeding the configured timeout, leading to lost metrics data. | diff --git a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png index fd8f69cb..3860c4f0 100644 --- a/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png +++ b/docs/gha-runner-scale-set-controller/samples/grafana-dashboard/grafana-sample.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b871862ef58b3480017edfa168d54f8269c8f5c542eb27e9da3e6fcb72294ecb -size 606907 +oid sha256:9bf448c6e9dad0e9e615f82e17883cf34b09b14f5461189167b798df40106c27 +size 351602