HUD failure fallback: over-provision placeholders

- Add configurable HUDFailureMultiplier (default 3x) to scale placeholder count when HUD API is unreachable
- New env var CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER with clamp ≥1 in both ConfigFromEnv and Validate
- Fallback formula: ProactiveCapacity * multiplier (replaces the previous zero-queued-jobs fallback that reduced capacity)
- Add tests for multiplier clamping, MaxRunners cap interaction, and HUD-disabled path
- Bump chart versions to jeanschmidt.10

When HUD is down we lose visibility into queue depth, so the old fallback of assuming 0 queued jobs was backwards — it shrank capacity exactly when we had the least information. The multiplier-based fallback leans toward over-provisioning instead; existing safety bounds (MaxRunners headroom, MaxBurstCapacity) still cap the blast radius.

Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2026-05-15 14:23:36 -07:00 · 2026-05-15 14:23:36 -07:00 · d5d94fba48
parent 30c1a102b4
commit d5d94fba48
6 changed files with 167 additions and 20 deletions
--- a/charts/gha-runner-scale-set-controller/Chart.yaml
+++ b/charts/gha-runner-scale-set-controller/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.9
+version: 0.14.1-jeanschmidt.10

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.9"
+appVersion: "0.14.1-jeanschmidt.10"

 home: https://github.com/actions/actions-runner-controller

--- a/charts/gha-runner-scale-set/Chart.yaml
+++ b/charts/gha-runner-scale-set/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.9
+version: 0.14.1-jeanschmidt.10

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.9"
+appVersion: "0.14.1-jeanschmidt.10"

 home: https://github.com/actions/actions-runner-controller

--- a/cmd/ghalistener/capacity/config.go
+++ b/cmd/ghalistener/capacity/config.go
@@ -16,6 +16,11 @@ const (
 	// proactiveCapacityWarnThreshold triggers a warning log but does
 	// not clamp — operators may legitimately need >100 in surge cases.
 	proactiveCapacityWarnThreshold = 100
+	// defaultHUDFailureMultiplier is applied to ProactiveCapacity when the
+	// HUD API is unreachable. A value >1 keeps placeholder capacity above
+	// the proactive baseline during a HUD outage; outer caps (MaxRunners
+	// headroom, MaxBurstCapacity) bound the absolute blast radius.
+	defaultHUDFailureMultiplier = 3
 )

 // Config holds all configuration for the capacity monitor.
@@ -62,8 +67,9 @@ type Config struct {
 	ScaleSetName string

 	// HUD API
-	HUDAPIURL   string
-	HUDAPIToken string
+	HUDAPIURL            string
+	HUDAPIToken          string
+	HUDFailureMultiplier int
 }

 // ConfigFromEnv reads capacity monitor configuration from environment
@@ -87,8 +93,9 @@ func ConfigFromEnv() Config {
 		NodeFleet:           envString("CAPACITY_AWARE_NODE_FLEET", ""),
 		RunnerNodeFleet:     envString("CAPACITY_AWARE_RUNNER_NODE_FLEET", ""),
 		RunnerClass:         envString("CAPACITY_AWARE_RUNNER_CLASS", ""),
-		HUDAPIURL:           envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
-		HUDAPIToken:         envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
+		HUDAPIURL:            envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
+		HUDAPIToken:          envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
+		HUDFailureMultiplier: envInt("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER", defaultHUDFailureMultiplier),
 	}

 	if c.ProactiveCapacity < 0 {
@@ -105,6 +112,12 @@ func ConfigFromEnv() Config {
 			"value", c.ProactiveCapacity, "warnThreshold", proactiveCapacityWarnThreshold)
 	}

+	if c.HUDFailureMultiplier < 1 {
+		slog.Warn("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER must be >= 1, clamping",
+			"original", c.HUDFailureMultiplier, "clampedTo", 1)
+		c.HUDFailureMultiplier = 1
+	}
+
 	return c
 }

@@ -123,6 +136,11 @@ func (c *Config) Validate() error {
 		slog.Warn("MaxBurstCapacity is negative, clamping to 0", "original", c.MaxBurstCapacity)
 		c.MaxBurstCapacity = 0
 	}
+	if c.HUDFailureMultiplier < 1 {
+		slog.Warn("HUDFailureMultiplier must be >= 1, clamping",
+			"original", c.HUDFailureMultiplier, "clampedTo", 1)
+		c.HUDFailureMultiplier = 1
+	}

 	if c.Enabled && c.RunnerNodeFleet == "" {
 		// Hard requirement: the runner-pool fleet drives placeholder-runner
--- a/cmd/ghalistener/capacity/config_test.go
+++ b/cmd/ghalistener/capacity/config_test.go
@@ -49,6 +49,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
 		"CAPACITY_AWARE_RUNNER_NODE_FLEET",
 		"CAPACITY_AWARE_RUNNER_CLASS",
 		"CAPACITY_AWARE_HUD_API_TOKEN",
+		"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER",
 	}
 	unsetEnvs(t, keys)

@@ -68,6 +69,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
 	assert.Equal(t, "", cfg.RunnerNodeFleet, "RunnerNodeFleet default")
 	assert.Equal(t, "", cfg.RunnerClass, "RunnerClass default")
 	assert.Equal(t, "", cfg.HUDAPIToken, "HUDAPIToken default")
+	assert.Equal(t, defaultHUDFailureMultiplier, cfg.HUDFailureMultiplier, "HUDFailureMultiplier default")
 	// Fields set by main.go should be zero values.
 	assert.Equal(t, 0, cfg.MaxRunners, "MaxRunners zero")
 	assert.Equal(t, 0, cfg.ScaleSetID, "ScaleSetID zero")
@@ -90,8 +92,9 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
 		"CAPACITY_AWARE_RUNNER_MEMORY":        "1Gi",
 		"CAPACITY_AWARE_NODE_FLEET":           "gpu-fleet",
 		"CAPACITY_AWARE_RUNNER_NODE_FLEET":    "c7i-runner",
-		"CAPACITY_AWARE_RUNNER_CLASS":         "gpu-large",
-		"CAPACITY_AWARE_HUD_API_TOKEN":        "secret-token",
+		"CAPACITY_AWARE_RUNNER_CLASS":           "gpu-large",
+		"CAPACITY_AWARE_HUD_API_TOKEN":          "secret-token",
+		"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "5",
 	})

 	cfg := ConfigFromEnv()
@@ -110,6 +113,7 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
 	assert.Equal(t, "c7i-runner", cfg.RunnerNodeFleet)
 	assert.Equal(t, "gpu-large", cfg.RunnerClass)
 	assert.Equal(t, "secret-token", cfg.HUDAPIToken)
+	assert.Equal(t, 5, cfg.HUDFailureMultiplier)
 }

 func TestConfigFromEnv_InvalidValues_FallbackToDefaults(t *testing.T) {
@@ -157,6 +161,31 @@ func TestConfigFromEnv_ProactiveCapacity_NegativeClampedToZero(t *testing.T) {
 		"negative ProactiveCapacity must clamp to 0")
 }

+// HUDFailureMultiplier must be >= 1 — a value below 1 would never produce
+// over-provisioning on HUD failure, defeating the purpose of the fallback.
+func TestConfigFromEnv_HUDFailureMultiplier_BelowOneClampedToOne(t *testing.T) {
+	t.Run("negative", func(t *testing.T) {
+		setEnvs(t, map[string]string{
+			"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "-5",
+		})
+
+		cfg := ConfigFromEnv()
+
+		assert.Equal(t, 1, cfg.HUDFailureMultiplier,
+			"negative HUDFailureMultiplier must clamp to 1")
+	})
+	t.Run("zero", func(t *testing.T) {
+		setEnvs(t, map[string]string{
+			"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "0",
+		})
+
+		cfg := ConfigFromEnv()
+
+		assert.Equal(t, 1, cfg.HUDFailureMultiplier,
+			"zero HUDFailureMultiplier must clamp to 1")
+	})
+}
+
 // Values above the hard cap (1000) must be clamped — protects against
 // runaway placeholder creation from a misconfiguration.
 func TestConfigFromEnv_ProactiveCapacity_AboveHardCapClamped(t *testing.T) {
@@ -195,6 +224,23 @@ func TestConfigFromEnv_ProactiveCapacity_AboveWarnAllowed(t *testing.T) {
 		"values between warn threshold and hard cap are allowed")
 }

+// Validate() must enforce HUDFailureMultiplier >= 1 for callers that
+// construct Config programmatically (bypassing ConfigFromEnv's clamp).
+func TestConfig_Validate_HUDFailureMultiplierClampedBelowOne(t *testing.T) {
+	t.Run("zero", func(t *testing.T) {
+		cfg := Config{HUDFailureMultiplier: 0}
+		require.NoError(t, cfg.Validate())
+		assert.Equal(t, 1, cfg.HUDFailureMultiplier,
+			"Validate must clamp HUDFailureMultiplier=0 to 1")
+	})
+	t.Run("negative", func(t *testing.T) {
+		cfg := Config{HUDFailureMultiplier: -3}
+		require.NoError(t, cfg.Validate())
+		assert.Equal(t, 1, cfg.HUDFailureMultiplier,
+			"Validate must clamp negative HUDFailureMultiplier to 1")
+	})
+}
+
 // Validate() clamps negative MaxRunners (set by main.go after env parse).
 func TestConfig_Validate_MaxRunnersNegativeClamped(t *testing.T) {
 	cfg := Config{MaxRunners: -3}
--- a/cmd/ghalistener/capacity/monitor.go
+++ b/cmd/ghalistener/capacity/monitor.go
@@ -339,15 +339,17 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
 		m.recorder.ObserveReconcileDuration(reconcilePhaseProvisioner, time.Since(start))
 	}()

-	// 1. Query HUD API with retry (graceful fallback to 0).
+	// 1. Query HUD API with retry (graceful fallback handled below).
 	queuedJobs := 0
+	hudFailed := false
 	if m.hudClient != nil && m.config.HUDAPIToken != "" {
 		var err error
 		queuedJobs, err = m.queryHUDWithRetry(ctx)
 		if err != nil {
-			m.logger.Warn("HUD API failed after retries, using 0 queued jobs", "error", err)
+			m.logger.Warn("HUD API failed after retries, falling back to ProactiveCapacity * HUDFailureMultiplier", "error", err)
 			m.recorder.IncReconcileSkips(skipReasonHUDAPIFailed)
 			queuedJobs = 0
+			hudFailed = true
 		}
 	}
 	// Set even on the failure path — queuedJobs is 0 in that case, which
@@ -432,7 +434,13 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
 	}

 	// 5. Calculate desired placeholder count.
+	// On HUD failure, over-provision: less information about queue depth
+	// means we must lean toward more capacity to keep latency bounded.
+	// Headroom and burst caps below still bound the absolute blast radius.
 	desiredPairs := m.config.ProactiveCapacity + queuedJobs
+	if hudFailed {
+		desiredPairs = m.config.ProactiveCapacity * m.config.HUDFailureMultiplier
+	}

 	// Clamp by headroom against the hard runner cap. Real runner pods (running +
 	// pending) consume the cap, so the placeholder pool can only fill what's left.
@@ -458,6 +466,7 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {

 	m.logger.Info("provisioning reconciled",
 		"queuedJobs", queuedJobs,
+		"hudFailed", hudFailed,
 		"desiredPairs", desiredPairs,
 		"currentPairs", currentPairs,
 		"runningRunnerPods", runningRunnerPods,
--- a/cmd/ghalistener/capacity/monitor_test.go
+++ b/cmd/ghalistener/capacity/monitor_test.go
@@ -266,7 +266,7 @@ func TestReconcile_SetMaxRunners_CapAtMaxRunners(t *testing.T) {
 	assert.Equal(t, int32(5), maxVal.Load())
 }

-func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
+func TestReconcile_HUDAPIFailure_FallsBackToProactiveTimesMultiplier(t *testing.T) {
 	// HUD server returns 500.
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusInternalServerError)
@@ -274,11 +274,12 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
 	defer srv.Close()

 	cfg := Config{
-		ProactiveCapacity:  3,
-		MaxRunners:         20,
-		ScaleSetLabels:     []string{"linux.2xlarge"},
-		HUDAPIToken:        "test",
-		PlaceholderTimeout: 5 * time.Minute,
+		ProactiveCapacity:    3,
+		HUDFailureMultiplier: 3,
+		MaxRunners:           20,
+		ScaleSetLabels:       []string{"linux.2xlarge"},
+		HUDAPIToken:          "test",
+		PlaceholderTimeout:   5 * time.Minute,
 	}
 	m, cs, maxVal := newTestMonitor(t, cfg, nil)

@@ -287,12 +288,85 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
 	m.reconcileProvisioning(context.Background())
 	m.reconcileReporting(context.Background())

-	// Falls back to proactiveCapacity only: 3 pairs = 6 pods.
-	assert.Equal(t, 6, countPods(t, cs, "test-ns"))
+	// HUD failure -> over-provision: ProactiveCapacity(3) * HUDFailureMultiplier(3)
+	// = 9 pairs = 18 pods. Less info about queue depth means lean toward more
+	// capacity; outer caps still bound the absolute blast radius.
+	assert.Equal(t, 18, countPods(t, cs, "test-ns"))
 	// No running pairs, so capacity = 0 (still capped at MaxRunners=20).
 	assert.Equal(t, int32(0), maxVal.Load())
 }

+// With multiplier=1, the HUD-failure fallback equals ProactiveCapacity alone.
+func TestReconcile_HUDAPIFailure_MultiplierOne(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+
+	cfg := Config{
+		ProactiveCapacity:    3,
+		HUDFailureMultiplier: 1,
+		MaxRunners:           20,
+		ScaleSetLabels:       []string{"linux.2xlarge"},
+		HUDAPIToken:          "test",
+		PlaceholderTimeout:   5 * time.Minute,
+	}
+	m, cs, _ := newTestMonitor(t, cfg, nil)
+	m.hudClient = NewHUDClient(srv.URL, "test")
+
+	m.reconcileProvisioning(context.Background())
+	m.reconcileReporting(context.Background())
+
+	// 3 * 1 = 3 pairs = 6 pods.
+	assert.Equal(t, 6, countPods(t, cs, "test-ns"))
+}
+
+// MaxRunners must clamp the multiplier-amplified fallback so a misconfigured
+// multiplier cannot exceed the hard runner cap.
+func TestReconcile_HUDAPIFailure_MultiplierClampedByMaxRunners(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+
+	cfg := Config{
+		ProactiveCapacity:    5,
+		HUDFailureMultiplier: 4,
+		MaxRunners:           10,
+		ScaleSetLabels:       []string{"linux.2xlarge"},
+		HUDAPIToken:          "test",
+		PlaceholderTimeout:   5 * time.Minute,
+	}
+	m, cs, _ := newTestMonitor(t, cfg, nil)
+	m.hudClient = NewHUDClient(srv.URL, "test")
+
+	m.reconcileProvisioning(context.Background())
+	m.reconcileReporting(context.Background())
+
+	// 5 * 4 = 20 desired, clamped to MaxRunners=10 -> 10 pairs = 20 pods.
+	assert.Equal(t, 20, countPods(t, cs, "test-ns"))
+}
+
+// When HUD is disabled by config (no token), the multiplier path must not
+// trigger — only the proactive baseline applies.
+func TestReconcile_HUDDisabled_MultiplierDoesNotApply(t *testing.T) {
+	cfg := Config{
+		ProactiveCapacity:    3,
+		HUDFailureMultiplier: 99,
+		MaxRunners:           100,
+		HUDAPIToken:          "",
+		PlaceholderTimeout:   5 * time.Minute,
+	}
+	m, cs, _ := newTestMonitor(t, cfg, nil)
+
+	m.reconcileProvisioning(context.Background())
+	m.reconcileReporting(context.Background())
+
+	// HUD disabled -> hudFailed stays false -> desired = ProactiveCapacity(3) +
+	// queuedJobs(0) = 3 pairs = 6 pods. Multiplier of 99 must not apply.
+	assert.Equal(t, 6, countPods(t, cs, "test-ns"))
+}
+
 func TestReconcile_IdempotentWhenAtDesired(t *testing.T) {
 	cfg := Config{
 		ProactiveCapacity:  2,