HUD failure fallback: over-provision placeholders

- Add configurable HUDFailureMultiplier (default 3x) to scale
  placeholder count when HUD API is unreachable
- New env var CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER with clamp ≥1
  in both ConfigFromEnv and Validate
- Fallback formula: ProactiveCapacity * multiplier (replaces the
  previous zero-queued-jobs fallback that reduced capacity)
- Add tests for multiplier clamping, MaxRunners cap interaction,
  and HUD-disabled path
- Bump chart versions to jeanschmidt.10

When HUD is down we lose visibility into queue depth, so the old
fallback of assuming 0 queued jobs was backwards — it shrank capacity
exactly when we had the least information. The multiplier-based
fallback leans toward over-provisioning instead; existing safety
bounds (MaxRunners headroom, MaxBurstCapacity) still cap the blast
radius.

Signed-off-by: Jean Schmidt <contato@jschmidt.me>
This commit is contained in:
Jean Schmidt 2026-05-15 14:23:36 -07:00
parent 30c1a102b4
commit d5d94fba48
6 changed files with 167 additions and 20 deletions

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.14.1-jeanschmidt.9
version: 0.14.1-jeanschmidt.10
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.14.1-jeanschmidt.9"
appVersion: "0.14.1-jeanschmidt.10"
home: https://github.com/actions/actions-runner-controller

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.14.1-jeanschmidt.9
version: 0.14.1-jeanschmidt.10
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.14.1-jeanschmidt.9"
appVersion: "0.14.1-jeanschmidt.10"
home: https://github.com/actions/actions-runner-controller

View File

@ -16,6 +16,11 @@ const (
// proactiveCapacityWarnThreshold triggers a warning log but does
// not clamp — operators may legitimately need >100 in surge cases.
proactiveCapacityWarnThreshold = 100
// defaultHUDFailureMultiplier is applied to ProactiveCapacity when the
// HUD API is unreachable. A value >1 keeps placeholder capacity above
// the proactive baseline during a HUD outage; outer caps (MaxRunners
// headroom, MaxBurstCapacity) bound the absolute blast radius.
defaultHUDFailureMultiplier = 3
)
// Config holds all configuration for the capacity monitor.
@ -62,8 +67,9 @@ type Config struct {
ScaleSetName string
// HUD API
HUDAPIURL string
HUDAPIToken string
HUDAPIURL string
HUDAPIToken string
HUDFailureMultiplier int
}
// ConfigFromEnv reads capacity monitor configuration from environment
@ -87,8 +93,9 @@ func ConfigFromEnv() Config {
NodeFleet: envString("CAPACITY_AWARE_NODE_FLEET", ""),
RunnerNodeFleet: envString("CAPACITY_AWARE_RUNNER_NODE_FLEET", ""),
RunnerClass: envString("CAPACITY_AWARE_RUNNER_CLASS", ""),
HUDAPIURL: envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
HUDAPIToken: envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
HUDAPIURL: envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
HUDAPIToken: envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
HUDFailureMultiplier: envInt("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER", defaultHUDFailureMultiplier),
}
if c.ProactiveCapacity < 0 {
@ -105,6 +112,12 @@ func ConfigFromEnv() Config {
"value", c.ProactiveCapacity, "warnThreshold", proactiveCapacityWarnThreshold)
}
if c.HUDFailureMultiplier < 1 {
slog.Warn("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER must be >= 1, clamping",
"original", c.HUDFailureMultiplier, "clampedTo", 1)
c.HUDFailureMultiplier = 1
}
return c
}
@ -123,6 +136,11 @@ func (c *Config) Validate() error {
slog.Warn("MaxBurstCapacity is negative, clamping to 0", "original", c.MaxBurstCapacity)
c.MaxBurstCapacity = 0
}
if c.HUDFailureMultiplier < 1 {
slog.Warn("HUDFailureMultiplier must be >= 1, clamping",
"original", c.HUDFailureMultiplier, "clampedTo", 1)
c.HUDFailureMultiplier = 1
}
if c.Enabled && c.RunnerNodeFleet == "" {
// Hard requirement: the runner-pool fleet drives placeholder-runner

View File

@ -49,6 +49,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
"CAPACITY_AWARE_RUNNER_NODE_FLEET",
"CAPACITY_AWARE_RUNNER_CLASS",
"CAPACITY_AWARE_HUD_API_TOKEN",
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER",
}
unsetEnvs(t, keys)
@ -68,6 +69,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
assert.Equal(t, "", cfg.RunnerNodeFleet, "RunnerNodeFleet default")
assert.Equal(t, "", cfg.RunnerClass, "RunnerClass default")
assert.Equal(t, "", cfg.HUDAPIToken, "HUDAPIToken default")
assert.Equal(t, defaultHUDFailureMultiplier, cfg.HUDFailureMultiplier, "HUDFailureMultiplier default")
// Fields set by main.go should be zero values.
assert.Equal(t, 0, cfg.MaxRunners, "MaxRunners zero")
assert.Equal(t, 0, cfg.ScaleSetID, "ScaleSetID zero")
@ -90,8 +92,9 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
"CAPACITY_AWARE_RUNNER_MEMORY": "1Gi",
"CAPACITY_AWARE_NODE_FLEET": "gpu-fleet",
"CAPACITY_AWARE_RUNNER_NODE_FLEET": "c7i-runner",
"CAPACITY_AWARE_RUNNER_CLASS": "gpu-large",
"CAPACITY_AWARE_HUD_API_TOKEN": "secret-token",
"CAPACITY_AWARE_RUNNER_CLASS": "gpu-large",
"CAPACITY_AWARE_HUD_API_TOKEN": "secret-token",
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "5",
})
cfg := ConfigFromEnv()
@ -110,6 +113,7 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
assert.Equal(t, "c7i-runner", cfg.RunnerNodeFleet)
assert.Equal(t, "gpu-large", cfg.RunnerClass)
assert.Equal(t, "secret-token", cfg.HUDAPIToken)
assert.Equal(t, 5, cfg.HUDFailureMultiplier)
}
func TestConfigFromEnv_InvalidValues_FallbackToDefaults(t *testing.T) {
@ -157,6 +161,31 @@ func TestConfigFromEnv_ProactiveCapacity_NegativeClampedToZero(t *testing.T) {
"negative ProactiveCapacity must clamp to 0")
}
// A HUDFailureMultiplier below 1 is invalid: the HUD-failure fallback
// multiplies ProactiveCapacity by it, so zero or a negative value would wipe
// out the proactive baseline instead of scaling it. ConfigFromEnv must floor
// such input at 1.
func TestConfigFromEnv_HUDFailureMultiplier_BelowOneClampedToOne(t *testing.T) {
	// Each case pushes one out-of-range value through the environment.
	cases := []struct {
		name   string
		raw    string
		reason string
	}{
		{name: "negative", raw: "-5", reason: "negative HUDFailureMultiplier must clamp to 1"},
		{name: "zero", raw: "0", reason: "zero HUDFailureMultiplier must clamp to 1"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			setEnvs(t, map[string]string{
				"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": tc.raw,
			})

			got := ConfigFromEnv().HUDFailureMultiplier
			assert.Equal(t, 1, got, tc.reason)
		})
	}
}
// Values above the hard cap (1000) must be clamped — protects against
// runaway placeholder creation from a misconfiguration.
func TestConfigFromEnv_ProactiveCapacity_AboveHardCapClamped(t *testing.T) {
@ -195,6 +224,23 @@ func TestConfigFromEnv_ProactiveCapacity_AboveWarnAllowed(t *testing.T) {
"values between warn threshold and hard cap are allowed")
}
// A Config built directly in code never passes through ConfigFromEnv, so
// Validate() has to apply the HUDFailureMultiplier >= 1 floor on its own.
func TestConfig_Validate_HUDFailureMultiplierClampedBelowOne(t *testing.T) {
	cases := []struct {
		name    string
		initial int
		reason  string
	}{
		{name: "zero", initial: 0, reason: "Validate must clamp HUDFailureMultiplier=0 to 1"},
		{name: "negative", initial: -3, reason: "Validate must clamp negative HUDFailureMultiplier to 1"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			cfg := Config{HUDFailureMultiplier: tc.initial}

			// Clamping is a correction, not a failure: Validate must succeed.
			err := cfg.Validate()
			require.NoError(t, err)

			assert.Equal(t, 1, cfg.HUDFailureMultiplier, tc.reason)
		})
	}
}
// Validate() clamps negative MaxRunners (set by main.go after env parse).
func TestConfig_Validate_MaxRunnersNegativeClamped(t *testing.T) {
cfg := Config{MaxRunners: -3}

View File

@ -339,15 +339,17 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
m.recorder.ObserveReconcileDuration(reconcilePhaseProvisioner, time.Since(start))
}()
// 1. Query HUD API with retry (graceful fallback to 0).
// 1. Query HUD API with retry (graceful fallback handled below).
queuedJobs := 0
hudFailed := false
if m.hudClient != nil && m.config.HUDAPIToken != "" {
var err error
queuedJobs, err = m.queryHUDWithRetry(ctx)
if err != nil {
m.logger.Warn("HUD API failed after retries, using 0 queued jobs", "error", err)
m.logger.Warn("HUD API failed after retries, falling back to ProactiveCapacity * HUDFailureMultiplier", "error", err)
m.recorder.IncReconcileSkips(skipReasonHUDAPIFailed)
queuedJobs = 0
hudFailed = true
}
}
// Set even on the failure path — queuedJobs is 0 in that case, which
@ -432,7 +434,13 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
}
// 5. Calculate desired placeholder count.
// On HUD failure, over-provision: less information about queue depth
// means we must lean toward more capacity to keep latency bounded.
// Headroom and burst caps below still bound the absolute blast radius.
desiredPairs := m.config.ProactiveCapacity + queuedJobs
if hudFailed {
desiredPairs = m.config.ProactiveCapacity * m.config.HUDFailureMultiplier
}
// Clamp by headroom against the hard runner cap. Real runner pods (running +
// pending) consume the cap, so the placeholder pool can only fill what's left.
@ -458,6 +466,7 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
m.logger.Info("provisioning reconciled",
"queuedJobs", queuedJobs,
"hudFailed", hudFailed,
"desiredPairs", desiredPairs,
"currentPairs", currentPairs,
"runningRunnerPods", runningRunnerPods,

View File

@ -266,7 +266,7 @@ func TestReconcile_SetMaxRunners_CapAtMaxRunners(t *testing.T) {
assert.Equal(t, int32(5), maxVal.Load())
}
func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
func TestReconcile_HUDAPIFailure_FallsBackToProactiveTimesMultiplier(t *testing.T) {
// HUD server returns 500.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
@ -274,11 +274,12 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
defer srv.Close()
cfg := Config{
ProactiveCapacity: 3,
MaxRunners: 20,
ScaleSetLabels: []string{"linux.2xlarge"},
HUDAPIToken: "test",
PlaceholderTimeout: 5 * time.Minute,
ProactiveCapacity: 3,
HUDFailureMultiplier: 3,
MaxRunners: 20,
ScaleSetLabels: []string{"linux.2xlarge"},
HUDAPIToken: "test",
PlaceholderTimeout: 5 * time.Minute,
}
m, cs, maxVal := newTestMonitor(t, cfg, nil)
@ -287,12 +288,85 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
m.reconcileProvisioning(context.Background())
m.reconcileReporting(context.Background())
// Falls back to proactiveCapacity only: 3 pairs = 6 pods.
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
// HUD failure -> over-provision: ProactiveCapacity(3) * HUDFailureMultiplier(3)
// = 9 pairs = 18 pods. Less info about queue depth means lean toward more
// capacity; outer caps still bound the absolute blast radius.
assert.Equal(t, 18, countPods(t, cs, "test-ns"))
// No running pairs, so capacity = 0 (still capped at MaxRunners=20).
assert.Equal(t, int32(0), maxVal.Load())
}
// With multiplier=1, the HUD-failure fallback equals ProactiveCapacity alone.
func TestReconcile_HUDAPIFailure_MultiplierOne(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
cfg := Config{
ProactiveCapacity: 3,
HUDFailureMultiplier: 1,
MaxRunners: 20,
ScaleSetLabels: []string{"linux.2xlarge"},
HUDAPIToken: "test",
PlaceholderTimeout: 5 * time.Minute,
}
m, cs, _ := newTestMonitor(t, cfg, nil)
m.hudClient = NewHUDClient(srv.URL, "test")
m.reconcileProvisioning(context.Background())
m.reconcileReporting(context.Background())
// 3 * 1 = 3 pairs = 6 pods.
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
}
// MaxRunners must clamp the multiplier-amplified fallback so a misconfigured
// multiplier cannot exceed the hard runner cap.
func TestReconcile_HUDAPIFailure_MultiplierClampedByMaxRunners(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
cfg := Config{
ProactiveCapacity: 5,
HUDFailureMultiplier: 4,
MaxRunners: 10,
ScaleSetLabels: []string{"linux.2xlarge"},
HUDAPIToken: "test",
PlaceholderTimeout: 5 * time.Minute,
}
m, cs, _ := newTestMonitor(t, cfg, nil)
m.hudClient = NewHUDClient(srv.URL, "test")
m.reconcileProvisioning(context.Background())
m.reconcileReporting(context.Background())
// 5 * 4 = 20 desired, clamped to MaxRunners=10 -> 10 pairs = 20 pods.
assert.Equal(t, 20, countPods(t, cs, "test-ns"))
}
// When HUD is disabled by config (no token), the multiplier path must not
// trigger — only the proactive baseline applies.
func TestReconcile_HUDDisabled_MultiplierDoesNotApply(t *testing.T) {
cfg := Config{
ProactiveCapacity: 3,
HUDFailureMultiplier: 99,
MaxRunners: 100,
HUDAPIToken: "",
PlaceholderTimeout: 5 * time.Minute,
}
m, cs, _ := newTestMonitor(t, cfg, nil)
m.reconcileProvisioning(context.Background())
m.reconcileReporting(context.Background())
// HUD disabled -> hudFailed stays false -> desired = ProactiveCapacity(3) +
// queuedJobs(0) = 3 pairs = 6 pods. Multiplier of 99 must not apply.
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
}
func TestReconcile_IdempotentWhenAtDesired(t *testing.T) {
cfg := Config{
ProactiveCapacity: 2,