HUD failure fallback: over-provision placeholders
- Add configurable HUDFailureMultiplier (default 3x) to scale placeholder count when HUD API is unreachable
- New env var CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER with clamp ≥1 in both ConfigFromEnv and Validate
- Fallback formula: ProactiveCapacity * multiplier (replaces the previous zero-queued-jobs fallback that reduced capacity)
- Add tests for multiplier clamping, MaxRunners cap interaction, and HUD-disabled path
- Bump chart versions to jeanschmidt.10

When HUD is down we lose visibility into queue depth, so the old fallback of assuming 0 queued jobs was backwards — it shrank capacity exactly when we had the least information. The multiplier-based fallback leans toward over-provisioning instead; existing safety bounds (MaxRunners headroom, MaxBurstCapacity) still cap the blast radius.

Signed-off-by: Jean Schmidt <contato@jschmidt.me>
This commit is contained in:
parent
30c1a102b4
commit
d5d94fba48
|
|
@ -15,13 +15,13 @@ type: application
|
|||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.14.1-jeanschmidt.9
|
||||
version: 0.14.1-jeanschmidt.10
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "0.14.1-jeanschmidt.9"
|
||||
appVersion: "0.14.1-jeanschmidt.10"
|
||||
|
||||
home: https://github.com/actions/actions-runner-controller
|
||||
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ type: application
|
|||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.14.1-jeanschmidt.9
|
||||
version: 0.14.1-jeanschmidt.10
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "0.14.1-jeanschmidt.9"
|
||||
appVersion: "0.14.1-jeanschmidt.10"
|
||||
|
||||
home: https://github.com/actions/actions-runner-controller
|
||||
|
||||
|
|
|
|||
|
|
@ -16,6 +16,11 @@ const (
|
|||
// proactiveCapacityWarnThreshold triggers a warning log but does
|
||||
// not clamp — operators may legitimately need >100 in surge cases.
|
||||
proactiveCapacityWarnThreshold = 100
|
||||
// defaultHUDFailureMultiplier is applied to ProactiveCapacity when the
|
||||
// HUD API is unreachable. A value >1 keeps placeholder capacity above
|
||||
// the proactive baseline during a HUD outage; outer caps (MaxRunners
|
||||
// headroom, MaxBurstCapacity) bound the absolute blast radius.
|
||||
defaultHUDFailureMultiplier = 3
|
||||
)
|
||||
|
||||
// Config holds all configuration for the capacity monitor.
|
||||
|
|
@ -62,8 +67,9 @@ type Config struct {
|
|||
ScaleSetName string
|
||||
|
||||
// HUD API
|
||||
HUDAPIURL string
|
||||
HUDAPIToken string
|
||||
HUDAPIURL string
|
||||
HUDAPIToken string
|
||||
HUDFailureMultiplier int
|
||||
}
|
||||
|
||||
// ConfigFromEnv reads capacity monitor configuration from environment
|
||||
|
|
@ -87,8 +93,9 @@ func ConfigFromEnv() Config {
|
|||
NodeFleet: envString("CAPACITY_AWARE_NODE_FLEET", ""),
|
||||
RunnerNodeFleet: envString("CAPACITY_AWARE_RUNNER_NODE_FLEET", ""),
|
||||
RunnerClass: envString("CAPACITY_AWARE_RUNNER_CLASS", ""),
|
||||
HUDAPIURL: envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
|
||||
HUDAPIToken: envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
|
||||
HUDAPIURL: envString("CAPACITY_AWARE_HUD_API_URL", defaultHUDAPIURL),
|
||||
HUDAPIToken: envString("CAPACITY_AWARE_HUD_API_TOKEN", ""),
|
||||
HUDFailureMultiplier: envInt("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER", defaultHUDFailureMultiplier),
|
||||
}
|
||||
|
||||
if c.ProactiveCapacity < 0 {
|
||||
|
|
@ -105,6 +112,12 @@ func ConfigFromEnv() Config {
|
|||
"value", c.ProactiveCapacity, "warnThreshold", proactiveCapacityWarnThreshold)
|
||||
}
|
||||
|
||||
if c.HUDFailureMultiplier < 1 {
|
||||
slog.Warn("CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER must be >= 1, clamping",
|
||||
"original", c.HUDFailureMultiplier, "clampedTo", 1)
|
||||
c.HUDFailureMultiplier = 1
|
||||
}
|
||||
|
||||
return c
|
||||
}
|
||||
|
||||
|
|
@ -123,6 +136,11 @@ func (c *Config) Validate() error {
|
|||
slog.Warn("MaxBurstCapacity is negative, clamping to 0", "original", c.MaxBurstCapacity)
|
||||
c.MaxBurstCapacity = 0
|
||||
}
|
||||
if c.HUDFailureMultiplier < 1 {
|
||||
slog.Warn("HUDFailureMultiplier must be >= 1, clamping",
|
||||
"original", c.HUDFailureMultiplier, "clampedTo", 1)
|
||||
c.HUDFailureMultiplier = 1
|
||||
}
|
||||
|
||||
if c.Enabled && c.RunnerNodeFleet == "" {
|
||||
// Hard requirement: the runner-pool fleet drives placeholder-runner
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
|
|||
"CAPACITY_AWARE_RUNNER_NODE_FLEET",
|
||||
"CAPACITY_AWARE_RUNNER_CLASS",
|
||||
"CAPACITY_AWARE_HUD_API_TOKEN",
|
||||
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER",
|
||||
}
|
||||
unsetEnvs(t, keys)
|
||||
|
||||
|
|
@ -68,6 +69,7 @@ func TestConfigFromEnv_Defaults(t *testing.T) {
|
|||
assert.Equal(t, "", cfg.RunnerNodeFleet, "RunnerNodeFleet default")
|
||||
assert.Equal(t, "", cfg.RunnerClass, "RunnerClass default")
|
||||
assert.Equal(t, "", cfg.HUDAPIToken, "HUDAPIToken default")
|
||||
assert.Equal(t, defaultHUDFailureMultiplier, cfg.HUDFailureMultiplier, "HUDFailureMultiplier default")
|
||||
// Fields set by main.go should be zero values.
|
||||
assert.Equal(t, 0, cfg.MaxRunners, "MaxRunners zero")
|
||||
assert.Equal(t, 0, cfg.ScaleSetID, "ScaleSetID zero")
|
||||
|
|
@ -90,8 +92,9 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
|
|||
"CAPACITY_AWARE_RUNNER_MEMORY": "1Gi",
|
||||
"CAPACITY_AWARE_NODE_FLEET": "gpu-fleet",
|
||||
"CAPACITY_AWARE_RUNNER_NODE_FLEET": "c7i-runner",
|
||||
"CAPACITY_AWARE_RUNNER_CLASS": "gpu-large",
|
||||
"CAPACITY_AWARE_HUD_API_TOKEN": "secret-token",
|
||||
"CAPACITY_AWARE_RUNNER_CLASS": "gpu-large",
|
||||
"CAPACITY_AWARE_HUD_API_TOKEN": "secret-token",
|
||||
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "5",
|
||||
})
|
||||
|
||||
cfg := ConfigFromEnv()
|
||||
|
|
@ -110,6 +113,7 @@ func TestConfigFromEnv_AllSet(t *testing.T) {
|
|||
assert.Equal(t, "c7i-runner", cfg.RunnerNodeFleet)
|
||||
assert.Equal(t, "gpu-large", cfg.RunnerClass)
|
||||
assert.Equal(t, "secret-token", cfg.HUDAPIToken)
|
||||
assert.Equal(t, 5, cfg.HUDFailureMultiplier)
|
||||
}
|
||||
|
||||
func TestConfigFromEnv_InvalidValues_FallbackToDefaults(t *testing.T) {
|
||||
|
|
@ -157,6 +161,31 @@ func TestConfigFromEnv_ProactiveCapacity_NegativeClampedToZero(t *testing.T) {
|
|||
"negative ProactiveCapacity must clamp to 0")
|
||||
}
|
||||
|
||||
// HUDFailureMultiplier must be >= 1 — a value below 1 would never produce
|
||||
// over-provisioning on HUD failure, defeating the purpose of the fallback.
|
||||
func TestConfigFromEnv_HUDFailureMultiplier_BelowOneClampedToOne(t *testing.T) {
|
||||
t.Run("negative", func(t *testing.T) {
|
||||
setEnvs(t, map[string]string{
|
||||
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "-5",
|
||||
})
|
||||
|
||||
cfg := ConfigFromEnv()
|
||||
|
||||
assert.Equal(t, 1, cfg.HUDFailureMultiplier,
|
||||
"negative HUDFailureMultiplier must clamp to 1")
|
||||
})
|
||||
t.Run("zero", func(t *testing.T) {
|
||||
setEnvs(t, map[string]string{
|
||||
"CAPACITY_AWARE_HUD_FAILURE_MULTIPLIER": "0",
|
||||
})
|
||||
|
||||
cfg := ConfigFromEnv()
|
||||
|
||||
assert.Equal(t, 1, cfg.HUDFailureMultiplier,
|
||||
"zero HUDFailureMultiplier must clamp to 1")
|
||||
})
|
||||
}
|
||||
|
||||
// Values above the hard cap (1000) must be clamped — protects against
|
||||
// runaway placeholder creation from a misconfiguration.
|
||||
func TestConfigFromEnv_ProactiveCapacity_AboveHardCapClamped(t *testing.T) {
|
||||
|
|
@ -195,6 +224,23 @@ func TestConfigFromEnv_ProactiveCapacity_AboveWarnAllowed(t *testing.T) {
|
|||
"values between warn threshold and hard cap are allowed")
|
||||
}
|
||||
|
||||
// Validate() must enforce HUDFailureMultiplier >= 1 for callers that
|
||||
// construct Config programmatically (bypassing ConfigFromEnv's clamp).
|
||||
func TestConfig_Validate_HUDFailureMultiplierClampedBelowOne(t *testing.T) {
|
||||
t.Run("zero", func(t *testing.T) {
|
||||
cfg := Config{HUDFailureMultiplier: 0}
|
||||
require.NoError(t, cfg.Validate())
|
||||
assert.Equal(t, 1, cfg.HUDFailureMultiplier,
|
||||
"Validate must clamp HUDFailureMultiplier=0 to 1")
|
||||
})
|
||||
t.Run("negative", func(t *testing.T) {
|
||||
cfg := Config{HUDFailureMultiplier: -3}
|
||||
require.NoError(t, cfg.Validate())
|
||||
assert.Equal(t, 1, cfg.HUDFailureMultiplier,
|
||||
"Validate must clamp negative HUDFailureMultiplier to 1")
|
||||
})
|
||||
}
|
||||
|
||||
// Validate() clamps negative MaxRunners (set by main.go after env parse).
|
||||
func TestConfig_Validate_MaxRunnersNegativeClamped(t *testing.T) {
|
||||
cfg := Config{MaxRunners: -3}
|
||||
|
|
|
|||
|
|
@ -339,15 +339,17 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
|
|||
m.recorder.ObserveReconcileDuration(reconcilePhaseProvisioner, time.Since(start))
|
||||
}()
|
||||
|
||||
// 1. Query HUD API with retry (graceful fallback to 0).
|
||||
// 1. Query HUD API with retry (graceful fallback handled below).
|
||||
queuedJobs := 0
|
||||
hudFailed := false
|
||||
if m.hudClient != nil && m.config.HUDAPIToken != "" {
|
||||
var err error
|
||||
queuedJobs, err = m.queryHUDWithRetry(ctx)
|
||||
if err != nil {
|
||||
m.logger.Warn("HUD API failed after retries, using 0 queued jobs", "error", err)
|
||||
m.logger.Warn("HUD API failed after retries, falling back to ProactiveCapacity * HUDFailureMultiplier", "error", err)
|
||||
m.recorder.IncReconcileSkips(skipReasonHUDAPIFailed)
|
||||
queuedJobs = 0
|
||||
hudFailed = true
|
||||
}
|
||||
}
|
||||
// Set even on the failure path — queuedJobs is 0 in that case, which
|
||||
|
|
@ -432,7 +434,13 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
|
|||
}
|
||||
|
||||
// 5. Calculate desired placeholder count.
|
||||
// On HUD failure, over-provision: less information about queue depth
|
||||
// means we must lean toward more capacity to keep latency bounded.
|
||||
// Headroom and burst caps below still bound the absolute blast radius.
|
||||
desiredPairs := m.config.ProactiveCapacity + queuedJobs
|
||||
if hudFailed {
|
||||
desiredPairs = m.config.ProactiveCapacity * m.config.HUDFailureMultiplier
|
||||
}
|
||||
|
||||
// Clamp by headroom against the hard runner cap. Real runner pods (running +
|
||||
// pending) consume the cap, so the placeholder pool can only fill what's left.
|
||||
|
|
@ -458,6 +466,7 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
|
|||
|
||||
m.logger.Info("provisioning reconciled",
|
||||
"queuedJobs", queuedJobs,
|
||||
"hudFailed", hudFailed,
|
||||
"desiredPairs", desiredPairs,
|
||||
"currentPairs", currentPairs,
|
||||
"runningRunnerPods", runningRunnerPods,
|
||||
|
|
|
|||
|
|
@ -266,7 +266,7 @@ func TestReconcile_SetMaxRunners_CapAtMaxRunners(t *testing.T) {
|
|||
assert.Equal(t, int32(5), maxVal.Load())
|
||||
}
|
||||
|
||||
func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
|
||||
func TestReconcile_HUDAPIFailure_FallsBackToProactiveTimesMultiplier(t *testing.T) {
|
||||
// HUD server returns 500.
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
|
|
@ -274,11 +274,12 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
|
|||
defer srv.Close()
|
||||
|
||||
cfg := Config{
|
||||
ProactiveCapacity: 3,
|
||||
MaxRunners: 20,
|
||||
ScaleSetLabels: []string{"linux.2xlarge"},
|
||||
HUDAPIToken: "test",
|
||||
PlaceholderTimeout: 5 * time.Minute,
|
||||
ProactiveCapacity: 3,
|
||||
HUDFailureMultiplier: 3,
|
||||
MaxRunners: 20,
|
||||
ScaleSetLabels: []string{"linux.2xlarge"},
|
||||
HUDAPIToken: "test",
|
||||
PlaceholderTimeout: 5 * time.Minute,
|
||||
}
|
||||
m, cs, maxVal := newTestMonitor(t, cfg, nil)
|
||||
|
||||
|
|
@ -287,12 +288,85 @@ func TestReconcile_HUDAPIFailure_FallsBackToProactiveOnly(t *testing.T) {
|
|||
m.reconcileProvisioning(context.Background())
|
||||
m.reconcileReporting(context.Background())
|
||||
|
||||
// Falls back to proactiveCapacity only: 3 pairs = 6 pods.
|
||||
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
|
||||
// HUD failure -> over-provision: ProactiveCapacity(3) * HUDFailureMultiplier(3)
|
||||
// = 9 pairs = 18 pods. Less info about queue depth means lean toward more
|
||||
// capacity; outer caps still bound the absolute blast radius.
|
||||
assert.Equal(t, 18, countPods(t, cs, "test-ns"))
|
||||
// No running pairs, so capacity = 0 (still capped at MaxRunners=20).
|
||||
assert.Equal(t, int32(0), maxVal.Load())
|
||||
}
|
||||
|
||||
// With multiplier=1, the HUD-failure fallback equals ProactiveCapacity alone.
|
||||
func TestReconcile_HUDAPIFailure_MultiplierOne(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
cfg := Config{
|
||||
ProactiveCapacity: 3,
|
||||
HUDFailureMultiplier: 1,
|
||||
MaxRunners: 20,
|
||||
ScaleSetLabels: []string{"linux.2xlarge"},
|
||||
HUDAPIToken: "test",
|
||||
PlaceholderTimeout: 5 * time.Minute,
|
||||
}
|
||||
m, cs, _ := newTestMonitor(t, cfg, nil)
|
||||
m.hudClient = NewHUDClient(srv.URL, "test")
|
||||
|
||||
m.reconcileProvisioning(context.Background())
|
||||
m.reconcileReporting(context.Background())
|
||||
|
||||
// 3 * 1 = 3 pairs = 6 pods.
|
||||
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
|
||||
}
|
||||
|
||||
// MaxRunners must clamp the multiplier-amplified fallback so a misconfigured
|
||||
// multiplier cannot exceed the hard runner cap.
|
||||
func TestReconcile_HUDAPIFailure_MultiplierClampedByMaxRunners(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
cfg := Config{
|
||||
ProactiveCapacity: 5,
|
||||
HUDFailureMultiplier: 4,
|
||||
MaxRunners: 10,
|
||||
ScaleSetLabels: []string{"linux.2xlarge"},
|
||||
HUDAPIToken: "test",
|
||||
PlaceholderTimeout: 5 * time.Minute,
|
||||
}
|
||||
m, cs, _ := newTestMonitor(t, cfg, nil)
|
||||
m.hudClient = NewHUDClient(srv.URL, "test")
|
||||
|
||||
m.reconcileProvisioning(context.Background())
|
||||
m.reconcileReporting(context.Background())
|
||||
|
||||
// 5 * 4 = 20 desired, clamped to MaxRunners=10 -> 10 pairs = 20 pods.
|
||||
assert.Equal(t, 20, countPods(t, cs, "test-ns"))
|
||||
}
|
||||
|
||||
// When HUD is disabled by config (no token), the multiplier path must not
|
||||
// trigger — only the proactive baseline applies.
|
||||
func TestReconcile_HUDDisabled_MultiplierDoesNotApply(t *testing.T) {
|
||||
cfg := Config{
|
||||
ProactiveCapacity: 3,
|
||||
HUDFailureMultiplier: 99,
|
||||
MaxRunners: 100,
|
||||
HUDAPIToken: "",
|
||||
PlaceholderTimeout: 5 * time.Minute,
|
||||
}
|
||||
m, cs, _ := newTestMonitor(t, cfg, nil)
|
||||
|
||||
m.reconcileProvisioning(context.Background())
|
||||
m.reconcileReporting(context.Background())
|
||||
|
||||
// HUD disabled -> hudFailed stays false -> desired = ProactiveCapacity(3) +
|
||||
// queuedJobs(0) = 3 pairs = 6 pods. Multiplier of 99 must not apply.
|
||||
assert.Equal(t, 6, countPods(t, cs, "test-ns"))
|
||||
}
|
||||
|
||||
func TestReconcile_IdempotentWhenAtDesired(t *testing.T) {
|
||||
cfg := Config{
|
||||
ProactiveCapacity: 2,
|
||||
|
|
|
|||
Loading…
Reference in New Issue