Detect and replace broken placeholder pairs

- Add CleanupBroken to PlaceholderManager to find slots
  where one of two pods was evicted/deleted
- Integrate broken-pair cleanup into reconcileProvisioning
  between ListPairs and adjustPairs so replacement happens
  in the same cycle
- Add "broken" delete reason with Prometheus metrics
- Add unit tests for both successful and failed cleanup
- Bump Helm chart versions to jeanschmidt.8

Without this fix, a broken pair (one pod missing) would
count as healthy in currentPairs, causing the provisioner
to believe capacity was at desired level. Pre-warmed
capacity would be permanently reduced until the next full
listener restart.

Signed-off-by: Jean Schmidt <contato@jschmidt.me>
This commit is contained in:
Jean Schmidt 2026-05-01 13:26:59 -07:00
parent a3c294ab14
commit d219a11c89
7 changed files with 356 additions and 5 deletions

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.14.1-jeanschmidt.7
version: 0.14.1-jeanschmidt.8
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.14.1-jeanschmidt.7"
appVersion: "0.14.1-jeanschmidt.8"
home: https://github.com/actions/actions-runner-controller

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.14.1-jeanschmidt.7
version: 0.14.1-jeanschmidt.8
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.14.1-jeanschmidt.7"
appVersion: "0.14.1-jeanschmidt.8"
home: https://github.com/actions/actions-runner-controller

View File

@ -34,6 +34,7 @@ const (
deleteReasonOrphan = "orphan"
deleteReasonTimeout = "timeout"
deleteReasonExcess = "excess"
deleteReasonBroken = "broken"
skipReasonProvisionerListPairs = "provisioner_list_pairs"
skipReasonProvisionerListRunners = "provisioner_list_runners"
@ -380,6 +381,27 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
m.recorder.IncReconcileSkips(skipReasonProvisionerListPairs)
return
}
// 3b. Cleanup broken pairs (slots with only one of two pods). Without
// this, currentPairs would count broken slots as healthy and the
// provisioner would not re-create capacity to replace them. The
// surviving orphan pod is deleted; the next adjustPairs step will
// create a fresh full pair to fill the freed slot in this same cycle.
brokenSuccess, brokenFailed, brokenSlots := m.placeholders.CleanupBroken(ctx, pairs)
for _, slotID := range brokenSlots {
delete(pairs, slotID)
}
for i := 0; i < brokenSuccess; i++ {
m.recorder.IncPairDeletes(deleteReasonBroken, resultSuccess)
}
for i := 0; i < brokenFailed; i++ {
m.recorder.IncPairDeletes(deleteReasonBroken, resultError)
}
if brokenSuccess > 0 || brokenFailed > 0 {
m.logger.Info("cleaned up broken placeholder pairs",
"success", brokenSuccess, "failed", brokenFailed)
}
currentPairs := len(pairs)
m.recorder.SetPairs(currentPairs)
m.emitPlaceholderPodPhases(pairs)

View File

@ -459,6 +459,155 @@ func TestRetryWithBackoff_RespectsContextCancellation(t *testing.T) {
assert.ErrorIs(t, err, context.Canceled)
}
// TestProvisioner_BrokenPair_TriggersRecreation simulates a broken slot
// (only the runner pod present, workflow pod has been deleted/evicted) and
// asserts that the provisioner deletes the orphan pod, excludes the slot
// from currentPairs in the SAME cycle, and creates a fresh full pair to
// fill the freed slot — restoring the pre-warmed capacity.
//
// Without CleanupBroken, the slot would count as healthy and the provisioner
// would think it was already at desired, leaving capacity permanently low.
func TestProvisioner_BrokenPair_TriggersRecreation(t *testing.T) {
cfg := Config{
ProactiveCapacity: 5,
MaxRunners: 20,
PlaceholderTimeout: 5 * time.Minute,
}
rec := newFakeCapacityRecorder()
m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec)
ctx := context.Background()
// Initial reconcile creates 5 healthy pairs (10 pods).
m.reconcileProvisioning(ctx)
assert.Equal(t, 10, countPods(t, cs, "test-ns"),
"5 healthy pairs after first reconcile")
// Break one slot by deleting just its workflow pod (simulating
// eviction/crash of the workflow placeholder).
pairs, err := m.placeholders.ListPairs(ctx)
require.NoError(t, err)
require.Len(t, pairs, 5)
var brokenSlotID string
for slotID := range pairs {
brokenSlotID = slotID
break
}
wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," +
labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
})
require.NoError(t, err)
require.Len(t, wfPods.Items, 1)
require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
))
// 4 pairs healthy + 1 surviving runner pod = 9 pods.
assert.Equal(t, 9, countPods(t, cs, "test-ns"),
"after break: 4 healthy pairs + 1 orphan runner pod")
// Run provisioning again. The broken slot should be detected,
// its surviving runner pod deleted, and a fresh full pair created
// in the SAME cycle (because CleanupBroken updates pairs and
// currentPairs in-place before adjustPairs runs).
m.reconcileProvisioning(ctx)
// End state: 5 healthy pairs (= 10 pods).
assert.Equal(t, 10, countPods(t, cs, "test-ns"),
"5 healthy pairs restored in the same cycle")
// The original broken slot must no longer exist (it was deleted).
finalPairs, err := m.placeholders.ListPairs(ctx)
require.NoError(t, err)
assert.Len(t, finalPairs, 5, "exactly 5 pairs in steady state")
assert.NotContains(t, finalPairs, brokenSlotID,
"broken slot must be gone (replaced by a fresh slot)")
for slotID, pair := range finalPairs {
assert.NotNilf(t, pair.RunnerPod, "slot %s must have runner pod", slotID)
assert.NotNilf(t, pair.WorkflowPod, "slot %s must have workflow pod", slotID)
}
// Metrics: one broken-success delete recorded, plus one new pair create.
rec.mu.Lock()
defer rec.mu.Unlock()
assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess],
"one broken delete recorded as success")
assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError],
"no broken delete failures")
// Pair creates: 5 (initial) + 1 (replacement). Cycles run twice.
assert.Equal(t, 6, rec.incPairCreatesCalls[resultSuccess],
"5 initial creates + 1 replacement create")
}
// TestProvisioner_BrokenPair_DeleteFailure simulates a DeletePair
// failure during broken-pair cleanup. The cycle must not crash, the
// slot must still be excluded from currentPairs (so the next cycle
// re-detects it), and a broken-error metric must be recorded.
func TestProvisioner_BrokenPair_DeleteFailure(t *testing.T) {
cfg := Config{
ProactiveCapacity: 3,
MaxRunners: 20,
PlaceholderTimeout: 5 * time.Minute,
}
rec := newFakeCapacityRecorder()
m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec)
ctx := context.Background()
// Initial reconcile creates 3 healthy pairs.
m.reconcileProvisioning(ctx)
require.Equal(t, 6, countPods(t, cs, "test-ns"))
// Break one slot.
pairs, err := m.placeholders.ListPairs(ctx)
require.NoError(t, err)
var brokenSlotID string
for slotID := range pairs {
brokenSlotID = slotID
break
}
wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," +
labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
})
require.NoError(t, err)
require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
))
// Inject a delete-collection failure that targets only the broken
// slot (so the second-cycle replacement-create + the orphan-runner
// cleanup the next cycle does aren't blocked by the same reactor).
cs.PrependReactor("delete-collection", "pods",
func(action k8stesting.Action) (bool, runtime.Object, error) {
dca := action.(k8stesting.DeleteCollectionActionImpl)
sel := dca.GetListOptions().LabelSelector
if strings.Contains(sel, labelPlaceholderID+"="+brokenSlotID) {
return true, nil, fmt.Errorf("simulated delete failure")
}
return false, nil, nil
},
)
// Run the provisioner — must not crash even though delete fails.
assert.NotPanics(t, func() { m.reconcileProvisioning(ctx) })
// The broken slot must still have been excluded from currentPairs,
// so the provisioner created a replacement pair: total existing pairs
// = 4 (3 originals minus 1 still-broken + 1 new replacement).
pairs, err = m.placeholders.ListPairs(ctx)
require.NoError(t, err)
assert.Len(t, pairs, 4,
"4 pairs total: 2 untouched healthy + 1 still-broken (delete failed) + 1 new replacement")
// Metrics: one broken-error delete recorded.
rec.mu.Lock()
defer rec.mu.Unlock()
assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess],
"delete failed -> no broken success")
assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError],
"one broken delete failure recorded")
}
// ---- capacity recorder wiring tests ----
//
// These tests assert that the monitor calls the right CapacityRecorder

View File

@ -164,6 +164,50 @@ func (pm *PlaceholderManager) CleanupTimedOut(ctx context.Context) (int, int, er
return deleted, failed, nil
}
// CleanupBroken deletes pairs where only one of the two placeholder pods
// exists (the other was evicted, deleted, or never landed). The surviving
// orphan pod is deleted via DeletePair so the next reconcile cycle can
// create a fresh full pair via the normal adjustPairs path.
//
// A broken pair cannot serve work — the missing pod means the slot reserves
// a node but no workload can land. Without this cleanup, currentPairs would
// count the broken slot as healthy and the provisioner would not re-create
// capacity to replace it.
//
// Operates on the supplied pairs map (already fetched by the caller via
// ListPairs) — no fresh List call. Returns (successful deletes, failed
// deletes, deleted slot IDs). The caller should remove the returned slot
// IDs from its local pairs map so the broken slots are excluded from
// currentPairs in the same reconcile cycle (even on delete failure —
// counting a broken slot as healthy here would mask the broken state;
// the next cycle will re-detect any leftover state and retry).
func (pm *PlaceholderManager) CleanupBroken(
ctx context.Context,
pairs map[string]*PlaceholderPair,
) (int, int, []string) {
var deletedSlots []string
deleted, failed := 0, 0
for slotID, pair := range pairs {
if pair.RunnerPod != nil && pair.WorkflowPod != nil {
continue
}
if err := pm.DeletePair(ctx, slotID); err != nil {
pm.logger.Warn("failed to delete broken pair",
"slotID", slotID,
"hasRunner", pair.RunnerPod != nil,
"hasWorkflow", pair.WorkflowPod != nil,
"error", err,
)
failed++
deletedSlots = append(deletedSlots, slotID)
continue
}
deleted++
deletedSlots = append(deletedSlots, slotID)
}
return deleted, failed, deletedSlots
}
// CleanupOrphans deletes placeholder pods and anchor ConfigMaps from
// previous listener instances of the SAME scale set (different
// listener-pod label value but same scale-set-name label).

View File

@ -230,6 +230,142 @@ func TestCleanupTimedOut(t *testing.T) {
assert.Contains(t, pairs, "slot-new")
}
// TestCleanupBroken_DeletesSlotWithOnlyRunner verifies that a slot
// missing its workflow pod is identified as broken, deleted, and
// returned in the deleted slot list.
func TestCleanupBroken_DeletesSlotWithOnlyRunner(t *testing.T) {
pm, cs := newTestPM(t, Config{})
ctx := context.Background()
// Create a healthy pair, then delete just the workflow pod to leave
// a broken slot with only the runner pod remaining.
require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=broken-slot," +
labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
})
require.NoError(t, err)
require.Len(t, pods.Items, 1, "workflow pod must exist before delete")
require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
ctx, pods.Items[0].Name, metav1.DeleteOptions{},
))
pairs, err := pm.ListPairs(ctx)
require.NoError(t, err)
require.Len(t, pairs, 1)
require.NotNil(t, pairs["broken-slot"].RunnerPod)
require.Nil(t, pairs["broken-slot"].WorkflowPod)
deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
assert.Equal(t, 1, deleted, "one broken slot deleted")
assert.Equal(t, 0, failed, "no failures")
assert.Equal(t, []string{"broken-slot"}, deletedSlots,
"deleted slot ID returned")
// All pods for that slot must be gone.
pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=broken-slot",
})
require.NoError(t, err)
assert.Empty(t, pods.Items, "no surviving pods for broken slot")
}
// TestCleanupBroken_DeletesSlotWithOnlyWorkflow is the symmetric case:
// a slot missing its runner pod must also be deleted.
func TestCleanupBroken_DeletesSlotWithOnlyWorkflow(t *testing.T) {
pm, cs := newTestPM(t, Config{})
ctx := context.Background()
require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=broken-slot," +
labelPlaceholderRole + "=" + rolePlaceholderRunner,
})
require.NoError(t, err)
require.Len(t, pods.Items, 1, "runner pod must exist before delete")
require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
ctx, pods.Items[0].Name, metav1.DeleteOptions{},
))
pairs, err := pm.ListPairs(ctx)
require.NoError(t, err)
require.Len(t, pairs, 1)
require.Nil(t, pairs["broken-slot"].RunnerPod)
require.NotNil(t, pairs["broken-slot"].WorkflowPod)
deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
assert.Equal(t, 1, deleted, "one broken slot deleted")
assert.Equal(t, 0, failed, "no failures")
assert.Equal(t, []string{"broken-slot"}, deletedSlots)
pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=broken-slot",
})
require.NoError(t, err)
assert.Empty(t, pods.Items, "no surviving pods for broken slot")
}
// TestCleanupBroken_LeavesHealthyPairs verifies that fully populated
// pairs (both runner + workflow pods present) are NOT deleted.
func TestCleanupBroken_LeavesHealthyPairs(t *testing.T) {
pm, cs := newTestPM(t, Config{})
ctx := context.Background()
require.NoError(t, pm.CreatePair(ctx, "slot-a"))
require.NoError(t, pm.CreatePair(ctx, "slot-b"))
pairs, err := pm.ListPairs(ctx)
require.NoError(t, err)
require.Len(t, pairs, 2)
deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
assert.Equal(t, 0, deleted, "no pairs deleted")
assert.Equal(t, 0, failed, "no failures")
assert.Empty(t, deletedSlots, "no slot IDs returned")
// All 4 pods (2 pairs × 2 pods) must still exist.
pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{})
require.NoError(t, err)
assert.Len(t, pods.Items, 4, "all healthy pair pods preserved")
}
// TestCleanupBroken_DeleteFailureStillExcludesSlot verifies that a
// DeletePair error still causes the slot ID to be returned (so the
// caller excludes it from currentPairs) and is recorded as failed.
func TestCleanupBroken_DeleteFailureStillExcludesSlot(t *testing.T) {
pm, cs := newTestPM(t, Config{})
ctx := context.Background()
// Build a broken pair (only runner present).
require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
LabelSelector: labelPlaceholderID + "=broken-slot," +
labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
})
require.NoError(t, err)
require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
))
pairs, err := pm.ListPairs(ctx)
require.NoError(t, err)
require.Len(t, pairs, 1)
// Inject a delete-collection failure for ALL subsequent calls.
cs.PrependReactor("delete-collection", "pods",
func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, nil, errors.New("simulated delete failure")
},
)
deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
assert.Equal(t, 0, deleted, "delete failed -> 0 successful deletes")
assert.Equal(t, 1, failed, "one failed delete")
assert.Equal(t, []string{"broken-slot"}, deletedSlots,
"slot must still be returned so caller excludes it from currentPairs "+
"(counting it as healthy would mask the broken state)")
}
func TestCleanupOrphans(t *testing.T) {
cfg := Config{Namespace: "test-ns", ScaleSetName: "sset"}
cs := newFakeClientset()

View File

@ -428,7 +428,7 @@ func TestExporterCapacityRecorder_Counters(t *testing.T) {
assert.Equal(t, 2.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "success"))
assert.Equal(t, 1.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "error"))
for _, reason := range []string{"timeout", "orphan", "excess"} {
for _, reason := range []string{"timeout", "orphan", "excess", "broken"} {
e.IncPairDeletes(reason, "success")
e.IncPairDeletes(reason, "error")
e.IncPairDeletes(reason, "success")