Detect and replace broken placeholder pairs

- Add CleanupBroken to PlaceholderManager to find slots where one of two pods was evicted/deleted - Integrate broken-pair cleanup into reconcileProvisioning between ListPairs and adjustPairs so replacement happens in the same cycle - Add "broken" delete reason with Prometheus metrics - Add unit tests for both successful and failed cleanup - Bump Helm chart versions to jeanschmidt.8 Without this fix, a broken pair (one pod missing) would count as healthy in currentPairs, causing the provisioner to believe capacity was at desired level. Pre-warmed capacity would be permanently reduced until the next full listener restart. Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2026-05-01 13:26:59 -07:00 · 2026-05-01 13:26:59 -07:00 · d219a11c89
parent a3c294ab14
commit d219a11c89
7 changed files with 356 additions and 5 deletions
--- a/charts/gha-runner-scale-set-controller/Chart.yaml
+++ b/charts/gha-runner-scale-set-controller/Chart.yaml
@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.7
+version: 0.14.1-jeanschmidt.8

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.7"
+appVersion: "0.14.1-jeanschmidt.8"

 home: https://github.com/actions/actions-runner-controller

--- a/charts/gha-runner-scale-set/Chart.yaml
+++ b/charts/gha-runner-scale-set/Chart.yaml
@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.7
+version: 0.14.1-jeanschmidt.8

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.7"
+appVersion: "0.14.1-jeanschmidt.8"

 home: https://github.com/actions/actions-runner-controller

--- a/cmd/ghalistener/capacity/monitor.go
+++ b/cmd/ghalistener/capacity/monitor.go
@ -34,6 +34,7 @@ const (
 	deleteReasonOrphan  = "orphan"
 	deleteReasonTimeout = "timeout"
 	deleteReasonExcess  = "excess"
+	deleteReasonBroken  = "broken"

 	skipReasonProvisionerListPairs   = "provisioner_list_pairs"
 	skipReasonProvisionerListRunners = "provisioner_list_runners"
@ -380,6 +381,27 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) {
 		m.recorder.IncReconcileSkips(skipReasonProvisionerListPairs)
 		return
 	}
+
+	// 3b. Cleanup broken pairs (slots with only one of two pods). Without
+	// this, currentPairs would count broken slots as healthy and the
+	// provisioner would not re-create capacity to replace them. The
+	// surviving orphan pod is deleted; the next adjustPairs step will
+	// create a fresh full pair to fill the freed slot in this same cycle.
+	brokenSuccess, brokenFailed, brokenSlots := m.placeholders.CleanupBroken(ctx, pairs)
+	for _, slotID := range brokenSlots {
+		delete(pairs, slotID)
+	}
+	for i := 0; i < brokenSuccess; i++ {
+		m.recorder.IncPairDeletes(deleteReasonBroken, resultSuccess)
+	}
+	for i := 0; i < brokenFailed; i++ {
+		m.recorder.IncPairDeletes(deleteReasonBroken, resultError)
+	}
+	if brokenSuccess > 0 || brokenFailed > 0 {
+		m.logger.Info("cleaned up broken placeholder pairs",
+			"success", brokenSuccess, "failed", brokenFailed)
+	}
+
 	currentPairs := len(pairs)
 	m.recorder.SetPairs(currentPairs)
 	m.emitPlaceholderPodPhases(pairs)
--- a/cmd/ghalistener/capacity/monitor_test.go
+++ b/cmd/ghalistener/capacity/monitor_test.go
@ -459,6 +459,155 @@ func TestRetryWithBackoff_RespectsContextCancellation(t *testing.T) {
 	assert.ErrorIs(t, err, context.Canceled)
 }

+// TestProvisioner_BrokenPair_TriggersRecreation simulates a broken slot
+// (only the runner pod present, workflow pod has been deleted/evicted) and
+// asserts that the provisioner deletes the orphan pod, excludes the slot
+// from currentPairs in the SAME cycle, and creates a fresh full pair to
+// fill the freed slot — restoring the pre-warmed capacity.
+//
+// Without CleanupBroken, the slot would count as healthy and the provisioner
+// would think it was already at desired, leaving capacity permanently low.
+func TestProvisioner_BrokenPair_TriggersRecreation(t *testing.T) {
+	cfg := Config{
+		ProactiveCapacity:  5,
+		MaxRunners:         20,
+		PlaceholderTimeout: 5 * time.Minute,
+	}
+	rec := newFakeCapacityRecorder()
+	m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec)
+	ctx := context.Background()
+
+	// Initial reconcile creates 5 healthy pairs (10 pods).
+	m.reconcileProvisioning(ctx)
+	assert.Equal(t, 10, countPods(t, cs, "test-ns"),
+		"5 healthy pairs after first reconcile")
+
+	// Break one slot by deleting just its workflow pod (simulating
+	// eviction/crash of the workflow placeholder).
+	pairs, err := m.placeholders.ListPairs(ctx)
+	require.NoError(t, err)
+	require.Len(t, pairs, 5)
+	var brokenSlotID string
+	for slotID := range pairs {
+		brokenSlotID = slotID
+		break
+	}
+	wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," +
+			labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
+	})
+	require.NoError(t, err)
+	require.Len(t, wfPods.Items, 1)
+	require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
+		ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
+	))
+
+	// 4 pairs healthy + 1 surviving runner pod = 9 pods.
+	assert.Equal(t, 9, countPods(t, cs, "test-ns"),
+		"after break: 4 healthy pairs + 1 orphan runner pod")
+
+	// Run provisioning again. The broken slot should be detected,
+	// its surviving runner pod deleted, and a fresh full pair created
+	// in the SAME cycle (because CleanupBroken updates pairs and
+	// currentPairs in-place before adjustPairs runs).
+	m.reconcileProvisioning(ctx)
+
+	// End state: 5 healthy pairs (= 10 pods).
+	assert.Equal(t, 10, countPods(t, cs, "test-ns"),
+		"5 healthy pairs restored in the same cycle")
+
+	// The original broken slot must no longer exist (it was deleted).
+	finalPairs, err := m.placeholders.ListPairs(ctx)
+	require.NoError(t, err)
+	assert.Len(t, finalPairs, 5, "exactly 5 pairs in steady state")
+	assert.NotContains(t, finalPairs, brokenSlotID,
+		"broken slot must be gone (replaced by a fresh slot)")
+	for slotID, pair := range finalPairs {
+		assert.NotNilf(t, pair.RunnerPod, "slot %s must have runner pod", slotID)
+		assert.NotNilf(t, pair.WorkflowPod, "slot %s must have workflow pod", slotID)
+	}
+
+	// Metrics: one broken-success delete recorded, plus one new pair create.
+	rec.mu.Lock()
+	defer rec.mu.Unlock()
+	assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess],
+		"one broken delete recorded as success")
+	assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError],
+		"no broken delete failures")
+	// Pair creates: 5 (initial) + 1 (replacement). Cycles run twice.
+	assert.Equal(t, 6, rec.incPairCreatesCalls[resultSuccess],
+		"5 initial creates + 1 replacement create")
+}
+
+// TestProvisioner_BrokenPair_DeleteFailure simulates a DeletePair
+// failure during broken-pair cleanup. The cycle must not crash, the
+// slot must still be excluded from currentPairs (so the next cycle
+// re-detects it), and a broken-error metric must be recorded.
+func TestProvisioner_BrokenPair_DeleteFailure(t *testing.T) {
+	cfg := Config{
+		ProactiveCapacity:  3,
+		MaxRunners:         20,
+		PlaceholderTimeout: 5 * time.Minute,
+	}
+	rec := newFakeCapacityRecorder()
+	m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec)
+	ctx := context.Background()
+
+	// Initial reconcile creates 3 healthy pairs.
+	m.reconcileProvisioning(ctx)
+	require.Equal(t, 6, countPods(t, cs, "test-ns"))
+
+	// Break one slot.
+	pairs, err := m.placeholders.ListPairs(ctx)
+	require.NoError(t, err)
+	var brokenSlotID string
+	for slotID := range pairs {
+		brokenSlotID = slotID
+		break
+	}
+	wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," +
+			labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
+	})
+	require.NoError(t, err)
+	require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
+		ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
+	))
+
+	// Inject a delete-collection failure that targets only the broken
+	// slot (so the second-cycle replacement-create + the orphan-runner
+	// cleanup the next cycle does aren't blocked by the same reactor).
+	cs.PrependReactor("delete-collection", "pods",
+		func(action k8stesting.Action) (bool, runtime.Object, error) {
+			dca := action.(k8stesting.DeleteCollectionActionImpl)
+			sel := dca.GetListOptions().LabelSelector
+			if strings.Contains(sel, labelPlaceholderID+"="+brokenSlotID) {
+				return true, nil, fmt.Errorf("simulated delete failure")
+			}
+			return false, nil, nil
+		},
+	)
+
+	// Run the provisioner — must not crash even though delete fails.
+	assert.NotPanics(t, func() { m.reconcileProvisioning(ctx) })
+
+	// The broken slot must still have been excluded from currentPairs,
+	// so the provisioner created a replacement pair: total existing pairs
+	// = 4 (3 originals minus 1 still-broken + 1 new replacement).
+	pairs, err = m.placeholders.ListPairs(ctx)
+	require.NoError(t, err)
+	assert.Len(t, pairs, 4,
+		"4 pairs total: 2 untouched healthy + 1 still-broken (delete failed) + 1 new replacement")
+
+	// Metrics: one broken-error delete recorded.
+	rec.mu.Lock()
+	defer rec.mu.Unlock()
+	assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess],
+		"delete failed -> no broken success")
+	assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError],
+		"one broken delete failure recorded")
+}
+
 // ---- capacity recorder wiring tests ----
 //
 // These tests assert that the monitor calls the right CapacityRecorder
--- a/cmd/ghalistener/capacity/placeholder.go
+++ b/cmd/ghalistener/capacity/placeholder.go
@ -164,6 +164,50 @@ func (pm *PlaceholderManager) CleanupTimedOut(ctx context.Context) (int, int, er
 	return deleted, failed, nil
 }

+// CleanupBroken deletes pairs where only one of the two placeholder pods
+// exists (the other was evicted, deleted, or never landed). The surviving
+// orphan pod is deleted via DeletePair so the next reconcile cycle can
+// create a fresh full pair via the normal adjustPairs path.
+//
+// A broken pair cannot serve work — the missing pod means the slot reserves
+// a node but no workload can land. Without this cleanup, currentPairs would
+// count the broken slot as healthy and the provisioner would not re-create
+// capacity to replace it.
+//
+// Operates on the supplied pairs map (already fetched by the caller via
+// ListPairs) — no fresh List call. Returns (successful deletes, failed
+// deletes, deleted slot IDs). The caller should remove the returned slot
+// IDs from its local pairs map so the broken slots are excluded from
+// currentPairs in the same reconcile cycle (even on delete failure —
+// counting a broken slot as healthy here would mask the broken state;
+// the next cycle will re-detect any leftover state and retry).
+func (pm *PlaceholderManager) CleanupBroken(
+	ctx context.Context,
+	pairs map[string]*PlaceholderPair,
+) (int, int, []string) {
+	var deletedSlots []string
+	deleted, failed := 0, 0
+	for slotID, pair := range pairs {
+		if pair.RunnerPod != nil && pair.WorkflowPod != nil {
+			continue
+		}
+		if err := pm.DeletePair(ctx, slotID); err != nil {
+			pm.logger.Warn("failed to delete broken pair",
+				"slotID", slotID,
+				"hasRunner", pair.RunnerPod != nil,
+				"hasWorkflow", pair.WorkflowPod != nil,
+				"error", err,
+			)
+			failed++
+			deletedSlots = append(deletedSlots, slotID)
+			continue
+		}
+		deleted++
+		deletedSlots = append(deletedSlots, slotID)
+	}
+	return deleted, failed, deletedSlots
+}
+
 // CleanupOrphans deletes placeholder pods and anchor ConfigMaps from
 // previous listener instances of the SAME scale set (different
 // listener-pod label value but same scale-set-name label).
--- a/cmd/ghalistener/capacity/placeholder_test.go
+++ b/cmd/ghalistener/capacity/placeholder_test.go
@ -230,6 +230,142 @@ func TestCleanupTimedOut(t *testing.T) {
 	assert.Contains(t, pairs, "slot-new")
 }

+// TestCleanupBroken_DeletesSlotWithOnlyRunner verifies that a slot
+// missing its workflow pod is identified as broken, deleted, and
+// returned in the deleted slot list.
+func TestCleanupBroken_DeletesSlotWithOnlyRunner(t *testing.T) {
+	pm, cs := newTestPM(t, Config{})
+	ctx := context.Background()
+
+	// Create a healthy pair, then delete just the workflow pod to leave
+	// a broken slot with only the runner pod remaining.
+	require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
+	pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=broken-slot," +
+			labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
+	})
+	require.NoError(t, err)
+	require.Len(t, pods.Items, 1, "workflow pod must exist before delete")
+	require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
+		ctx, pods.Items[0].Name, metav1.DeleteOptions{},
+	))
+
+	pairs, err := pm.ListPairs(ctx)
+	require.NoError(t, err)
+	require.Len(t, pairs, 1)
+	require.NotNil(t, pairs["broken-slot"].RunnerPod)
+	require.Nil(t, pairs["broken-slot"].WorkflowPod)
+
+	deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
+	assert.Equal(t, 1, deleted, "one broken slot deleted")
+	assert.Equal(t, 0, failed, "no failures")
+	assert.Equal(t, []string{"broken-slot"}, deletedSlots,
+		"deleted slot ID returned")
+
+	// All pods for that slot must be gone.
+	pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=broken-slot",
+	})
+	require.NoError(t, err)
+	assert.Empty(t, pods.Items, "no surviving pods for broken slot")
+}
+
+// TestCleanupBroken_DeletesSlotWithOnlyWorkflow is the symmetric case:
+// a slot missing its runner pod must also be deleted.
+func TestCleanupBroken_DeletesSlotWithOnlyWorkflow(t *testing.T) {
+	pm, cs := newTestPM(t, Config{})
+	ctx := context.Background()
+
+	require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
+	pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=broken-slot," +
+			labelPlaceholderRole + "=" + rolePlaceholderRunner,
+	})
+	require.NoError(t, err)
+	require.Len(t, pods.Items, 1, "runner pod must exist before delete")
+	require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
+		ctx, pods.Items[0].Name, metav1.DeleteOptions{},
+	))
+
+	pairs, err := pm.ListPairs(ctx)
+	require.NoError(t, err)
+	require.Len(t, pairs, 1)
+	require.Nil(t, pairs["broken-slot"].RunnerPod)
+	require.NotNil(t, pairs["broken-slot"].WorkflowPod)
+
+	deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
+	assert.Equal(t, 1, deleted, "one broken slot deleted")
+	assert.Equal(t, 0, failed, "no failures")
+	assert.Equal(t, []string{"broken-slot"}, deletedSlots)
+
+	pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=broken-slot",
+	})
+	require.NoError(t, err)
+	assert.Empty(t, pods.Items, "no surviving pods for broken slot")
+}
+
+// TestCleanupBroken_LeavesHealthyPairs verifies that fully populated
+// pairs (both runner + workflow pods present) are NOT deleted.
+func TestCleanupBroken_LeavesHealthyPairs(t *testing.T) {
+	pm, cs := newTestPM(t, Config{})
+	ctx := context.Background()
+
+	require.NoError(t, pm.CreatePair(ctx, "slot-a"))
+	require.NoError(t, pm.CreatePair(ctx, "slot-b"))
+
+	pairs, err := pm.ListPairs(ctx)
+	require.NoError(t, err)
+	require.Len(t, pairs, 2)
+
+	deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
+	assert.Equal(t, 0, deleted, "no pairs deleted")
+	assert.Equal(t, 0, failed, "no failures")
+	assert.Empty(t, deletedSlots, "no slot IDs returned")
+
+	// All 4 pods (2 pairs × 2 pods) must still exist.
+	pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{})
+	require.NoError(t, err)
+	assert.Len(t, pods.Items, 4, "all healthy pair pods preserved")
+}
+
+// TestCleanupBroken_DeleteFailureStillExcludesSlot verifies that a
+// DeletePair error still causes the slot ID to be returned (so the
+// caller excludes it from currentPairs) and is recorded as failed.
+func TestCleanupBroken_DeleteFailureStillExcludesSlot(t *testing.T) {
+	pm, cs := newTestPM(t, Config{})
+	ctx := context.Background()
+
+	// Build a broken pair (only runner present).
+	require.NoError(t, pm.CreatePair(ctx, "broken-slot"))
+	wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{
+		LabelSelector: labelPlaceholderID + "=broken-slot," +
+			labelPlaceholderRole + "=" + rolePlaceholderWorkflow,
+	})
+	require.NoError(t, err)
+	require.NoError(t, cs.CoreV1().Pods("test-ns").Delete(
+		ctx, wfPods.Items[0].Name, metav1.DeleteOptions{},
+	))
+
+	pairs, err := pm.ListPairs(ctx)
+	require.NoError(t, err)
+	require.Len(t, pairs, 1)
+
+	// Inject a delete-collection failure for ALL subsequent calls.
+	cs.PrependReactor("delete-collection", "pods",
+		func(action k8stesting.Action) (bool, runtime.Object, error) {
+			return true, nil, errors.New("simulated delete failure")
+		},
+	)
+
+	deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs)
+	assert.Equal(t, 0, deleted, "delete failed -> 0 successful deletes")
+	assert.Equal(t, 1, failed, "one failed delete")
+	assert.Equal(t, []string{"broken-slot"}, deletedSlots,
+		"slot must still be returned so caller excludes it from currentPairs "+
+			"(counting it as healthy would mask the broken state)")
+}
+
 func TestCleanupOrphans(t *testing.T) {
 	cfg := Config{Namespace: "test-ns", ScaleSetName: "sset"}
 	cs := newFakeClientset()
--- a/cmd/ghalistener/metrics/metrics_test.go
+++ b/cmd/ghalistener/metrics/metrics_test.go
@ -428,7 +428,7 @@ func TestExporterCapacityRecorder_Counters(t *testing.T) {
 	assert.Equal(t, 2.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "success"))
 	assert.Equal(t, 1.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "error"))

-	for _, reason := range []string{"timeout", "orphan", "excess"} {
+	for _, reason := range []string{"timeout", "orphan", "excess", "broken"} {
 		e.IncPairDeletes(reason, "success")
 		e.IncPairDeletes(reason, "error")
 		e.IncPairDeletes(reason, "success")