diff --git a/charts/gha-runner-scale-set-controller/Chart.yaml b/charts/gha-runner-scale-set-controller/Chart.yaml index 4cc918b3..cf59e01a 100644 --- a/charts/gha-runner-scale-set-controller/Chart.yaml +++ b/charts/gha-runner-scale-set-controller/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.14.1-jeanschmidt.7 +version: 0.14.1-jeanschmidt.8 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.14.1-jeanschmidt.7" +appVersion: "0.14.1-jeanschmidt.8" home: https://github.com/actions/actions-runner-controller diff --git a/charts/gha-runner-scale-set/Chart.yaml b/charts/gha-runner-scale-set/Chart.yaml index 522a7317..3060a4ac 100644 --- a/charts/gha-runner-scale-set/Chart.yaml +++ b/charts/gha-runner-scale-set/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.14.1-jeanschmidt.7 +version: 0.14.1-jeanschmidt.8 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.14.1-jeanschmidt.7" +appVersion: "0.14.1-jeanschmidt.8" home: https://github.com/actions/actions-runner-controller diff --git a/cmd/ghalistener/capacity/monitor.go b/cmd/ghalistener/capacity/monitor.go index 5e11baa0..084abbf6 100644 --- a/cmd/ghalistener/capacity/monitor.go +++ b/cmd/ghalistener/capacity/monitor.go @@ -34,6 +34,7 @@ const ( deleteReasonOrphan = "orphan" deleteReasonTimeout = "timeout" deleteReasonExcess = "excess" + deleteReasonBroken = "broken" skipReasonProvisionerListPairs = "provisioner_list_pairs" skipReasonProvisionerListRunners = "provisioner_list_runners" @@ -380,6 +381,27 @@ func (m *Monitor) reconcileProvisioning(ctx context.Context) { m.recorder.IncReconcileSkips(skipReasonProvisionerListPairs) return } + + // 3b. Cleanup broken pairs (slots with only one of two pods). Without + // this, currentPairs would count broken slots as healthy and the + // provisioner would not re-create capacity to replace them. The + // surviving orphan pod is deleted; the next adjustPairs step will + // create a fresh full pair to fill the freed slot in this same cycle. + brokenSuccess, brokenFailed, brokenSlots := m.placeholders.CleanupBroken(ctx, pairs) + for _, slotID := range brokenSlots { + delete(pairs, slotID) + } + for i := 0; i < brokenSuccess; i++ { + m.recorder.IncPairDeletes(deleteReasonBroken, resultSuccess) + } + for i := 0; i < brokenFailed; i++ { + m.recorder.IncPairDeletes(deleteReasonBroken, resultError) + } + if brokenSuccess > 0 || brokenFailed > 0 { + m.logger.Info("cleaned up broken placeholder pairs", + "success", brokenSuccess, "failed", brokenFailed) + } + currentPairs := len(pairs) m.recorder.SetPairs(currentPairs) m.emitPlaceholderPodPhases(pairs) diff --git a/cmd/ghalistener/capacity/monitor_test.go b/cmd/ghalistener/capacity/monitor_test.go index e758e303..28b7293e 100644 --- a/cmd/ghalistener/capacity/monitor_test.go +++ b/cmd/ghalistener/capacity/monitor_test.go @@ -459,6 +459,155 @@ func TestRetryWithBackoff_RespectsContextCancellation(t *testing.T) { assert.ErrorIs(t, err, context.Canceled) } +// TestProvisioner_BrokenPair_TriggersRecreation simulates a broken slot +// (only the runner pod present, workflow pod has been deleted/evicted) and +// asserts that the provisioner deletes the orphan pod, excludes the slot +// from currentPairs in the SAME cycle, and creates a fresh full pair to +// fill the freed slot — restoring the pre-warmed capacity. +// +// Without CleanupBroken, the slot would count as healthy and the provisioner +// would think it was already at desired, leaving capacity permanently low. +func TestProvisioner_BrokenPair_TriggersRecreation(t *testing.T) { + cfg := Config{ + ProactiveCapacity: 5, + MaxRunners: 20, + PlaceholderTimeout: 5 * time.Minute, + } + rec := newFakeCapacityRecorder() + m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec) + ctx := context.Background() + + // Initial reconcile creates 5 healthy pairs (10 pods). + m.reconcileProvisioning(ctx) + assert.Equal(t, 10, countPods(t, cs, "test-ns"), + "5 healthy pairs after first reconcile") + + // Break one slot by deleting just its workflow pod (simulating + // eviction/crash of the workflow placeholder). + pairs, err := m.placeholders.ListPairs(ctx) + require.NoError(t, err) + require.Len(t, pairs, 5) + var brokenSlotID string + for slotID := range pairs { + brokenSlotID = slotID + break + } + wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," + + labelPlaceholderRole + "=" + rolePlaceholderWorkflow, + }) + require.NoError(t, err) + require.Len(t, wfPods.Items, 1) + require.NoError(t, cs.CoreV1().Pods("test-ns").Delete( + ctx, wfPods.Items[0].Name, metav1.DeleteOptions{}, + )) + + // 4 pairs healthy + 1 surviving runner pod = 9 pods. + assert.Equal(t, 9, countPods(t, cs, "test-ns"), + "after break: 4 healthy pairs + 1 orphan runner pod") + + // Run provisioning again. The broken slot should be detected, + // its surviving runner pod deleted, and a fresh full pair created + // in the SAME cycle (because CleanupBroken updates pairs and + // currentPairs in-place before adjustPairs runs). + m.reconcileProvisioning(ctx) + + // End state: 5 healthy pairs (= 10 pods). + assert.Equal(t, 10, countPods(t, cs, "test-ns"), + "5 healthy pairs restored in the same cycle") + + // The original broken slot must no longer exist (it was deleted). + finalPairs, err := m.placeholders.ListPairs(ctx) + require.NoError(t, err) + assert.Len(t, finalPairs, 5, "exactly 5 pairs in steady state") + assert.NotContains(t, finalPairs, brokenSlotID, + "broken slot must be gone (replaced by a fresh slot)") + for slotID, pair := range finalPairs { + assert.NotNilf(t, pair.RunnerPod, "slot %s must have runner pod", slotID) + assert.NotNilf(t, pair.WorkflowPod, "slot %s must have workflow pod", slotID) + } + + // Metrics: one broken-success delete recorded, plus one new pair create. + rec.mu.Lock() + defer rec.mu.Unlock() + assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess], + "one broken delete recorded as success") + assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError], + "no broken delete failures") + // Pair creates: 5 (initial) + 1 (replacement). Cycles run twice. + assert.Equal(t, 6, rec.incPairCreatesCalls[resultSuccess], + "5 initial creates + 1 replacement create") +} + +// TestProvisioner_BrokenPair_DeleteFailure simulates a DeletePair +// failure during broken-pair cleanup. The cycle must not crash, the +// slot must still be excluded from currentPairs (so the next cycle +// re-detects it), and a broken-error metric must be recorded. +func TestProvisioner_BrokenPair_DeleteFailure(t *testing.T) { + cfg := Config{ + ProactiveCapacity: 3, + MaxRunners: 20, + PlaceholderTimeout: 5 * time.Minute, + } + rec := newFakeCapacityRecorder() + m, cs, _ := newTestMonitorWithRecorder(t, cfg, rec) + ctx := context.Background() + + // Initial reconcile creates 3 healthy pairs. + m.reconcileProvisioning(ctx) + require.Equal(t, 6, countPods(t, cs, "test-ns")) + + // Break one slot. + pairs, err := m.placeholders.ListPairs(ctx) + require.NoError(t, err) + var brokenSlotID string + for slotID := range pairs { + brokenSlotID = slotID + break + } + wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=" + brokenSlotID + "," + + labelPlaceholderRole + "=" + rolePlaceholderWorkflow, + }) + require.NoError(t, err) + require.NoError(t, cs.CoreV1().Pods("test-ns").Delete( + ctx, wfPods.Items[0].Name, metav1.DeleteOptions{}, + )) + + // Inject a delete-collection failure that targets only the broken + // slot (so the second-cycle replacement-create + the orphan-runner + // cleanup the next cycle does aren't blocked by the same reactor). + cs.PrependReactor("delete-collection", "pods", + func(action k8stesting.Action) (bool, runtime.Object, error) { + dca := action.(k8stesting.DeleteCollectionActionImpl) + sel := dca.GetListOptions().LabelSelector + if strings.Contains(sel, labelPlaceholderID+"="+brokenSlotID) { + return true, nil, fmt.Errorf("simulated delete failure") + } + return false, nil, nil + }, + ) + + // Run the provisioner — must not crash even though delete fails. + assert.NotPanics(t, func() { m.reconcileProvisioning(ctx) }) + + // The broken slot must still have been excluded from currentPairs, + // so the provisioner created a replacement pair: total existing pairs + // = 4 (3 originals minus 1 still-broken + 1 new replacement). + pairs, err = m.placeholders.ListPairs(ctx) + require.NoError(t, err) + assert.Len(t, pairs, 4, + "4 pairs total: 2 untouched healthy + 1 still-broken (delete failed) + 1 new replacement") + + // Metrics: one broken-error delete recorded. + rec.mu.Lock() + defer rec.mu.Unlock() + assert.Equal(t, 0, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultSuccess], + "delete failed -> no broken success") + assert.Equal(t, 1, rec.incPairDeletesCalls[deleteReasonBroken+":"+resultError], + "one broken delete failure recorded") +} + // ---- capacity recorder wiring tests ---- // // These tests assert that the monitor calls the right CapacityRecorder diff --git a/cmd/ghalistener/capacity/placeholder.go b/cmd/ghalistener/capacity/placeholder.go index 64744c92..363ed059 100644 --- a/cmd/ghalistener/capacity/placeholder.go +++ b/cmd/ghalistener/capacity/placeholder.go @@ -164,6 +164,50 @@ func (pm *PlaceholderManager) CleanupTimedOut(ctx context.Context) (int, int, er return deleted, failed, nil } +// CleanupBroken deletes pairs where only one of the two placeholder pods +// exists (the other was evicted, deleted, or never landed). The surviving +// orphan pod is deleted via DeletePair so the next reconcile cycle can +// create a fresh full pair via the normal adjustPairs path. +// +// A broken pair cannot serve work — the missing pod means the slot reserves +// a node but no workload can land. Without this cleanup, currentPairs would +// count the broken slot as healthy and the provisioner would not re-create +// capacity to replace it. +// +// Operates on the supplied pairs map (already fetched by the caller via +// ListPairs) — no fresh List call. Returns (successful deletes, failed +// deletes, deleted slot IDs). The caller should remove the returned slot +// IDs from its local pairs map so the broken slots are excluded from +// currentPairs in the same reconcile cycle (even on delete failure — +// counting a broken slot as healthy here would mask the broken state; +// the next cycle will re-detect any leftover state and retry). +func (pm *PlaceholderManager) CleanupBroken( + ctx context.Context, + pairs map[string]*PlaceholderPair, +) (int, int, []string) { + var deletedSlots []string + deleted, failed := 0, 0 + for slotID, pair := range pairs { + if pair.RunnerPod != nil && pair.WorkflowPod != nil { + continue + } + if err := pm.DeletePair(ctx, slotID); err != nil { + pm.logger.Warn("failed to delete broken pair", + "slotID", slotID, + "hasRunner", pair.RunnerPod != nil, + "hasWorkflow", pair.WorkflowPod != nil, + "error", err, + ) + failed++ + deletedSlots = append(deletedSlots, slotID) + continue + } + deleted++ + deletedSlots = append(deletedSlots, slotID) + } + return deleted, failed, deletedSlots +} + // CleanupOrphans deletes placeholder pods and anchor ConfigMaps from // previous listener instances of the SAME scale set (different // listener-pod label value but same scale-set-name label). diff --git a/cmd/ghalistener/capacity/placeholder_test.go b/cmd/ghalistener/capacity/placeholder_test.go index 0289fcc2..162e2632 100644 --- a/cmd/ghalistener/capacity/placeholder_test.go +++ b/cmd/ghalistener/capacity/placeholder_test.go @@ -230,6 +230,142 @@ func TestCleanupTimedOut(t *testing.T) { assert.Contains(t, pairs, "slot-new") } +// TestCleanupBroken_DeletesSlotWithOnlyRunner verifies that a slot +// missing its workflow pod is identified as broken, deleted, and +// returned in the deleted slot list. +func TestCleanupBroken_DeletesSlotWithOnlyRunner(t *testing.T) { + pm, cs := newTestPM(t, Config{}) + ctx := context.Background() + + // Create a healthy pair, then delete just the workflow pod to leave + // a broken slot with only the runner pod remaining. + require.NoError(t, pm.CreatePair(ctx, "broken-slot")) + pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=broken-slot," + + labelPlaceholderRole + "=" + rolePlaceholderWorkflow, + }) + require.NoError(t, err) + require.Len(t, pods.Items, 1, "workflow pod must exist before delete") + require.NoError(t, cs.CoreV1().Pods("test-ns").Delete( + ctx, pods.Items[0].Name, metav1.DeleteOptions{}, + )) + + pairs, err := pm.ListPairs(ctx) + require.NoError(t, err) + require.Len(t, pairs, 1) + require.NotNil(t, pairs["broken-slot"].RunnerPod) + require.Nil(t, pairs["broken-slot"].WorkflowPod) + + deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs) + assert.Equal(t, 1, deleted, "one broken slot deleted") + assert.Equal(t, 0, failed, "no failures") + assert.Equal(t, []string{"broken-slot"}, deletedSlots, + "deleted slot ID returned") + + // All pods for that slot must be gone. + pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=broken-slot", + }) + require.NoError(t, err) + assert.Empty(t, pods.Items, "no surviving pods for broken slot") +} + +// TestCleanupBroken_DeletesSlotWithOnlyWorkflow is the symmetric case: +// a slot missing its runner pod must also be deleted. +func TestCleanupBroken_DeletesSlotWithOnlyWorkflow(t *testing.T) { + pm, cs := newTestPM(t, Config{}) + ctx := context.Background() + + require.NoError(t, pm.CreatePair(ctx, "broken-slot")) + pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=broken-slot," + + labelPlaceholderRole + "=" + rolePlaceholderRunner, + }) + require.NoError(t, err) + require.Len(t, pods.Items, 1, "runner pod must exist before delete") + require.NoError(t, cs.CoreV1().Pods("test-ns").Delete( + ctx, pods.Items[0].Name, metav1.DeleteOptions{}, + )) + + pairs, err := pm.ListPairs(ctx) + require.NoError(t, err) + require.Len(t, pairs, 1) + require.Nil(t, pairs["broken-slot"].RunnerPod) + require.NotNil(t, pairs["broken-slot"].WorkflowPod) + + deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs) + assert.Equal(t, 1, deleted, "one broken slot deleted") + assert.Equal(t, 0, failed, "no failures") + assert.Equal(t, []string{"broken-slot"}, deletedSlots) + + pods, err = cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=broken-slot", + }) + require.NoError(t, err) + assert.Empty(t, pods.Items, "no surviving pods for broken slot") +} + +// TestCleanupBroken_LeavesHealthyPairs verifies that fully populated +// pairs (both runner + workflow pods present) are NOT deleted. +func TestCleanupBroken_LeavesHealthyPairs(t *testing.T) { + pm, cs := newTestPM(t, Config{}) + ctx := context.Background() + + require.NoError(t, pm.CreatePair(ctx, "slot-a")) + require.NoError(t, pm.CreatePair(ctx, "slot-b")) + + pairs, err := pm.ListPairs(ctx) + require.NoError(t, err) + require.Len(t, pairs, 2) + + deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs) + assert.Equal(t, 0, deleted, "no pairs deleted") + assert.Equal(t, 0, failed, "no failures") + assert.Empty(t, deletedSlots, "no slot IDs returned") + + // All 4 pods (2 pairs × 2 pods) must still exist. + pods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{}) + require.NoError(t, err) + assert.Len(t, pods.Items, 4, "all healthy pair pods preserved") +} + +// TestCleanupBroken_DeleteFailureStillExcludesSlot verifies that a +// DeletePair error still causes the slot ID to be returned (so the +// caller excludes it from currentPairs) and is recorded as failed. +func TestCleanupBroken_DeleteFailureStillExcludesSlot(t *testing.T) { + pm, cs := newTestPM(t, Config{}) + ctx := context.Background() + + // Build a broken pair (only runner present). + require.NoError(t, pm.CreatePair(ctx, "broken-slot")) + wfPods, err := cs.CoreV1().Pods("test-ns").List(ctx, metav1.ListOptions{ + LabelSelector: labelPlaceholderID + "=broken-slot," + + labelPlaceholderRole + "=" + rolePlaceholderWorkflow, + }) + require.NoError(t, err) + require.NoError(t, cs.CoreV1().Pods("test-ns").Delete( + ctx, wfPods.Items[0].Name, metav1.DeleteOptions{}, + )) + + pairs, err := pm.ListPairs(ctx) + require.NoError(t, err) + require.Len(t, pairs, 1) + + // Inject a delete-collection failure for ALL subsequent calls. + cs.PrependReactor("delete-collection", "pods", + func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, errors.New("simulated delete failure") + }, + ) + + deleted, failed, deletedSlots := pm.CleanupBroken(ctx, pairs) + assert.Equal(t, 0, deleted, "delete failed -> 0 successful deletes") + assert.Equal(t, 1, failed, "one failed delete") + assert.Equal(t, []string{"broken-slot"}, deletedSlots, + "slot must still be returned so caller excludes it from currentPairs "+ + "(counting it as healthy would mask the broken state)") +} + func TestCleanupOrphans(t *testing.T) { cfg := Config{Namespace: "test-ns", ScaleSetName: "sset"} cs := newFakeClientset() diff --git a/cmd/ghalistener/metrics/metrics_test.go b/cmd/ghalistener/metrics/metrics_test.go index d6ee81a0..a6fd3852 100644 --- a/cmd/ghalistener/metrics/metrics_test.go +++ b/cmd/ghalistener/metrics/metrics_test.go @@ -428,7 +428,7 @@ func TestExporterCapacityRecorder_Counters(t *testing.T) { assert.Equal(t, 2.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "success")) assert.Equal(t, 1.0, counterValue(t, e, MetricCapacityPairCreatesTotal, labelKeyResult, "error")) - for _, reason := range []string{"timeout", "orphan", "excess"} { + for _, reason := range []string{"timeout", "orphan", "excess", "broken"} { e.IncPairDeletes(reason, "success") e.IncPairDeletes(reason, "error") e.IncPairDeletes(reason, "success")