From 4714643523841df23f9ec3580e2a25bd8189c655 Mon Sep 17 00:00:00 2001
From: Jean Schmidt <contato@jschmidt.me>
Date: Tue, 5 May 2026 11:42:02 -0700
Subject: [PATCH] Require runner-class in workflow affinity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Promote runner-class from preferred (weight 100) to required node
  affinity term, matching the actual workflow pod's scheduling
- Use DoesNotExist operator when RunnerClass is unset
- AND-combine runner-class and GPU label in same matchExpressions block
- Add table-driven tests for all runner-class + GPU combinations
- Bump chart versions to jeanschmidt.9

A preferred runner-class term let placeholders land on non-matching
nodes where the real workflow pod (which uses a required term) could
never follow — wasting the reservation. Making it required ensures
placeholders only occupy nodes the actual pod can schedule onto.

Signed-off-by: Jean Schmidt <contato@jschmidt.me>
---
 .../Chart.yaml                                |   4 +-
 charts/gha-runner-scale-set/Chart.yaml        |   4 +-
 cmd/ghalistener/capacity/placeholder.go       |  83 +++++----
 cmd/ghalistener/capacity/pod_spec_test.go     | 164 +++++++++++++++---
 4 files changed, 185 insertions(+), 70 deletions(-)

diff --git a/charts/gha-runner-scale-set-controller/Chart.yaml b/charts/gha-runner-scale-set-controller/Chart.yaml
index cf59e01a..60479700 100644
--- a/charts/gha-runner-scale-set-controller/Chart.yaml
+++ b/charts/gha-runner-scale-set-controller/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.8
+version: 0.14.1-jeanschmidt.9
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.8"
+appVersion: "0.14.1-jeanschmidt.9"
 
 home: https://github.com/actions/actions-runner-controller
 
diff --git a/charts/gha-runner-scale-set/Chart.yaml b/charts/gha-runner-scale-set/Chart.yaml
index 3060a4ac..2a9003a2 100644
--- a/charts/gha-runner-scale-set/Chart.yaml
+++ b/charts/gha-runner-scale-set/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.14.1-jeanschmidt.8
+version: 0.14.1-jeanschmidt.9
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.14.1-jeanschmidt.8"
+appVersion: "0.14.1-jeanschmidt.9"
 
 home: https://github.com/actions/actions-runner-controller
 
diff --git a/cmd/ghalistener/capacity/placeholder.go b/cmd/ghalistener/capacity/placeholder.go
index 363ed059..c87f9f4c 100644
--- a/cmd/ghalistener/capacity/placeholder.go
+++ b/cmd/ghalistener/capacity/placeholder.go
@@ -470,27 +470,38 @@ func (pm *PlaceholderManager) buildWorkflowPlaceholder(slotID string) *corev1.Po
 	return pod
 }
 
-// buildWorkflowAffinity builds the soft node affinity for workflow placeholders,
+// buildWorkflowAffinity builds the node affinity for workflow placeholders,
 // mirroring the job-pod template:
+//   - required runner-class match (In when RunnerClass set, DoesNotExist when unset),
+//     so the placeholder lands where the actual workflow pod can follow it
+//   - required GPU node label (when WorkflowGPU > 0), AND-combined with runner-class
 //   - weight 50 preference for node-fleet + workload-type
-//   - optional weight 100 preference for runner-class (when set)
-//   - optional GPU node selector requirement (when WorkflowGPU > 0)
 func (pm *PlaceholderManager) buildWorkflowAffinity() *corev1.Affinity {
-	preferredTerms := []corev1.PreferredSchedulingTerm{}
-
-	// Optional runner-class preference (template uses higher weight for class match).
+	// Runner-class must be REQUIRED (not preferred): the actual workflow pod
+	// uses a required term here, so a placeholder landing on a non-matching
+	// node cannot be followed by the real pod and the reservation is wasted.
+	var runnerClassReq corev1.NodeSelectorRequirement
 	if pm.config.RunnerClass != "" {
-		preferredTerms = append(preferredTerms, corev1.PreferredSchedulingTerm{
-			Weight: 100,
-			Preference: corev1.NodeSelectorTerm{
-				MatchExpressions: []corev1.NodeSelectorRequirement{
-					{
-						Key:      "osdc.io/runner-class",
-						Operator: corev1.NodeSelectorOpIn,
-						Values:   []string{pm.config.RunnerClass},
-					},
-				},
-			},
+		runnerClassReq = corev1.NodeSelectorRequirement{
+			Key:      "osdc.io/runner-class",
+			Operator: corev1.NodeSelectorOpIn,
+			Values:   []string{pm.config.RunnerClass},
+		}
+	} else {
+		runnerClassReq = corev1.NodeSelectorRequirement{
+			Key:      "osdc.io/runner-class",
+			Operator: corev1.NodeSelectorOpDoesNotExist,
+		}
+	}
+
+	// Required terms: runner-class always; GPU label when WorkflowGPU > 0.
+	// Both go in the SAME matchExpressions block so they AND together.
+	requiredExprs := []corev1.NodeSelectorRequirement{runnerClassReq}
+	if pm.config.WorkflowGPU > 0 {
+		requiredExprs = append(requiredExprs, corev1.NodeSelectorRequirement{
+			Key:      "nvidia.com/gpu",
+			Operator: corev1.NodeSelectorOpIn,
+			Values:   []string{"true"},
 		})
 	}
 
@@ -511,37 +522,25 @@ func (pm *PlaceholderManager) buildWorkflowAffinity() *corev1.Affinity {
 			},
 		}, fleetMatchExpressions...)
 	}
-	preferredTerms = append(preferredTerms, corev1.PreferredSchedulingTerm{
-		Weight: 50,
-		Preference: corev1.NodeSelectorTerm{
-			MatchExpressions: fleetMatchExpressions,
+	preferredTerms := []corev1.PreferredSchedulingTerm{
+		{
+			Weight: 50,
+			Preference: corev1.NodeSelectorTerm{
+				MatchExpressions: fleetMatchExpressions,
+			},
 		},
-	})
+	}
 
-	affinity := &corev1.Affinity{
+	return &corev1.Affinity{
 		NodeAffinity: &corev1.NodeAffinity{
+			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
+				NodeSelectorTerms: []corev1.NodeSelectorTerm{
+					{MatchExpressions: requiredExprs},
+				},
+			},
 			PreferredDuringSchedulingIgnoredDuringExecution: preferredTerms,
 		},
 	}
-
-	// Optional hard GPU node selector requirement.
-	if pm.config.WorkflowGPU > 0 {
-		affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{
-			NodeSelectorTerms: []corev1.NodeSelectorTerm{
-				{
-					MatchExpressions: []corev1.NodeSelectorRequirement{
-						{
-							Key:      "nvidia.com/gpu",
-							Operator: corev1.NodeSelectorOpIn,
-							Values:   []string{"true"},
-						},
-					},
-				},
-			},
-		}
-	}
-
-	return affinity
 }
 
 // sleepArg returns the sleep duration for placeholder containers as a
diff --git a/cmd/ghalistener/capacity/pod_spec_test.go b/cmd/ghalistener/capacity/pod_spec_test.go
index f677fa7f..94a6b65f 100644
--- a/cmd/ghalistener/capacity/pod_spec_test.go
+++ b/cmd/ghalistener/capacity/pod_spec_test.go
@@ -51,9 +51,10 @@ func TestRunnerPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
 
 // Workflow placeholder mirrors the job-pod.yaml ConfigMap template:
 // NO hard nodeSelector — uses preferredDuringScheduling node affinity
-// with weight 50 for node-fleet + workload-type. Tolerations:
-// instance-type Exists, optional node-fleet, optional GPU. NO
-// git-cache-not-ready (workflow waits for cache, doesn't tolerate the taint).
+// with weight 50 for node-fleet + workload-type, plus a REQUIRED runner-class
+// term (DoesNotExist when RunnerClass unset). Tolerations: instance-type Exists,
+// optional node-fleet, optional GPU. NO git-cache-not-ready (workflow waits
+// for cache, doesn't tolerate the taint).
 func TestWorkflowPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
 	pm, _ := newTestPM(t, Config{})
 	ctx := context.Background()
@@ -67,8 +68,18 @@ func TestWorkflowPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
 	require.NotNil(t, wf.Spec.Affinity)
 	require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
 	preferred := wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
-	require.NotEmpty(t, preferred)
-	assert.Equal(t, int32(50), preferred[len(preferred)-1].Weight)
+	require.Len(t, preferred, 1, "only fleet preference; runner-class lives in required terms")
+	assert.Equal(t, int32(50), preferred[0].Weight)
+
+	// Required: runner-class DoesNotExist (matches workflow pod's required term
+	// for non-class fleets).
+	required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+	require.NotNil(t, required)
+	require.Len(t, required.NodeSelectorTerms, 1)
+	exprs := required.NodeSelectorTerms[0].MatchExpressions
+	require.Len(t, exprs, 1)
+	assert.Equal(t, "osdc.io/runner-class", exprs[0].Key)
+	assert.Equal(t, corev1.NodeSelectorOpDoesNotExist, exprs[0].Operator)
 
 	// Without NodeFleet/GPU: only instance-type toleration. No git-cache.
 	assert.Len(t, wf.Spec.Tolerations, 1)
@@ -152,13 +163,31 @@ func TestWorkflowPlaceholder_WithGPU_WithFleet_WithRunnerClass(t *testing.T) {
 	// Workflow has no hard nodeSelector regardless of config.
 	assert.Nil(t, wf.Spec.NodeSelector)
 
-	// Affinity: required GPU node selector + preferred runner-class + preferred fleet.
+	// Affinity: required {runner-class In + GPU In} (AND) + preferred fleet.
 	require.NotNil(t, wf.Spec.Affinity)
 	require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
 	preferred := wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
-	assert.Len(t, preferred, 2, "runner-class preference + fleet preference")
-	require.NotNil(t, wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution,
-		"GPU workflow has required GPU node selector")
+	require.Len(t, preferred, 1, "only fleet preference; runner-class lives in required terms")
+	assert.Equal(t, int32(50), preferred[0].Weight)
+
+	required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+	require.NotNil(t, required, "GPU + runner-class required terms")
+	require.Len(t, required.NodeSelectorTerms, 1,
+		"both required exprs go in the SAME matchExpressions block (AND semantics)")
+	exprs := required.NodeSelectorTerms[0].MatchExpressions
+	require.Len(t, exprs, 2, "runner-class In + nvidia.com/gpu In, both in same block")
+	exprByKey := map[string]corev1.NodeSelectorRequirement{}
+	for _, e := range exprs {
+		exprByKey[e.Key] = e
+	}
+	rc, ok := exprByKey["osdc.io/runner-class"]
+	require.True(t, ok, "runner-class required expr must be present")
+	assert.Equal(t, corev1.NodeSelectorOpIn, rc.Operator)
+	assert.Equal(t, []string{"gpu-large"}, rc.Values)
+	gpu, ok := exprByKey["nvidia.com/gpu"]
+	require.True(t, ok, "GPU required expr must be present")
+	assert.Equal(t, corev1.NodeSelectorOpIn, gpu.Operator)
+	assert.Equal(t, []string{"true"}, gpu.Values)
 
 	// Tolerations: instance-type + node-fleet + GPU = 3.
 	assert.Len(t, wf.Spec.Tolerations, 3)
@@ -501,10 +530,104 @@ func TestRunnerPlaceholder_NeverIncludesRunnerClass(t *testing.T) {
 	}
 }
 
-// The workflow placeholder MUST still consume RunnerClass via its preferred
-// node affinity (weight 100). Only the runner placeholder drops runner-class —
-// the workflow pod's scheduling pattern is unchanged.
-func TestWorkflowPlaceholder_StillUsesRunnerClassInPreferredAffinity(t *testing.T) {
+// The workflow placeholder MUST express runner-class as a REQUIRED node
+// affinity term — not preferred. The actual workflow pod uses a required
+// term for runner-class (In when set, DoesNotExist when unset), so a
+// placeholder landing on a non-matching node cannot be followed by the
+// real pod and the reservation is wasted.
+//
+// When WorkflowGPU > 0, the GPU node-label requirement must be AND-combined
+// with the runner-class requirement in the SAME matchExpressions block.
+func TestWorkflowPlaceholder_RequiresRunnerClass(t *testing.T) {
+	cases := []struct {
+		name        string
+		runnerClass string
+		gpu         int
+		// expected runner-class expr
+		wantRCOp     corev1.NodeSelectorOperator
+		wantRCValues []string
+		// expected exprs count in the single matchExpressions block
+		wantExprCount int
+	}{
+		{
+			name:          "release without GPU",
+			runnerClass:   "release",
+			gpu:           0,
+			wantRCOp:      corev1.NodeSelectorOpIn,
+			wantRCValues:  []string{"release"},
+			wantExprCount: 1,
+		},
+		{
+			name:          "release with GPU",
+			runnerClass:   "release",
+			gpu:           1,
+			wantRCOp:      corev1.NodeSelectorOpIn,
+			wantRCValues:  []string{"release"},
+			wantExprCount: 2,
+		},
+		{
+			name:          "no runner-class without GPU",
+			runnerClass:   "",
+			gpu:           0,
+			wantRCOp:      corev1.NodeSelectorOpDoesNotExist,
+			wantExprCount: 1,
+		},
+		{
+			name:          "no runner-class with GPU",
+			runnerClass:   "",
+			gpu:           1,
+			wantRCOp:      corev1.NodeSelectorOpDoesNotExist,
+			wantExprCount: 2,
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cfg := Config{
+				NodeFleet:   "g4dn",
+				RunnerClass: tc.runnerClass,
+				WorkflowGPU: tc.gpu,
+			}
+			pm, _ := newTestPM(t, cfg)
+			ctx := context.Background()
+
+			require.NoError(t, pm.CreatePair(ctx, "wf-req-slot"))
+			pairs, _ := pm.ListPairs(ctx)
+			wf := pairs["wf-req-slot"].WorkflowPod
+			require.NotNil(t, wf)
+			require.NotNil(t, wf.Spec.Affinity)
+			require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
+
+			required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+			require.NotNil(t, required, "workflow placeholder must have required affinity for runner-class")
+			require.Len(t, required.NodeSelectorTerms, 1,
+				"runner-class and GPU must AND together in a single matchExpressions block")
+			exprs := required.NodeSelectorTerms[0].MatchExpressions
+			require.Len(t, exprs, tc.wantExprCount)
+
+			exprByKey := map[string]corev1.NodeSelectorRequirement{}
+			for _, e := range exprs {
+				exprByKey[e.Key] = e
+			}
+			rc, ok := exprByKey["osdc.io/runner-class"]
+			require.True(t, ok, "runner-class required expr must be present")
+			assert.Equal(t, tc.wantRCOp, rc.Operator)
+			assert.Equal(t, tc.wantRCValues, rc.Values)
+
+			if tc.gpu > 0 {
+				gpu, ok := exprByKey["nvidia.com/gpu"]
+				require.True(t, ok, "GPU required expr must be present when WorkflowGPU > 0")
+				assert.Equal(t, corev1.NodeSelectorOpIn, gpu.Operator)
+				assert.Equal(t, []string{"true"}, gpu.Values)
+			}
+		})
+	}
+}
+
+// Regression guard: runner-class must live ONLY in required terms — never
+// in preferred. The earlier (buggy) implementation used a PreferredSchedulingTerm
+// (weight 100) which let the placeholder land on a non-class node where the
+// actual workflow pod (which uses a required term) cannot follow.
+func TestWorkflowPlaceholder_RunnerClassMustBeRequiredNotPreferred(t *testing.T) {
 	cfg := Config{
 		NodeFleet:   "g4dn",
 		RunnerClass: "release",
@@ -512,26 +635,19 @@ func TestWorkflowPlaceholder_StillUsesRunnerClassInPreferredAffinity(t *testing.
 	pm, _ := newTestPM(t, cfg)
 	ctx := context.Background()
 
-	require.NoError(t, pm.CreatePair(ctx, "wf-rc-slot"))
+	require.NoError(t, pm.CreatePair(ctx, "wf-guard-slot"))
 	pairs, _ := pm.ListPairs(ctx)
-	wf := pairs["wf-rc-slot"].WorkflowPod
+	wf := pairs["wf-guard-slot"].WorkflowPod
 	require.NotNil(t, wf)
 	require.NotNil(t, wf.Spec.Affinity)
 	require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
 
-	var foundRC bool
 	for _, term := range wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
 		for _, expr := range term.Preference.MatchExpressions {
-			if expr.Key == "osdc.io/runner-class" {
-				foundRC = true
-				assert.Equal(t, int32(100), term.Weight,
-					"workflow runner-class preferred affinity weight must be 100")
-				assert.Contains(t, expr.Values, "release")
-			}
+			assert.NotEqual(t, "osdc.io/runner-class", expr.Key,
+				"runner-class must NEVER appear in a PreferredSchedulingTerm — required only")
 		}
 	}
-	assert.True(t, foundRC,
-		"workflow placeholder must still include runner-class in preferred affinity")
 }
 
 func tolerationKeySet(tolerations []corev1.Toleration) map[string]struct{} {