Merge pull request #4 from jeanschmidt/jeanschmidt/release_enforcement
Require runner-class in workflow affinity
This commit is contained in:
commit
30c1a102b4
|
|
@ -15,13 +15,13 @@ type: application
|
|||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.14.1-jeanschmidt.8
|
||||
version: 0.14.1-jeanschmidt.9
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "0.14.1-jeanschmidt.8"
|
||||
appVersion: "0.14.1-jeanschmidt.9"
|
||||
|
||||
home: https://github.com/actions/actions-runner-controller
|
||||
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ type: application
|
|||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.14.1-jeanschmidt.8
|
||||
version: 0.14.1-jeanschmidt.9
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "0.14.1-jeanschmidt.8"
|
||||
appVersion: "0.14.1-jeanschmidt.9"
|
||||
|
||||
home: https://github.com/actions/actions-runner-controller
|
||||
|
||||
|
|
|
|||
|
|
@ -470,27 +470,38 @@ func (pm *PlaceholderManager) buildWorkflowPlaceholder(slotID string) *corev1.Po
|
|||
return pod
|
||||
}
|
||||
|
||||
// buildWorkflowAffinity builds the soft node affinity for workflow placeholders,
|
||||
// buildWorkflowAffinity builds the node affinity for workflow placeholders,
|
||||
// mirroring the job-pod template:
|
||||
// - required runner-class match (In when RunnerClass set, DoesNotExist when unset),
|
||||
// so the placeholder lands where the actual workflow pod can follow it
|
||||
// - required GPU node label (when WorkflowGPU > 0), AND-combined with runner-class
|
||||
// - weight 50 preference for node-fleet + workload-type
|
||||
// - optional weight 100 preference for runner-class (when set)
|
||||
// - optional GPU node selector requirement (when WorkflowGPU > 0)
|
||||
func (pm *PlaceholderManager) buildWorkflowAffinity() *corev1.Affinity {
|
||||
preferredTerms := []corev1.PreferredSchedulingTerm{}
|
||||
|
||||
// Optional runner-class preference (template uses higher weight for class match).
|
||||
// Runner-class must be REQUIRED (not preferred): the actual workflow pod
|
||||
// uses a required term here, so a placeholder landing on a non-matching
|
||||
// node cannot be followed by the real pod and the reservation is wasted.
|
||||
var runnerClassReq corev1.NodeSelectorRequirement
|
||||
if pm.config.RunnerClass != "" {
|
||||
preferredTerms = append(preferredTerms, corev1.PreferredSchedulingTerm{
|
||||
Weight: 100,
|
||||
Preference: corev1.NodeSelectorTerm{
|
||||
MatchExpressions: []corev1.NodeSelectorRequirement{
|
||||
{
|
||||
Key: "osdc.io/runner-class",
|
||||
Operator: corev1.NodeSelectorOpIn,
|
||||
Values: []string{pm.config.RunnerClass},
|
||||
},
|
||||
},
|
||||
},
|
||||
runnerClassReq = corev1.NodeSelectorRequirement{
|
||||
Key: "osdc.io/runner-class",
|
||||
Operator: corev1.NodeSelectorOpIn,
|
||||
Values: []string{pm.config.RunnerClass},
|
||||
}
|
||||
} else {
|
||||
runnerClassReq = corev1.NodeSelectorRequirement{
|
||||
Key: "osdc.io/runner-class",
|
||||
Operator: corev1.NodeSelectorOpDoesNotExist,
|
||||
}
|
||||
}
|
||||
|
||||
// Required terms: runner-class always; GPU label when WorkflowGPU > 0.
|
||||
// Both go in the SAME matchExpressions block so they AND together.
|
||||
requiredExprs := []corev1.NodeSelectorRequirement{runnerClassReq}
|
||||
if pm.config.WorkflowGPU > 0 {
|
||||
requiredExprs = append(requiredExprs, corev1.NodeSelectorRequirement{
|
||||
Key: "nvidia.com/gpu",
|
||||
Operator: corev1.NodeSelectorOpIn,
|
||||
Values: []string{"true"},
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -511,37 +522,25 @@ func (pm *PlaceholderManager) buildWorkflowAffinity() *corev1.Affinity {
|
|||
},
|
||||
}, fleetMatchExpressions...)
|
||||
}
|
||||
preferredTerms = append(preferredTerms, corev1.PreferredSchedulingTerm{
|
||||
Weight: 50,
|
||||
Preference: corev1.NodeSelectorTerm{
|
||||
MatchExpressions: fleetMatchExpressions,
|
||||
preferredTerms := []corev1.PreferredSchedulingTerm{
|
||||
{
|
||||
Weight: 50,
|
||||
Preference: corev1.NodeSelectorTerm{
|
||||
MatchExpressions: fleetMatchExpressions,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
affinity := &corev1.Affinity{
|
||||
return &corev1.Affinity{
|
||||
NodeAffinity: &corev1.NodeAffinity{
|
||||
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
|
||||
NodeSelectorTerms: []corev1.NodeSelectorTerm{
|
||||
{MatchExpressions: requiredExprs},
|
||||
},
|
||||
},
|
||||
PreferredDuringSchedulingIgnoredDuringExecution: preferredTerms,
|
||||
},
|
||||
}
|
||||
|
||||
// Optional hard GPU node selector requirement.
|
||||
if pm.config.WorkflowGPU > 0 {
|
||||
affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{
|
||||
NodeSelectorTerms: []corev1.NodeSelectorTerm{
|
||||
{
|
||||
MatchExpressions: []corev1.NodeSelectorRequirement{
|
||||
{
|
||||
Key: "nvidia.com/gpu",
|
||||
Operator: corev1.NodeSelectorOpIn,
|
||||
Values: []string{"true"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return affinity
|
||||
}
|
||||
|
||||
// sleepArg returns the sleep duration for placeholder containers as a
|
||||
|
|
|
|||
|
|
@ -51,9 +51,10 @@ func TestRunnerPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
|
|||
|
||||
// Workflow placeholder mirrors the job-pod.yaml ConfigMap template:
|
||||
// NO hard nodeSelector — uses preferredDuringScheduling node affinity
|
||||
// with weight 50 for node-fleet + workload-type. Tolerations:
|
||||
// instance-type Exists, optional node-fleet, optional GPU. NO
|
||||
// git-cache-not-ready (workflow waits for cache, doesn't tolerate the taint).
|
||||
// with weight 50 for node-fleet + workload-type, plus a REQUIRED runner-class
|
||||
// term (DoesNotExist when RunnerClass unset). Tolerations: instance-type Exists,
|
||||
// optional node-fleet, optional GPU. NO git-cache-not-ready (workflow waits
|
||||
// for cache, doesn't tolerate the taint).
|
||||
func TestWorkflowPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
|
||||
pm, _ := newTestPM(t, Config{})
|
||||
ctx := context.Background()
|
||||
|
|
@ -67,8 +68,18 @@ func TestWorkflowPlaceholder_NoGPU_NoFleet_NoRunnerClass(t *testing.T) {
|
|||
require.NotNil(t, wf.Spec.Affinity)
|
||||
require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
|
||||
preferred := wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
|
||||
require.NotEmpty(t, preferred)
|
||||
assert.Equal(t, int32(50), preferred[len(preferred)-1].Weight)
|
||||
require.Len(t, preferred, 1, "only fleet preference; runner-class lives in required terms")
|
||||
assert.Equal(t, int32(50), preferred[0].Weight)
|
||||
|
||||
// Required: runner-class DoesNotExist (matches workflow pod's required term
|
||||
// for non-class fleets).
|
||||
required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
|
||||
require.NotNil(t, required)
|
||||
require.Len(t, required.NodeSelectorTerms, 1)
|
||||
exprs := required.NodeSelectorTerms[0].MatchExpressions
|
||||
require.Len(t, exprs, 1)
|
||||
assert.Equal(t, "osdc.io/runner-class", exprs[0].Key)
|
||||
assert.Equal(t, corev1.NodeSelectorOpDoesNotExist, exprs[0].Operator)
|
||||
|
||||
// Without NodeFleet/GPU: only instance-type toleration. No git-cache.
|
||||
assert.Len(t, wf.Spec.Tolerations, 1)
|
||||
|
|
@ -152,13 +163,31 @@ func TestWorkflowPlaceholder_WithGPU_WithFleet_WithRunnerClass(t *testing.T) {
|
|||
// Workflow has no hard nodeSelector regardless of config.
|
||||
assert.Nil(t, wf.Spec.NodeSelector)
|
||||
|
||||
// Affinity: required GPU node selector + preferred runner-class + preferred fleet.
|
||||
// Affinity: required {runner-class In + GPU In} (AND) + preferred fleet.
|
||||
require.NotNil(t, wf.Spec.Affinity)
|
||||
require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
|
||||
preferred := wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
|
||||
assert.Len(t, preferred, 2, "runner-class preference + fleet preference")
|
||||
require.NotNil(t, wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution,
|
||||
"GPU workflow has required GPU node selector")
|
||||
require.Len(t, preferred, 1, "only fleet preference; runner-class lives in required terms")
|
||||
assert.Equal(t, int32(50), preferred[0].Weight)
|
||||
|
||||
required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
|
||||
require.NotNil(t, required, "GPU + runner-class required terms")
|
||||
require.Len(t, required.NodeSelectorTerms, 1,
|
||||
"both required exprs go in the SAME matchExpressions block (AND semantics)")
|
||||
exprs := required.NodeSelectorTerms[0].MatchExpressions
|
||||
require.Len(t, exprs, 2, "runner-class In + nvidia.com/gpu In, both in same block")
|
||||
exprByKey := map[string]corev1.NodeSelectorRequirement{}
|
||||
for _, e := range exprs {
|
||||
exprByKey[e.Key] = e
|
||||
}
|
||||
rc, ok := exprByKey["osdc.io/runner-class"]
|
||||
require.True(t, ok, "runner-class required expr must be present")
|
||||
assert.Equal(t, corev1.NodeSelectorOpIn, rc.Operator)
|
||||
assert.Equal(t, []string{"gpu-large"}, rc.Values)
|
||||
gpu, ok := exprByKey["nvidia.com/gpu"]
|
||||
require.True(t, ok, "GPU required expr must be present")
|
||||
assert.Equal(t, corev1.NodeSelectorOpIn, gpu.Operator)
|
||||
assert.Equal(t, []string{"true"}, gpu.Values)
|
||||
|
||||
// Tolerations: instance-type + node-fleet + GPU = 3.
|
||||
assert.Len(t, wf.Spec.Tolerations, 3)
|
||||
|
|
@ -501,10 +530,104 @@ func TestRunnerPlaceholder_NeverIncludesRunnerClass(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// The workflow placeholder MUST still consume RunnerClass via its preferred
|
||||
// node affinity (weight 100). Only the runner placeholder drops runner-class —
|
||||
// the workflow pod's scheduling pattern is unchanged.
|
||||
func TestWorkflowPlaceholder_StillUsesRunnerClassInPreferredAffinity(t *testing.T) {
|
||||
// The workflow placeholder MUST express runner-class as a REQUIRED node
|
||||
// affinity term — not preferred. The actual workflow pod uses a required
|
||||
// term for runner-class (In when set, DoesNotExist when unset), so a
|
||||
// placeholder landing on a non-matching node cannot be followed by the
|
||||
// real pod and the reservation is wasted.
|
||||
//
|
||||
// When WorkflowGPU > 0, the GPU node-label requirement must be AND-combined
|
||||
// with the runner-class requirement in the SAME matchExpressions block.
|
||||
func TestWorkflowPlaceholder_RequiresRunnerClass(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
runnerClass string
|
||||
gpu int
|
||||
// expected runner-class expr
|
||||
wantRCOp corev1.NodeSelectorOperator
|
||||
wantRCValues []string
|
||||
// expected exprs count in the single matchExpressions block
|
||||
wantExprCount int
|
||||
}{
|
||||
{
|
||||
name: "release without GPU",
|
||||
runnerClass: "release",
|
||||
gpu: 0,
|
||||
wantRCOp: corev1.NodeSelectorOpIn,
|
||||
wantRCValues: []string{"release"},
|
||||
wantExprCount: 1,
|
||||
},
|
||||
{
|
||||
name: "release with GPU",
|
||||
runnerClass: "release",
|
||||
gpu: 1,
|
||||
wantRCOp: corev1.NodeSelectorOpIn,
|
||||
wantRCValues: []string{"release"},
|
||||
wantExprCount: 2,
|
||||
},
|
||||
{
|
||||
name: "no runner-class without GPU",
|
||||
runnerClass: "",
|
||||
gpu: 0,
|
||||
wantRCOp: corev1.NodeSelectorOpDoesNotExist,
|
||||
wantExprCount: 1,
|
||||
},
|
||||
{
|
||||
name: "no runner-class with GPU",
|
||||
runnerClass: "",
|
||||
gpu: 1,
|
||||
wantRCOp: corev1.NodeSelectorOpDoesNotExist,
|
||||
wantExprCount: 2,
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cfg := Config{
|
||||
NodeFleet: "g4dn",
|
||||
RunnerClass: tc.runnerClass,
|
||||
WorkflowGPU: tc.gpu,
|
||||
}
|
||||
pm, _ := newTestPM(t, cfg)
|
||||
ctx := context.Background()
|
||||
|
||||
require.NoError(t, pm.CreatePair(ctx, "wf-req-slot"))
|
||||
pairs, _ := pm.ListPairs(ctx)
|
||||
wf := pairs["wf-req-slot"].WorkflowPod
|
||||
require.NotNil(t, wf)
|
||||
require.NotNil(t, wf.Spec.Affinity)
|
||||
require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
|
||||
|
||||
required := wf.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
|
||||
require.NotNil(t, required, "workflow placeholder must have required affinity for runner-class")
|
||||
require.Len(t, required.NodeSelectorTerms, 1,
|
||||
"runner-class and GPU must AND together in a single matchExpressions block")
|
||||
exprs := required.NodeSelectorTerms[0].MatchExpressions
|
||||
require.Len(t, exprs, tc.wantExprCount)
|
||||
|
||||
exprByKey := map[string]corev1.NodeSelectorRequirement{}
|
||||
for _, e := range exprs {
|
||||
exprByKey[e.Key] = e
|
||||
}
|
||||
rc, ok := exprByKey["osdc.io/runner-class"]
|
||||
require.True(t, ok, "runner-class required expr must be present")
|
||||
assert.Equal(t, tc.wantRCOp, rc.Operator)
|
||||
assert.Equal(t, tc.wantRCValues, rc.Values)
|
||||
|
||||
if tc.gpu > 0 {
|
||||
gpu, ok := exprByKey["nvidia.com/gpu"]
|
||||
require.True(t, ok, "GPU required expr must be present when WorkflowGPU > 0")
|
||||
assert.Equal(t, corev1.NodeSelectorOpIn, gpu.Operator)
|
||||
assert.Equal(t, []string{"true"}, gpu.Values)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Regression guard: runner-class must live ONLY in required terms — never
|
||||
// in preferred. The earlier (buggy) implementation used a PreferredSchedulingTerm
|
||||
// (weight 100) which let the placeholder land on a non-class node where the
|
||||
// actual workflow pod (which uses a required term) cannot follow.
|
||||
func TestWorkflowPlaceholder_RunnerClassMustBeRequiredNotPreferred(t *testing.T) {
|
||||
cfg := Config{
|
||||
NodeFleet: "g4dn",
|
||||
RunnerClass: "release",
|
||||
|
|
@ -512,26 +635,19 @@ func TestWorkflowPlaceholder_StillUsesRunnerClassInPreferredAffinity(t *testing.
|
|||
pm, _ := newTestPM(t, cfg)
|
||||
ctx := context.Background()
|
||||
|
||||
require.NoError(t, pm.CreatePair(ctx, "wf-rc-slot"))
|
||||
require.NoError(t, pm.CreatePair(ctx, "wf-guard-slot"))
|
||||
pairs, _ := pm.ListPairs(ctx)
|
||||
wf := pairs["wf-rc-slot"].WorkflowPod
|
||||
wf := pairs["wf-guard-slot"].WorkflowPod
|
||||
require.NotNil(t, wf)
|
||||
require.NotNil(t, wf.Spec.Affinity)
|
||||
require.NotNil(t, wf.Spec.Affinity.NodeAffinity)
|
||||
|
||||
var foundRC bool
|
||||
for _, term := range wf.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
|
||||
for _, expr := range term.Preference.MatchExpressions {
|
||||
if expr.Key == "osdc.io/runner-class" {
|
||||
foundRC = true
|
||||
assert.Equal(t, int32(100), term.Weight,
|
||||
"workflow runner-class preferred affinity weight must be 100")
|
||||
assert.Contains(t, expr.Values, "release")
|
||||
}
|
||||
assert.NotEqual(t, "osdc.io/runner-class", expr.Key,
|
||||
"runner-class must NEVER appear in a PreferredSchedulingTerm — required only")
|
||||
}
|
||||
}
|
||||
assert.True(t, foundRC,
|
||||
"workflow placeholder must still include runner-class in preferred affinity")
|
||||
}
|
||||
|
||||
func tolerationKeySet(tolerations []corev1.Toleration) map[string]struct{} {
|
||||
|
|
|
|||
Loading…
Reference in New Issue