From 8161136cbd7451e055cbe4ca5cf9729ff1fe141e Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Wed, 29 Jun 2022 20:49:21 +0900 Subject: [PATCH] Fix PercentageRunnersBusy scaling delay (#1579) * Use a dedicated pod label to say it is a runner pod Follow-up for #1546 * Fix PercentageRunnersBusy scaling delay Ref #1374 --- controllers/autoscaling.go | 35 +++++++++++++++++++++++++++- controllers/constants.go | 4 ++++ controllers/new_runner_pod_test.go | 14 +++++------ controllers/runner_controller.go | 10 ++++---- controllers/runner_graceful_stop.go | 30 +++++++++++++++++++++++- controllers/runner_pod_controller.go | 3 +-- controllers/runnerset_controller.go | 4 +++- 7 files changed, 83 insertions(+), 17 deletions(-) diff --git a/controllers/autoscaling.go b/controllers/autoscaling.go index 1ae071b6..a2b76913 100644 --- a/controllers/autoscaling.go +++ b/controllers/autoscaling.go @@ -10,6 +10,8 @@ import ( "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" "github.com/google/go-github/v45/github" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ) const ( @@ -314,22 +316,52 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner numRunners int numRunnersRegistered int numRunnersBusy int + numTerminatingBusy int ) numRunners = len(runnerMap) + busyTerminatingRunnerPods := map[string]struct{}{} + + kindLabel := LabelKeyRunnerDeploymentName + if hra.Spec.ScaleTargetRef.Kind == "RunnerSet" { + kindLabel = LabelKeyRunnerSetName + } + + var runnerPodList corev1.PodList + if err := r.Client.List(ctx, &runnerPodList, client.InNamespace(hra.Namespace), client.MatchingLabels(map[string]string{ + kindLabel: hra.Spec.ScaleTargetRef.Name, + })); err != nil { + return nil, err + } + + for _, p := range runnerPodList.Items { + if p.Annotations[AnnotationKeyUnregistrationFailureMessage] != "" { + busyTerminatingRunnerPods[p.Name] = struct{}{} + } + } + for _, runner := range runners { if _, ok := runnerMap[*runner.Name]; ok { numRunnersRegistered++ if runner.GetBusy() { numRunnersBusy++ + } else if _, ok := busyTerminatingRunnerPods[*runner.Name]; ok { + numTerminatingBusy++ } + + delete(busyTerminatingRunnerPods, *runner.Name) } } + // Remaining busyTerminatingRunnerPods are runners that were not on the ListRunners API response yet + for range busyTerminatingRunnerPods { + numTerminatingBusy++ + } + var desiredReplicas int - fractionBusy := float64(numRunnersBusy) / float64(desiredReplicasBefore) + fractionBusy := float64(numRunnersBusy+numTerminatingBusy) / float64(desiredReplicasBefore) if fractionBusy >= scaleUpThreshold { if scaleUpAdjustment > 0 { desiredReplicas = desiredReplicasBefore + scaleUpAdjustment @@ -358,6 +390,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner "num_runners", numRunners, "num_runners_registered", numRunnersRegistered, "num_runners_busy", numRunnersBusy, + "num_terminating_busy", numTerminatingBusy, "namespace", hra.Namespace, "kind", st.kind, "name", st.st, diff --git a/controllers/constants.go b/controllers/constants.go index bb36ba7a..076d796e 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -4,6 +4,7 @@ import "time" const ( LabelKeyRunnerSetName = "runnerset-name" + LabelKeyRunner = "actions-runner" ) const ( @@ -16,6 +17,9 @@ const ( AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time" + // AnnotationKeyUnregistrationFailureMessage is the annotation that is added onto the pod once it failed to be unregistered from GitHub due to e.g. 422 error + AnnotationKeyUnregistrationFailureMessage = annotationKeyPrefix + "unregistration-failure-message" + // AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed. AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp" diff --git a/controllers/new_runner_pod_test.go b/controllers/new_runner_pod_test.go index 09e80b3f..b65b4767 100644 --- a/controllers/new_runner_pod_test.go +++ b/controllers/new_runner_pod_test.go @@ -56,7 +56,7 @@ func TestNewRunnerPod(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", - "runnerset-name": "runner", + "actions-runner": "", }, }, Spec: corev1.PodSpec{ @@ -198,7 +198,7 @@ func TestNewRunnerPod(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", - "runnerset-name": "runner", + "actions-runner": "", }, }, Spec: corev1.PodSpec{ @@ -276,7 +276,7 @@ func TestNewRunnerPod(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", - "runnerset-name": "runner", + "actions-runner": "", }, }, Spec: corev1.PodSpec{ @@ -515,7 +515,7 @@ func TestNewRunnerPod(t *testing.T) { for i := range testcases { tc := testcases[i] t.Run(tc.description, func(t *testing.T) { - got, err := newRunnerPod("runner", tc.template, tc.config, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL) + got, err := newRunnerPod(tc.template, tc.config, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL) require.NoError(t, err) require.Equal(t, tc.want, got) }) @@ -546,7 +546,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", "pod-template-hash": "8857b86c7", - "runnerset-name": "runner", + "actions-runner": "", }, OwnerReferences: []metav1.OwnerReference{ { @@ -703,7 +703,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", "pod-template-hash": "8857b86c7", - "runnerset-name": "runner", + "actions-runner": "", }, OwnerReferences: []metav1.OwnerReference{ { @@ -800,7 +800,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { Labels: map[string]string{ "actions-runner-controller/inject-registration-token": "true", "pod-template-hash": "8857b86c7", - "runnerset-name": "runner", + "actions-runner": "", }, OwnerReferences: []metav1.OwnerReference{ { diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index a696ea06..69f0fdfa 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -426,7 +426,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { } } - pod, err := newRunnerPodWithContainerMode(runner.Spec.ContainerMode, runner.Name, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL) + pod, err := newRunnerPodWithContainerMode(runner.Spec.ContainerMode, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL) if err != nil { return pod, err } @@ -589,7 +589,7 @@ func runnerHookEnvs(pod *corev1.Pod) ([]corev1.EnvVar, error) { }, nil } -func newRunnerPodWithContainerMode(containerMode string, runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) { +func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) { var ( privileged bool = true dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer @@ -607,7 +607,7 @@ func newRunnerPodWithContainerMode(containerMode string, runnerName string, temp template = *template.DeepCopy() // This label selector is used by default when rd.Spec.Selector is empty. - template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerName) + template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunner, "") template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation) workDir := runnerSpec.WorkDir @@ -962,8 +962,8 @@ func newRunnerPodWithContainerMode(containerMode string, runnerName string, temp return *pod, nil } -func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) { - return newRunnerPodWithContainerMode("", runnerName, template, runnerSpec, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL) +func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) { + return newRunnerPodWithContainerMode("", template, runnerSpec, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL) } func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error { diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 0e379333..1a0378d1 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -67,6 +67,25 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod return updated, nil } +func labelPod(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, k, v string) (*corev1.Pod, error) { + if pod == nil { + return nil, nil + } + + updated := pod.DeepCopy() + if updated.Labels == nil { + updated.Labels = map[string]string{} + } + updated.Labels[k] = v + if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", k)) + return nil, err + } + + log.V(2).Info("Labeled pod", "key", k, "value", v) + + return updated, nil +} // If the first return value is nil, it's safe to delete the runner pod. func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) { @@ -151,7 +170,10 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err) - var runnerBusy bool + var ( + runnerBusy bool + runnerUnregistrationFailureMessage string + ) errRes := &gogithub.ErrorResponse{} if errors.As(err, &errRes) { @@ -173,6 +195,7 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l } runnerBusy = errRes.Response.StatusCode == 422 + runnerUnregistrationFailureMessage = errRes.Message if runnerBusy && code != nil { log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+ @@ -187,6 +210,11 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l } if runnerBusy { + _, err := labelPod(ctx, c, log, pod, AnnotationKeyUnregistrationFailureMessage, runnerUnregistrationFailureMessage) + if err != nil { + return &ctrl.Result{}, err + } + // We want to prevent spamming the deletion attemps but returning ctrl.Result with RequeueAfter doesn't // work as the reconcilation can happen earlier due to pod status update. // For ephemeral runners, we can expect it to stop and unregister itself on completion. diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index 3bfdcbaf..d058f1d3 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -62,8 +62,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, client.IgnoreNotFound(err) } - _, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName] - if !isRunnerPod { + if _, isRunnerPod := runnerPod.Labels[LabelKeyRunner]; !isRunnerPod { return ctrl.Result{}, nil } diff --git a/controllers/runnerset_controller.go b/controllers/runnerset_controller.go index ed9f0e8a..85050a05 100644 --- a/controllers/runnerset_controller.go +++ b/controllers/runnerset_controller.go @@ -219,7 +219,9 @@ func (r *RunnerSetReconciler) newStatefulSet(runnerSet *v1alpha1.RunnerSet) (*ap template.Spec.ServiceAccountName = runnerSet.Spec.ServiceAccountName } - pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.RunnerConfig.ContainerMode, runnerSet.Name, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL) + template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerSet.Name) + + pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.RunnerConfig.ContainerMode, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL) if err != nil { return nil, err }