Fix PercentageRunnersBusy scaling delay (#1579)

* Use a dedicated pod label to say it is a runner pod

Follow-up for #1546

* Fix PercentageRunnersBusy scaling delay

Ref #1374
This commit is contained in:
Yusuke Kuoka 2022-06-29 20:49:21 +09:00 committed by GitHub
parent a9ac5a1cbf
commit 8161136cbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 83 additions and 17 deletions

View File

@ -10,6 +10,8 @@ import (
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
"github.com/google/go-github/v45/github"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
@ -314,22 +316,52 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner
numRunners int
numRunnersRegistered int
numRunnersBusy int
numTerminatingBusy int
)
numRunners = len(runnerMap)
busyTerminatingRunnerPods := map[string]struct{}{}
kindLabel := LabelKeyRunnerDeploymentName
if hra.Spec.ScaleTargetRef.Kind == "RunnerSet" {
kindLabel = LabelKeyRunnerSetName
}
var runnerPodList corev1.PodList
if err := r.Client.List(ctx, &runnerPodList, client.InNamespace(hra.Namespace), client.MatchingLabels(map[string]string{
kindLabel: hra.Spec.ScaleTargetRef.Name,
})); err != nil {
return nil, err
}
for _, p := range runnerPodList.Items {
if p.Annotations[AnnotationKeyUnregistrationFailureMessage] != "" {
busyTerminatingRunnerPods[p.Name] = struct{}{}
}
}
for _, runner := range runners {
if _, ok := runnerMap[*runner.Name]; ok {
numRunnersRegistered++
if runner.GetBusy() {
numRunnersBusy++
} else if _, ok := busyTerminatingRunnerPods[*runner.Name]; ok {
numTerminatingBusy++
}
delete(busyTerminatingRunnerPods, *runner.Name)
}
}
// Remaining busyTerminatingRunnerPods are runners that were not on the ListRunners API response yet
for range busyTerminatingRunnerPods {
numTerminatingBusy++
}
var desiredReplicas int
fractionBusy := float64(numRunnersBusy) / float64(desiredReplicasBefore)
fractionBusy := float64(numRunnersBusy+numTerminatingBusy) / float64(desiredReplicasBefore)
if fractionBusy >= scaleUpThreshold {
if scaleUpAdjustment > 0 {
desiredReplicas = desiredReplicasBefore + scaleUpAdjustment
@ -358,6 +390,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner
"num_runners", numRunners,
"num_runners_registered", numRunnersRegistered,
"num_runners_busy", numRunnersBusy,
"num_terminating_busy", numTerminatingBusy,
"namespace", hra.Namespace,
"kind", st.kind,
"name", st.st,

View File

@ -4,6 +4,7 @@ import "time"
const (
LabelKeyRunnerSetName = "runnerset-name"
LabelKeyRunner = "actions-runner"
)
const (
@ -16,6 +17,9 @@ const (
AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
// AnnotationKeyUnregistrationFailureMessage is the annotation that is added onto the pod once it failed to be unregistered from GitHub due to e.g. 422 error
AnnotationKeyUnregistrationFailureMessage = annotationKeyPrefix + "unregistration-failure-message"
// AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed.
AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp"

View File

@ -56,7 +56,7 @@ func TestNewRunnerPod(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"runnerset-name": "runner",
"actions-runner": "",
},
},
Spec: corev1.PodSpec{
@ -198,7 +198,7 @@ func TestNewRunnerPod(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"runnerset-name": "runner",
"actions-runner": "",
},
},
Spec: corev1.PodSpec{
@ -276,7 +276,7 @@ func TestNewRunnerPod(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"runnerset-name": "runner",
"actions-runner": "",
},
},
Spec: corev1.PodSpec{
@ -515,7 +515,7 @@ func TestNewRunnerPod(t *testing.T) {
for i := range testcases {
tc := testcases[i]
t.Run(tc.description, func(t *testing.T) {
got, err := newRunnerPod("runner", tc.template, tc.config, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL)
got, err := newRunnerPod(tc.template, tc.config, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL)
require.NoError(t, err)
require.Equal(t, tc.want, got)
})
@ -546,7 +546,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"pod-template-hash": "8857b86c7",
"runnerset-name": "runner",
"actions-runner": "",
},
OwnerReferences: []metav1.OwnerReference{
{
@ -703,7 +703,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"pod-template-hash": "8857b86c7",
"runnerset-name": "runner",
"actions-runner": "",
},
OwnerReferences: []metav1.OwnerReference{
{
@ -800,7 +800,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
Labels: map[string]string{
"actions-runner-controller/inject-registration-token": "true",
"pod-template-hash": "8857b86c7",
"runnerset-name": "runner",
"actions-runner": "",
},
OwnerReferences: []metav1.OwnerReference{
{

View File

@ -426,7 +426,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
}
}
pod, err := newRunnerPodWithContainerMode(runner.Spec.ContainerMode, runner.Name, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL)
pod, err := newRunnerPodWithContainerMode(runner.Spec.ContainerMode, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL)
if err != nil {
return pod, err
}
@ -589,7 +589,7 @@ func runnerHookEnvs(pod *corev1.Pod) ([]corev1.EnvVar, error) {
}, nil
}
func newRunnerPodWithContainerMode(containerMode string, runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
var (
privileged bool = true
dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer
@ -607,7 +607,7 @@ func newRunnerPodWithContainerMode(containerMode string, runnerName string, temp
template = *template.DeepCopy()
// This label selector is used by default when rd.Spec.Selector is empty.
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerName)
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunner, "")
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation)
workDir := runnerSpec.WorkDir
@ -962,8 +962,8 @@ func newRunnerPodWithContainerMode(containerMode string, runnerName string, temp
return *pod, nil
}
func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
return newRunnerPodWithContainerMode("", runnerName, template, runnerSpec, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL)
func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
return newRunnerPodWithContainerMode("", template, runnerSpec, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL)
}
func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {

View File

@ -67,6 +67,25 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
return updated, nil
}
func labelPod(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, k, v string) (*corev1.Pod, error) {
if pod == nil {
return nil, nil
}
updated := pod.DeepCopy()
if updated.Labels == nil {
updated.Labels = map[string]string{}
}
updated.Labels[k] = v
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", k))
return nil, err
}
log.V(2).Info("Labeled pod", "key", k, "value", v)
return updated, nil
}
// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
@ -151,7 +170,10 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)
var runnerBusy bool
var (
runnerBusy bool
runnerUnregistrationFailureMessage string
)
errRes := &gogithub.ErrorResponse{}
if errors.As(err, &errRes) {
@ -173,6 +195,7 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
}
runnerBusy = errRes.Response.StatusCode == 422
runnerUnregistrationFailureMessage = errRes.Message
if runnerBusy && code != nil {
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
@ -187,6 +210,11 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
}
if runnerBusy {
_, err := labelPod(ctx, c, log, pod, AnnotationKeyUnregistrationFailureMessage, runnerUnregistrationFailureMessage)
if err != nil {
return &ctrl.Result{}, err
}
// We want to prevent spamming the deletion attemps but returning ctrl.Result with RequeueAfter doesn't
// work as the reconcilation can happen earlier due to pod status update.
// For ephemeral runners, we can expect it to stop and unregister itself on completion.

View File

@ -62,8 +62,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, client.IgnoreNotFound(err)
}
_, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName]
if !isRunnerPod {
if _, isRunnerPod := runnerPod.Labels[LabelKeyRunner]; !isRunnerPod {
return ctrl.Result{}, nil
}

View File

@ -219,7 +219,9 @@ func (r *RunnerSetReconciler) newStatefulSet(runnerSet *v1alpha1.RunnerSet) (*ap
template.Spec.ServiceAccountName = runnerSet.Spec.ServiceAccountName
}
pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.RunnerConfig.ContainerMode, runnerSet.Name, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL)
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerSet.Name)
pod, err := newRunnerPodWithContainerMode(runnerSet.Spec.RunnerConfig.ContainerMode, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL)
if err != nil {
return nil, err
}