RunnerSet: Automatic-recovery from registration timeout and deregistration on pod termination (#652)

Ref #629
Ref #613
Ref #612
This commit is contained in:
Yusuke Kuoka 2021-06-24 20:39:37 +09:00 committed by GitHub
parent 98da4c2adb
commit acb906164b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 465 additions and 8 deletions

View File

@ -16,6 +16,8 @@ spec:
# RunnerSet doesn't support scale from/to zero yet # RunnerSet doesn't support scale from/to zero yet
minReplicas: 1 minReplicas: 1
maxReplicas: 5 maxReplicas: 5
# This should be less than 600(seconds, the default) for faster testing
scaleDownDelaySecondsAfterScaleOut: 60
metrics: metrics:
- type: PercentageRunnersBusy - type: PercentageRunnersBusy
scaleUpThreshold: '0.75' scaleUpThreshold: '0.75'

View File

@ -14,6 +14,10 @@ spec:
serviceName: example-runnerset serviceName: example-runnerset
#replicas: 1 #replicas: 1
# From my limited testing, `ephemeral: true` is more reliable.
# Seomtimes, updating already deployed runners from `ephemeral: false` to `ephemeral: true` seems to
# result in queued jobs hanging forever.
ephemeral: false ephemeral: false
repository: ${TEST_REPO} repository: ${TEST_REPO}

View File

@ -17,6 +17,10 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
const (
AnnotationKeyTokenExpirationDate = "actions-runner-controller/token-expires-at"
)
// +kubebuilder:webhook:path=/mutate-runner-set-pod,mutating=true,failurePolicy=ignore,groups="",resources=pods,verbs=create,versions=v1,name=mutate-runner-pod.webhook.actions.summerwind.dev,sideEffects=None,admissionReviewVersions=v1beta1 // +kubebuilder:webhook:path=/mutate-runner-set-pod,mutating=true,failurePolicy=ignore,groups="",resources=pods,verbs=create,versions=v1,name=mutate-runner-pod.webhook.actions.summerwind.dev,sideEffects=None,admissionReviewVersions=v1beta1
type PodRunnerTokenInjector struct { type PodRunnerTokenInjector struct {
@ -72,7 +76,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque
updated := mutatePod(&pod, *rt.Token) updated := mutatePod(&pod, *rt.Token)
updated.Annotations["actions-runner-controller/token-expires-at"] = ts updated.Annotations[AnnotationKeyTokenExpirationDate] = ts
if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure { if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure {
updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure

View File

@ -51,6 +51,10 @@ const (
// This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways // This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways
annotationKeyRegistrationOnly = "actions-runner-controller/registration-only" annotationKeyRegistrationOnly = "actions-runner-controller/registration-only"
EnvVarOrg = "RUNNER_ORG"
EnvVarRepo = "RUNNER_REPO"
EnvVarEnterprise = "RUNNER_ENTERPRISE"
) )
// RunnerReconciler reconciles a Runner object // RunnerReconciler reconciles a Runner object
@ -89,7 +93,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
} }
if runner.ObjectMeta.DeletionTimestamp.IsZero() { if runner.ObjectMeta.DeletionTimestamp.IsZero() {
finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers) finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
if added { if added {
newRunner := runner.DeepCopy() newRunner := runner.DeepCopy()
@ -103,7 +107,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
} else { } else {
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers) finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
if removed { if removed {
if len(runner.Status.Registration.Token) > 0 { if len(runner.Status.Registration.Token) > 0 {
@ -741,15 +745,15 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
env := []corev1.EnvVar{ env := []corev1.EnvVar{
{ {
Name: "RUNNER_ORG", Name: EnvVarOrg,
Value: runnerSpec.Organization, Value: runnerSpec.Organization,
}, },
{ {
Name: "RUNNER_REPO", Name: EnvVarRepo,
Value: runnerSpec.Repository, Value: runnerSpec.Repository,
}, },
{ {
Name: "RUNNER_ENTERPRISE", Name: EnvVarEnterprise,
Value: runnerSpec.Enterprise, Value: runnerSpec.Enterprise,
}, },
{ {
@ -1029,7 +1033,7 @@ func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r) Complete(r)
} }
func addFinalizer(finalizers []string) ([]string, bool) { func addFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
exists := false exists := false
for _, name := range finalizers { for _, name := range finalizers {
if name == finalizerName { if name == finalizerName {
@ -1044,7 +1048,7 @@ func addFinalizer(finalizers []string) ([]string, bool) {
return append(finalizers, finalizerName), true return append(finalizers, finalizerName), true
} }
func removeFinalizer(finalizers []string) ([]string, bool) { func removeFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
removed := false removed := false
result := []string{} result := []string{}

View File

@ -0,0 +1,431 @@
/*
Copyright 2020 The actions-runner-controller authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"errors"
"fmt"
"time"
gogithub "github.com/google/go-github/v33/github"
"k8s.io/apimachinery/pkg/util/wait"
"github.com/go-logr/logr"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// RunnerPodReconciler reconciles a Runner object
type RunnerPodReconciler struct {
client.Client
Log logr.Logger
Recorder record.EventRecorder
Scheme *runtime.Scheme
GitHubClient *github.Client
Name string
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
}
const (
// This names requires at leaset one slash to work.
// See https://github.com/google/knative-gcp/issues/378
runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
)
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("runnerpod", req.NamespacedName)
var runnerPod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &runnerPod); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}
_, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName]
if !isRunnerPod {
return ctrl.Result{}, nil
}
var enterprise, org, repo string
envvars := runnerPod.Spec.Containers[0].Env
for _, e := range envvars {
switch e.Name {
case EnvVarEnterprise:
enterprise = e.Value
case EnvVarOrg:
org = e.Value
case EnvVarRepo:
repo = e.Value
}
}
if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() {
finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
if added {
newRunner := runnerPod.DeepCopy()
newRunner.ObjectMeta.Finalizers = finalizers
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
} else {
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
if removed {
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
if !ok {
log.V(1).Info("Runner no longer exists on GitHub")
}
newRunner := runnerPod.DeepCopy()
newRunner.ObjectMeta.Finalizers = finalizers
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner for finalizer removal")
return ctrl.Result{}, err
}
log.Info("Removed runner from GitHub", "repository", repo, "organization", org)
}
deletionTimeout := 1 * time.Minute
currentTime := time.Now()
deletionDidTimeout := currentTime.Sub(runnerPod.DeletionTimestamp.Add(deletionTimeout)) > 0
if deletionDidTimeout {
log.Info(
fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
"This is typically the case when a Kubernetes node became unreachable "+
"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
"podDeletionTimestamp", runnerPod.DeletionTimestamp,
"currentTime", currentTime,
"configuredDeletionTimeout", deletionTimeout,
)
var force int64 = 0
// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
if err := r.Delete(ctx, &runnerPod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
// probably
if !kerrors.IsNotFound(err) {
log.Error(err, "Failed to forcefully delete pod resource ...")
return ctrl.Result{}, err
}
// forceful deletion finally succeeded
return ctrl.Result{Requeue: true}, nil
}
r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", runnerPod.Name))
log.Info("Forcefully deleted runner pod", "repository", repo)
// give kube manager a little time to forcefully delete the stuck pod
return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
}
return ctrl.Result{}, nil
}
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := runnerPod.Status.Phase == corev1.PodSucceeded
if !stopped {
if runnerPod.Status.Phase == corev1.PodRunning {
for _, status := range runnerPod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
}
}
restart := stopped
var registrationRecheckDelay time.Duration
// all checks done below only decide whether a restart is needed
// if a restart was already decided before, there is no need for the checks
// saving API calls and scary log messages
if !restart {
registrationCheckInterval := time.Minute
if r.RegistrationRecheckInterval > 0 {
registrationCheckInterval = r.RegistrationRecheckInterval
}
lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
var lastCheckTime *time.Time
if lastCheckTimeStr != "" {
t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
if err != nil {
log.Error(err, "failed to parase last check time %q", lastCheckTimeStr)
return ctrl.Result{}, nil
}
lastCheckTime = &t
}
// We want to call ListRunners GitHub Actions API only once per runner per minute.
// This if block, in conjunction with:
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
// achieves that.
if lastCheckTime != nil {
nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
now := time.Now()
// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
// so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is
// shorter than 1s
requeueAfter := nextCheckTime.Sub(now) - time.Second
if requeueAfter > 0 {
log.Info(
fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
"lastRegistrationCheckTime", lastCheckTime,
"registrationCheckInterval", registrationCheckInterval,
)
// Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the
// next sync period passes, which can be too much later than nextCheckTime.
//
// We need to requeue on this reconcilation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on
// other requeues- so the initial scheduled requeue may have been reset due to requeue on
// spec/status change.
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
}
notFound := false
offline := false
_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
currentTime := time.Now()
if err != nil {
var notFoundException *github.RunnerNotFound
var offlineException *github.RunnerOffline
if errors.As(err, &notFoundException) {
notFound = true
} else if errors.As(err, &offlineException) {
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
}
registrationTimeout := 10 * time.Minute
durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
registrationDidTimeout := durationAfterRegistrationTimeout > 0
if notFound {
if registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"podCreationTimestamp", runnerPod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
} else {
log.V(1).Info(
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
"runnerName", runnerPod.Name,
)
}
} else if offline {
if registrationDidTimeout {
log.Info(
"Already existing GitHub runner still appears offline . "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. ",
"podCreationTimestamp", runnerPod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
} else {
log.V(1).Info(
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
"runnerName", runnerPod.Name,
)
}
}
if (notFound || offline) && !registrationDidTimeout {
registrationRecheckJitter := 10 * time.Second
if r.RegistrationRecheckJitter > 0 {
registrationRecheckJitter = r.RegistrationRecheckJitter
}
registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
}
}
// Don't do anything if there's no need to restart the runner
if !restart {
// This guard enables us to update runner.Status.Phase to `Running` only after
// the runner is registered to GitHub.
if registrationRecheckDelay > 0 {
log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
updated := runnerPod.DeepCopy()
t := time.Now().Format(time.RFC3339)
updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
return ctrl.Result{}, err
}
return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
}
// Seeing this message, you can expect the runner to become `Running` soon.
log.Info(
"Runner appears to have registered and running.",
"podCreationTimestamp", runnerPod.CreationTimestamp,
)
return ctrl.Result{}, nil
}
// Delete current pod if recreation is needed
if err := r.Delete(ctx, &runnerPod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
log.Info("Deleted runner pod", "name", runnerPod.Name)
return ctrl.Result{}, nil
}
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
var busy bool
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Thus removing the condition on status can block the runner pod from being terminated forever.
busy = runner.GetBusy()
if runner.GetStatus() != "offline" && busy {
r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy())
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Trying to remove the offline but busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
if !busy {
if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
}
return true, nil
}
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
name := "runnerpod-controller"
if r.Name != "" {
name = r.Name
}
r.Recorder = mgr.GetEventRecorderFor(name)
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.Pod{}).
Named(name).
Complete(r)
}

12
main.go
View File

@ -215,6 +215,18 @@ func main() {
CacheDuration: gitHubAPICacheDuration, CacheDuration: gitHubAPICacheDuration,
} }
runnerPodReconciler := &controllers.RunnerPodReconciler{
Client: mgr.GetClient(),
Log: log.WithName("runnerpod"),
Scheme: mgr.GetScheme(),
GitHubClient: ghClient,
}
if err = runnerPodReconciler.SetupWithManager(mgr); err != nil {
log.Error(err, "unable to create controller", "controller", "RunnerPod")
os.Exit(1)
}
if err = horizontalRunnerAutoscaler.SetupWithManager(mgr); err != nil { if err = horizontalRunnerAutoscaler.SetupWithManager(mgr); err != nil {
log.Error(err, "unable to create controller", "controller", "HorizontalRunnerAutoscaler") log.Error(err, "unable to create controller", "controller", "HorizontalRunnerAutoscaler")
os.Exit(1) os.Exit(1)