/*
Copyright 2020 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package actionsgithubcom

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"time"

	"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
	"github.com/actions/actions-runner-controller/github/actions"
	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)
const (
	// ephemeralRunnerFinalizerName guards cleanup of the Kubernetes resources
	// owned by the EphemeralRunner (the runner pod and the jitconfig secret)
	// before the object is removed.
	ephemeralRunnerFinalizerName = "ephemeralrunner.actions.github.com/finalizer"
	// ephemeralRunnerActionsFinalizerName guards removal of the runner
	// registration from the Actions service before the object is removed.
	ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer"
)

// EphemeralRunnerReconciler reconciles a EphemeralRunner object
type EphemeralRunnerReconciler struct {
	client.Client
	// Log is the base logger for this reconciler; Reconcile adds the
	// object's namespaced name to it.
	Log logr.Logger
	// Scheme is used when setting controller/owner references on the
	// resources this reconciler creates (pod, jitconfig secret).
	Scheme *runtime.Scheme
	// ActionsClient yields Actions service clients from a GitHub config
	// secret (see actionsClientFor).
	ActionsClient actions.MultiClient
	ResourceBuilder
}

// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=create;get;list;watch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
|
|
// move the current state of the cluster closer to the desired state.
|
|
//
|
|
// For more details, check Reconcile and its Result here:
|
|
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.6.4/pkg/reconcile
|
|
func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
|
|
log := r.Log.WithValues("ephemeralrunner", req.NamespacedName)
|
|
|
|
ephemeralRunner := new(v1alpha1.EphemeralRunner)
|
|
if err := r.Get(ctx, req.NamespacedName, ephemeralRunner); err != nil {
|
|
return ctrl.Result{}, client.IgnoreNotFound(err)
|
|
}
|
|
|
|
if !ephemeralRunner.DeletionTimestamp.IsZero() {
|
|
if !controllerutil.ContainsFinalizer(ephemeralRunner, ephemeralRunnerFinalizerName) {
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
if controllerutil.ContainsFinalizer(ephemeralRunner, ephemeralRunnerActionsFinalizerName) {
|
|
log.Info("Trying to clean up runner from the service")
|
|
ok, err := r.cleanupRunnerFromService(ctx, ephemeralRunner, log)
|
|
if err != nil {
|
|
log.Error(err, "Failed to clean up runner from service")
|
|
return ctrl.Result{}, err
|
|
}
|
|
if !ok {
|
|
log.Info("Runner is not finished yet, retrying in 30s")
|
|
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
|
|
}
|
|
|
|
log.Info("Runner is cleaned up from the service, removing finalizer")
|
|
if err := patch(ctx, r.Client, ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
controllerutil.RemoveFinalizer(obj, ephemeralRunnerActionsFinalizerName)
|
|
}); err != nil {
|
|
return ctrl.Result{}, err
|
|
}
|
|
log.Info("Removed finalizer from ephemeral runner")
|
|
}
|
|
|
|
log.Info("Finalizing ephemeral runner")
|
|
err := r.cleanupResources(ctx, ephemeralRunner, log)
|
|
if err != nil {
|
|
log.Error(err, "Failed to clean up ephemeral runner owned resources")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
if ephemeralRunner.HasContainerHookConfigured() {
|
|
log.Info("Runner has container hook configured, cleaning up container hook resources")
|
|
err = r.cleanupContainerHooksResources(ctx, ephemeralRunner, log)
|
|
if err != nil {
|
|
log.Error(err, "Failed to clean up container hooks resources")
|
|
return ctrl.Result{}, err
|
|
}
|
|
}
|
|
|
|
log.Info("Removing finalizer")
|
|
err = patch(ctx, r.Client, ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
controllerutil.RemoveFinalizer(obj, ephemeralRunnerFinalizerName)
|
|
})
|
|
if err != nil && !kerrors.IsNotFound(err) {
|
|
log.Error(err, "Failed to update ephemeral runner without the finalizer")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
log.Info("Successfully removed finalizer after cleanup")
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
if ephemeralRunner.IsDone() {
|
|
log.Info("Cleaning up resources after after ephemeral runner termination", "phase", ephemeralRunner.Status.Phase)
|
|
err := r.cleanupResources(ctx, ephemeralRunner, log)
|
|
if err != nil {
|
|
log.Error(err, "Failed to clean up ephemeral runner owned resources")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
// Stop reconciling on this object.
|
|
// The EphemeralRunnerSet is responsible for cleaning it up.
|
|
log.Info("EphemeralRunner has already finished. Stopping reconciliation and waiting for EphemeralRunnerSet to clean it up", "phase", ephemeralRunner.Status.Phase)
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
if !controllerutil.ContainsFinalizer(ephemeralRunner, ephemeralRunnerFinalizerName) {
|
|
log.Info("Adding finalizer")
|
|
if err := patch(ctx, r.Client, ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
controllerutil.AddFinalizer(obj, ephemeralRunnerFinalizerName)
|
|
}); err != nil {
|
|
log.Error(err, "Failed to update with finalizer set")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
log.Info("Successfully added finalizer")
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
if !controllerutil.ContainsFinalizer(ephemeralRunner, ephemeralRunnerActionsFinalizerName) {
|
|
log.Info("Adding runner registration finalizer")
|
|
err := patch(ctx, r.Client, ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
controllerutil.AddFinalizer(obj, ephemeralRunnerActionsFinalizerName)
|
|
})
|
|
if err != nil {
|
|
log.Error(err, "Failed to update with runner registration finalizer set")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
log.Info("Successfully added runner registration finalizer")
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
if ephemeralRunner.Status.RunnerId == 0 {
|
|
log.Info("Creating new ephemeral runner registration and updating status with runner config")
|
|
if r, err := r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log); r != nil {
|
|
return *r, err
|
|
}
|
|
}
|
|
|
|
secret := new(corev1.Secret)
|
|
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
|
|
if !kerrors.IsNotFound(err) {
|
|
log.Error(err, "Failed to fetch secret")
|
|
return ctrl.Result{}, err
|
|
}
|
|
// create secret if not created
|
|
log.Info("Creating new ephemeral runner secret for jitconfig.")
|
|
if r, err := r.createSecret(ctx, ephemeralRunner, log); r != nil {
|
|
return *r, err
|
|
}
|
|
|
|
// Retry to get the secret that was just created.
|
|
// Otherwise, even though we want to continue to create the pod,
|
|
// it fails due to the missing secret resulting in an invalid pod spec.
|
|
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
|
|
log.Error(err, "Failed to fetch secret")
|
|
return ctrl.Result{}, err
|
|
}
|
|
}
|
|
|
|
pod := new(corev1.Pod)
|
|
if err := r.Get(ctx, req.NamespacedName, pod); err != nil {
|
|
switch {
|
|
case !kerrors.IsNotFound(err):
|
|
log.Error(err, "Failed to fetch the pod")
|
|
return ctrl.Result{}, err
|
|
|
|
case len(ephemeralRunner.Status.Failures) > 5:
|
|
log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed")
|
|
errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message)
|
|
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil {
|
|
log.Error(err, "Failed to set ephemeral runner to phase Failed")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
|
|
default:
|
|
// Pod was not found. Create if the pod has never been created
|
|
log.Info("Creating new EphemeralRunner pod.")
|
|
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
|
|
switch {
|
|
case err == nil:
|
|
return result, nil
|
|
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
|
|
log.Error(err, "Failed to create a pod due to unrecoverable failure")
|
|
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
|
|
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
|
|
log.Error(err, "Failed to set ephemeral runner to phase Failed")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
default:
|
|
log.Error(err, "Failed to create the pod")
|
|
return ctrl.Result{}, err
|
|
}
|
|
}
|
|
}
|
|
|
|
cs := runnerContainerStatus(pod)
|
|
switch {
|
|
case cs == nil:
|
|
// starting, no container state yet
|
|
log.Info("Waiting for runner container status to be available")
|
|
return ctrl.Result{}, nil
|
|
case cs.State.Terminated == nil: // still running or evicted
|
|
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" {
|
|
log.Info("Pod set the termination phase, but container state is not terminated. Deleting pod",
|
|
"PodPhase", pod.Status.Phase,
|
|
"PodReason", pod.Status.Reason,
|
|
"PodMessage", pod.Status.Message,
|
|
)
|
|
|
|
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
|
log.Error(err, "failed to delete pod as failed on pod.Status.Phase: Failed")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
log.Info("Ephemeral runner container is still running")
|
|
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
|
|
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
|
|
case cs.State.Terminated.ExitCode != 0: // failed
|
|
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
|
|
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
|
log.Error(err, "Failed to delete runner pod on failure")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
|
|
default:
|
|
// pod succeeded. We double-check with the service if the runner exists.
|
|
// The reason is that image can potentially finish with status 0, but not pick up the job.
|
|
existsInService, err := r.runnerRegisteredWithService(ctx, ephemeralRunner.DeepCopy(), log)
|
|
if err != nil {
|
|
log.Error(err, "Failed to check if runner is registered with the service")
|
|
return ctrl.Result{}, err
|
|
}
|
|
if !existsInService {
|
|
// the runner does not exist in the service, so it must be done
|
|
log.Info("Ephemeral runner has finished since it does not exist in the service anymore")
|
|
if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil {
|
|
log.Error(err, "Failed to mark ephemeral runner as finished")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
// The runner still exists. This can happen if the pod exited with 0 but fails to start
|
|
log.Info("Ephemeral runner pod has finished, but the runner still exists in the service. Deleting the pod to restart it.")
|
|
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
|
log.Error(err, "failed to delete a pod that still exists in the service")
|
|
return ctrl.Result{}, err
|
|
}
|
|
return ctrl.Result{}, nil
|
|
}
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) cleanupRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ok bool, err error) {
|
|
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
|
actionsError := &actions.ActionsError{}
|
|
if !errors.As(err, &actionsError) {
|
|
return false, err
|
|
}
|
|
|
|
if actionsError.StatusCode == http.StatusBadRequest && actionsError.IsException("JobStillRunningException") {
|
|
return false, nil
|
|
}
|
|
|
|
return false, err
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) cleanupResources(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
log.Info("Cleaning up the runner pod")
|
|
pod := new(corev1.Pod)
|
|
err := r.Get(ctx, types.NamespacedName{Namespace: ephemeralRunner.Namespace, Name: ephemeralRunner.Name}, pod)
|
|
switch {
|
|
case err == nil:
|
|
if pod.DeletionTimestamp.IsZero() {
|
|
log.Info("Deleting the runner pod")
|
|
if err := r.Delete(ctx, pod); err != nil && !kerrors.IsNotFound(err) {
|
|
return fmt.Errorf("failed to delete pod: %w", err)
|
|
}
|
|
log.Info("Deleted the runner pod")
|
|
} else {
|
|
log.Info("Pod contains deletion timestamp")
|
|
}
|
|
case kerrors.IsNotFound(err):
|
|
log.Info("Runner pod is deleted")
|
|
default:
|
|
return err
|
|
}
|
|
|
|
log.Info("Cleaning up the runner jitconfig secret")
|
|
secret := new(corev1.Secret)
|
|
err = r.Get(ctx, types.NamespacedName{Namespace: ephemeralRunner.Namespace, Name: ephemeralRunner.Name}, secret)
|
|
switch {
|
|
case err == nil:
|
|
if secret.DeletionTimestamp.IsZero() {
|
|
log.Info("Deleting the jitconfig secret")
|
|
if err := r.Delete(ctx, secret); err != nil && !kerrors.IsNotFound(err) {
|
|
return fmt.Errorf("failed to delete secret: %w", err)
|
|
}
|
|
log.Info("Deleted jitconfig secret")
|
|
} else {
|
|
log.Info("Secret contains deletion timestamp")
|
|
}
|
|
case kerrors.IsNotFound(err):
|
|
log.Info("Runner jitconfig secret is deleted")
|
|
default:
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) cleanupContainerHooksResources(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
log.Info("Cleaning up runner linked pods")
|
|
var errs []error
|
|
if err := r.cleanupRunnerLinkedPods(ctx, ephemeralRunner, log); err != nil {
|
|
errs = append(errs, err)
|
|
}
|
|
|
|
log.Info("Cleaning up runner linked secrets")
|
|
if err := r.cleanupRunnerLinkedSecrets(ctx, ephemeralRunner, log); err != nil {
|
|
errs = append(errs, err)
|
|
}
|
|
|
|
return errors.Join(errs...)
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedPods(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
runnerLinedLabels := client.MatchingLabels(
|
|
map[string]string{
|
|
"runner-pod": ephemeralRunner.Name,
|
|
},
|
|
)
|
|
var runnerLinkedPodList corev1.PodList
|
|
if err := r.List(ctx, &runnerLinkedPodList, client.InNamespace(ephemeralRunner.Namespace), runnerLinedLabels); err != nil {
|
|
return fmt.Errorf("failed to list runner-linked pods: %w", err)
|
|
}
|
|
|
|
if len(runnerLinkedPodList.Items) == 0 {
|
|
log.Info("Runner-linked pods are deleted")
|
|
return nil
|
|
}
|
|
|
|
log.Info("Deleting container hooks runner-linked pods", "count", len(runnerLinkedPodList.Items))
|
|
|
|
var errs []error
|
|
for i := range runnerLinkedPodList.Items {
|
|
linkedPod := &runnerLinkedPodList.Items[i]
|
|
if !linkedPod.DeletionTimestamp.IsZero() {
|
|
continue
|
|
}
|
|
|
|
log.Info("Deleting container hooks runner-linked pod", "name", linkedPod.Name)
|
|
if err := r.Delete(ctx, linkedPod); err != nil && !kerrors.IsNotFound(err) {
|
|
errs = append(errs, fmt.Errorf("failed to delete runner linked pod %q: %w", linkedPod.Name, err))
|
|
}
|
|
}
|
|
|
|
return errors.Join(errs...)
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
runnerLinkedLabels := client.MatchingLabels(
|
|
map[string]string{
|
|
"runner-pod": ephemeralRunner.Name,
|
|
},
|
|
)
|
|
var runnerLinkedSecretList corev1.SecretList
|
|
if err := r.List(ctx, &runnerLinkedSecretList, client.InNamespace(ephemeralRunner.Namespace), runnerLinkedLabels); err != nil {
|
|
return fmt.Errorf("failed to list runner-linked secrets: %w", err)
|
|
}
|
|
|
|
if len(runnerLinkedSecretList.Items) == 0 {
|
|
log.Info("Runner-linked secrets are deleted")
|
|
return nil
|
|
}
|
|
|
|
log.Info("Deleting container hooks runner-linked secrets", "count", len(runnerLinkedSecretList.Items))
|
|
|
|
var errs []error
|
|
for i := range runnerLinkedSecretList.Items {
|
|
s := &runnerLinkedSecretList.Items[i]
|
|
if !s.DeletionTimestamp.IsZero() {
|
|
continue
|
|
}
|
|
|
|
log.Info("Deleting container hooks runner-linked secret", "name", s.Name)
|
|
if err := r.Delete(ctx, s); err != nil && !kerrors.IsNotFound(err) {
|
|
errs = append(errs, fmt.Errorf("failed to delete runner linked secret %q: %w", s.Name, err))
|
|
}
|
|
}
|
|
|
|
return errors.Join(errs...)
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, errMessage string, reason string, log logr.Logger) error {
|
|
log.Info("Updating ephemeral runner status to Failed")
|
|
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
obj.Status.Phase = corev1.PodFailed
|
|
obj.Status.Reason = reason
|
|
obj.Status.Message = errMessage
|
|
}); err != nil {
|
|
return fmt.Errorf("failed to update ephemeral runner status Phase/Message: %w", err)
|
|
}
|
|
|
|
log.Info("Removing the runner from the service")
|
|
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
|
return fmt.Errorf("failed to remove the runner from service: %w", err)
|
|
}
|
|
|
|
log.Info("EphemeralRunner is marked as Failed and deleted from the service")
|
|
return nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
log.Info("Updating ephemeral runner status to Finished")
|
|
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
obj.Status.Phase = corev1.PodSucceeded
|
|
}); err != nil {
|
|
return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err)
|
|
}
|
|
|
|
log.Info("EphemeralRunner status is marked as Finished")
|
|
return nil
|
|
}
|
|
|
|
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
|
|
// It should not be responsible for setting the status to Failed.
|
|
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
|
|
if pod.DeletionTimestamp.IsZero() {
|
|
log.Info("Deleting the ephemeral runner pod", "podId", pod.UID)
|
|
if err := r.Delete(ctx, pod); err != nil && !kerrors.IsNotFound(err) {
|
|
return fmt.Errorf("failed to delete pod with status failed: %w", err)
|
|
}
|
|
}
|
|
|
|
log.Info("Updating ephemeral runner status to track the failure count")
|
|
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
if obj.Status.Failures == nil {
|
|
obj.Status.Failures = make(map[string]bool)
|
|
}
|
|
obj.Status.Failures[string(pod.UID)] = true
|
|
obj.Status.Ready = false
|
|
obj.Status.Reason = pod.Status.Reason
|
|
obj.Status.Message = pod.Status.Message
|
|
}); err != nil {
|
|
return fmt.Errorf("failed to update ephemeral runner status: failed attempts: %w", err)
|
|
}
|
|
|
|
log.Info("EphemeralRunner pod is deleted and status is updated with failure count")
|
|
return nil
|
|
}
|
|
|
|
// updateStatusWithRunnerConfig fetches runtime configuration needed by the runner
// This method should always set .status.runnerId and .status.runnerJITConfig
//
// A non-nil *ctrl.Result tells the caller to stop this reconcile iteration
// (either an error or a requeue); nil, nil means the status was updated and
// the caller may continue in the same loop.
func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
	// Runner is not registered with the service. We need to register it first
	log.Info("Creating ephemeral runner JIT config")
	actionsClient, err := r.actionsClientFor(ctx, ephemeralRunner)
	if err != nil {
		return &ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %w", err)
	}

	jitSettings := &actions.RunnerScaleSetJitRunnerSetting{
		Name: ephemeralRunner.Name,
	}

	// Propagate the runner container's WorkingDir (when set) as the runner's
	// work folder in the JIT settings.
	for i := range ephemeralRunner.Spec.Spec.Containers {
		if ephemeralRunner.Spec.Spec.Containers[i].Name == v1alpha1.EphemeralRunnerContainerName &&
			ephemeralRunner.Spec.Spec.Containers[i].WorkingDir != "" {
			jitSettings.WorkFolder = ephemeralRunner.Spec.Spec.Containers[i].WorkingDir
		}
	}

	jitConfig, err := actionsClient.GenerateJitRunnerConfig(ctx, jitSettings, ephemeralRunner.Spec.RunnerScaleSetId)
	if err != nil {
		actionsError := &actions.ActionsError{}
		if !errors.As(err, &actionsError) {
			return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %w", err)
		}

		// Only a 409 Conflict with AgentExistsException is recoverable below;
		// every other service error is surfaced to the caller.
		if actionsError.StatusCode != http.StatusConflict ||
			!actionsError.IsException("AgentExistsException") {
			return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %w", err)
		}

		// If the runner with the name we want already exists it means:
		// - We might have a name collision.
		// - Our previous reconciliation loop failed to update the
		//   status with the runnerId and runnerJITConfig after the
		//   `GenerateJitRunnerConfig` created the runner registration on the
		//   service.
		// We will try to get the runner and see if it belongs to this
		// AutoScalingRunnerSet; if so, we can simply delete the runner
		// registration and create a new one.
		log.Info("Getting runner jit config failed with conflict error, trying to get the runner by name", "runnerName", ephemeralRunner.Name)
		existingRunner, err := actionsClient.GetRunnerByName(ctx, ephemeralRunner.Name)
		if err != nil {
			return &ctrl.Result{}, fmt.Errorf("failed to get runner by name: %w", err)
		}

		if existingRunner == nil {
			// The conflicting registration disappeared between calls; retry.
			log.Info("Runner with the same name does not exist, re-queuing the reconciliation")
			return &ctrl.Result{Requeue: true}, nil
		}

		log.Info("Found the runner with the same name", "runnerId", existingRunner.Id, "runnerScaleSetId", existingRunner.RunnerScaleSetId)
		if existingRunner.RunnerScaleSetId == ephemeralRunner.Spec.RunnerScaleSetId {
			// Same scale set: this is our stale registration — remove it and
			// retry the JIT config generation on the next loop.
			log.Info("Removing the runner with the same name")
			err := actionsClient.RemoveRunner(ctx, int64(existingRunner.Id))
			if err != nil {
				return &ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %w", err)
			}

			log.Info("Removed the runner with the same name, re-queuing the reconciliation")
			return &ctrl.Result{Requeue: true}, nil
		}

		// TODO: Do we want to mark the ephemeral runner as failed, and let EphemeralRunnerSet to clean it up, so we can recover from this situation?
		// The situation is that the EphemeralRunner's name is already used by something else to register a runner, and we can't take the control back.
		return &ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %w", err)
	}
	log.Info("Created ephemeral runner JIT config", "runnerId", jitConfig.Runner.Id)

	log.Info("Updating ephemeral runner status with runnerId and runnerJITConfig")
	err = patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
		obj.Status.RunnerId = jitConfig.Runner.Id
		obj.Status.RunnerName = jitConfig.Runner.Name
		obj.Status.RunnerJITConfig = jitConfig.EncodedJITConfig
	})
	if err != nil {
		return &ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %w", err)
	}

	// We want to continue without a requeue for faster pod creation.
	//
	// To do so, we update the status in-place, so that both continuing the
	// loop, and requeuing and skipping updateStatusWithRunnerConfig in the
	// next loop, will have the same effect.
	ephemeralRunner.Status.RunnerId = jitConfig.Runner.Id
	ephemeralRunner.Status.RunnerName = jitConfig.Runner.Name
	ephemeralRunner.Status.RunnerJITConfig = jitConfig.EncodedJITConfig

	log.Info("Updated ephemeral runner status with runnerId and runnerJITConfig")
	return nil, nil
}

func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alpha1.EphemeralRunner, secret *corev1.Secret, log logr.Logger) (ctrl.Result, error) {
|
|
var envs []corev1.EnvVar
|
|
if runner.Spec.ProxySecretRef != "" {
|
|
http := corev1.EnvVar{
|
|
Name: "http_proxy",
|
|
ValueFrom: &corev1.EnvVarSource{
|
|
SecretKeyRef: &corev1.SecretKeySelector{
|
|
LocalObjectReference: corev1.LocalObjectReference{
|
|
Name: runner.Spec.ProxySecretRef,
|
|
},
|
|
Key: "http_proxy",
|
|
},
|
|
},
|
|
}
|
|
if runner.Spec.Proxy.HTTP != nil {
|
|
envs = append(envs, http)
|
|
}
|
|
|
|
https := corev1.EnvVar{
|
|
Name: "https_proxy",
|
|
ValueFrom: &corev1.EnvVarSource{
|
|
SecretKeyRef: &corev1.SecretKeySelector{
|
|
LocalObjectReference: corev1.LocalObjectReference{
|
|
Name: runner.Spec.ProxySecretRef,
|
|
},
|
|
Key: "https_proxy",
|
|
},
|
|
},
|
|
}
|
|
if runner.Spec.Proxy.HTTPS != nil {
|
|
envs = append(envs, https)
|
|
}
|
|
|
|
noProxy := corev1.EnvVar{
|
|
Name: "no_proxy",
|
|
ValueFrom: &corev1.EnvVarSource{
|
|
SecretKeyRef: &corev1.SecretKeySelector{
|
|
LocalObjectReference: corev1.LocalObjectReference{
|
|
Name: runner.Spec.ProxySecretRef,
|
|
},
|
|
Key: "no_proxy",
|
|
},
|
|
},
|
|
}
|
|
if len(runner.Spec.Proxy.NoProxy) > 0 {
|
|
envs = append(envs, noProxy)
|
|
}
|
|
}
|
|
|
|
log.Info("Creating new pod for ephemeral runner")
|
|
newPod := r.newEphemeralRunnerPod(ctx, runner, secret, envs...)
|
|
|
|
if err := ctrl.SetControllerReference(runner, newPod, r.Scheme); err != nil {
|
|
log.Error(err, "Failed to set controller reference to a new pod")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
log.Info("Created new pod spec for ephemeral runner")
|
|
if err := r.Create(ctx, newPod); err != nil {
|
|
log.Error(err, "Failed to create pod resource for ephemeral runner.")
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
log.Info("Created ephemeral runner pod",
|
|
"runnerScaleSetId", runner.Spec.RunnerScaleSetId,
|
|
"runnerName", runner.Status.RunnerName,
|
|
"runnerId", runner.Status.RunnerId,
|
|
"configUrl", runner.Spec.GitHubConfigUrl,
|
|
"podName", newPod.Name)
|
|
|
|
return ctrl.Result{}, nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
|
|
log.Info("Creating new secret for ephemeral runner")
|
|
jitSecret := r.newEphemeralRunnerJitSecret(runner)
|
|
|
|
if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil {
|
|
return &ctrl.Result{}, fmt.Errorf("failed to set controller reference: %w", err)
|
|
}
|
|
|
|
log.Info("Created new secret spec for ephemeral runner")
|
|
if err := r.Create(ctx, jitSecret); err != nil {
|
|
return &ctrl.Result{}, fmt.Errorf("failed to create jit secret: %w", err)
|
|
}
|
|
|
|
log.Info("Created ephemeral runner secret", "secretName", jitSecret.Name)
|
|
return nil, nil
|
|
}
|
|
|
|
// updateRunStatusFromPod is responsible for updating non-exiting statuses.
|
|
// It should never update phase to Failed or Succeeded
|
|
//
|
|
// The event should not be re-queued since the termination status should be set
|
|
// before proceeding with reconciliation logic
|
|
func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
|
|
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
|
|
return nil
|
|
}
|
|
|
|
var ready bool
|
|
var lastTransitionTime time.Time
|
|
for _, condition := range pod.Status.Conditions {
|
|
if condition.Type == corev1.PodReady && condition.LastTransitionTime.After(lastTransitionTime) {
|
|
ready = condition.Status == corev1.ConditionTrue
|
|
lastTransitionTime = condition.LastTransitionTime.Time
|
|
}
|
|
}
|
|
|
|
phaseChanged := ephemeralRunner.Status.Phase != pod.Status.Phase
|
|
readyChanged := ready != ephemeralRunner.Status.Ready
|
|
|
|
if !phaseChanged && !readyChanged {
|
|
return nil
|
|
}
|
|
|
|
log.Info(
|
|
"Updating ephemeral runner status",
|
|
"statusPhase", pod.Status.Phase,
|
|
"statusReason", pod.Status.Reason,
|
|
"statusMessage", pod.Status.Message,
|
|
"ready", ready,
|
|
)
|
|
err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
|
obj.Status.Phase = pod.Status.Phase
|
|
obj.Status.Ready = ready
|
|
obj.Status.Reason = pod.Status.Reason
|
|
obj.Status.Message = pod.Status.Message
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to update runner status for Phase/Reason/Message/Ready: %w", err)
|
|
}
|
|
|
|
log.Info("Updated ephemeral runner status")
|
|
return nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) actionsClientFor(ctx context.Context, runner *v1alpha1.EphemeralRunner) (actions.ActionsService, error) {
|
|
secret := new(corev1.Secret)
|
|
if err := r.Get(ctx, types.NamespacedName{Namespace: runner.Namespace, Name: runner.Spec.GitHubConfigSecret}, secret); err != nil {
|
|
return nil, fmt.Errorf("failed to get secret: %w", err)
|
|
}
|
|
|
|
opts, err := r.actionsClientOptionsFor(ctx, runner)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get actions client options: %w", err)
|
|
}
|
|
|
|
return r.ActionsClient.GetClientFromSecret(
|
|
ctx,
|
|
runner.Spec.GitHubConfigUrl,
|
|
runner.Namespace,
|
|
secret.Data,
|
|
opts...,
|
|
)
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) actionsClientOptionsFor(ctx context.Context, runner *v1alpha1.EphemeralRunner) ([]actions.ClientOption, error) {
|
|
var opts []actions.ClientOption
|
|
if runner.Spec.Proxy != nil {
|
|
proxyFunc, err := runner.Spec.Proxy.ProxyFunc(func(s string) (*corev1.Secret, error) {
|
|
var secret corev1.Secret
|
|
err := r.Get(ctx, types.NamespacedName{Namespace: runner.Namespace, Name: s}, &secret)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get proxy secret %s: %w", s, err)
|
|
}
|
|
|
|
return &secret, nil
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get proxy func: %w", err)
|
|
}
|
|
|
|
opts = append(opts, actions.WithProxy(proxyFunc))
|
|
}
|
|
|
|
tlsConfig := runner.Spec.GitHubServerTLS
|
|
if tlsConfig != nil {
|
|
pool, err := tlsConfig.ToCertPool(func(name, key string) ([]byte, error) {
|
|
var configmap corev1.ConfigMap
|
|
err := r.Get(
|
|
ctx,
|
|
types.NamespacedName{
|
|
Namespace: runner.Namespace,
|
|
Name: name,
|
|
},
|
|
&configmap,
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get configmap %s: %w", name, err)
|
|
}
|
|
|
|
return []byte(configmap.Data[key]), nil
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get tls config: %w", err)
|
|
}
|
|
|
|
opts = append(opts, actions.WithRootCAs(pool))
|
|
}
|
|
|
|
return opts, nil
|
|
}
|
|
|
|
// runnerRegisteredWithService checks if the runner is still registered with the service
|
|
// Returns found=false and err=nil if ephemeral runner does not exist in GitHub service and should be deleted
|
|
func (r EphemeralRunnerReconciler) runnerRegisteredWithService(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (found bool, err error) {
|
|
actionsClient, err := r.actionsClientFor(ctx, runner)
|
|
if err != nil {
|
|
return false, fmt.Errorf("failed to get Actions client for ScaleSet: %w", err)
|
|
}
|
|
|
|
log.Info("Checking if runner exists in GitHub service", "runnerId", runner.Status.RunnerId)
|
|
_, err = actionsClient.GetRunner(ctx, int64(runner.Status.RunnerId))
|
|
if err != nil {
|
|
actionsError := &actions.ActionsError{}
|
|
if !errors.As(err, &actionsError) {
|
|
return false, err
|
|
}
|
|
|
|
if actionsError.StatusCode != http.StatusNotFound ||
|
|
!actionsError.IsException("AgentNotFoundException") {
|
|
return false, fmt.Errorf("failed to check if runner exists in GitHub service: %w", err)
|
|
}
|
|
|
|
log.Info("Runner does not exist in GitHub service", "runnerId", runner.Status.RunnerId)
|
|
return false, nil
|
|
}
|
|
|
|
log.Info("Runner exists in GitHub service", "runnerId", runner.Status.RunnerId)
|
|
return true, nil
|
|
}
|
|
|
|
func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
|
client, err := r.actionsClientFor(ctx, ephemeralRunner)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get actions client for runner: %w", err)
|
|
}
|
|
|
|
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
|
|
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
|
|
if err != nil {
|
|
return fmt.Errorf("failed to remove runner from the service: %w", err)
|
|
}
|
|
|
|
log.Info("Removed runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
|
|
return nil
|
|
}
|
|
|
|
// SetupWithManager sets up the controller with the Manager.
|
|
func (r *EphemeralRunnerReconciler) SetupWithManager(mgr ctrl.Manager, opts ...Option) error {
|
|
return builderWithOptions(
|
|
ctrl.NewControllerManagedBy(mgr).
|
|
For(&v1alpha1.EphemeralRunner{}).
|
|
Owns(&corev1.Pod{}).
|
|
WithEventFilter(predicate.ResourceVersionChangedPredicate{}),
|
|
opts,
|
|
).Complete(r)
|
|
}
|
|
|
|
func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
|
|
for i := range pod.Status.ContainerStatuses {
|
|
cs := &pod.Status.ContainerStatuses[i]
|
|
if cs.Name == v1alpha1.EphemeralRunnerContainerName {
|
|
return cs
|
|
}
|
|
}
|
|
return nil
|
|
}
|