refactor: Extract definitions of various annotation keys and other defaults to their own source

commit 7f0e65cb73 (parent 12a04b7f38)
Author: Yusuke Kuoka
Date: 2022-03-02 02:31:35 +00:00
4 changed files with 58 additions and 57 deletions

(changed file 1 of 4) controllers/constants.go (new file, 40 lines)

@@ -0,0 +1,40 @@
+package controllers
+
+import "time"
+
+const (
+    LabelKeyRunnerSetName = "runnerset-name"
+)
+
+const (
+    // This name requires at least one slash to work.
+    // See https://github.com/google/knative-gcp/issues/378
+    runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
+
+    AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
+
+    // AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has completed.
+    AnnotationKeyUnregistrationCompleteTimestamp = "unregistration-complete-timestamp"
+
+    // AnnotationKeyUnregistrationStartTimestamp is the annotation that contains the time at which the requested unregistration process was started.
+    AnnotationKeyUnregistrationStartTimestamp = "unregistration-start-timestamp"
+
+    // AnnotationKeyUnregistrationRequestTimestamp is the annotation that contains the time at which the unregistration was requested.
+    // This doesn't immediately start the unregistration. Instead, ARC first checks whether the runner has already been registered.
+    // If not, ARC waits for the registration to complete, and only then starts the unregistration process.
+    // This is crucial to avoid a race in which ARC marks the runner pod for deletion while the actions runner is still registering itself to GitHub,
+    // which would leave the assigned job hanging forever.
+    AnnotationKeyUnregistrationRequestTimestamp = "unregistration-request-timestamp"
+
+    AnnotationKeyRunnerID = "runner-id"
+
+    // DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of the ListRunners API call (to detect the runner ID by name)
+    // and the RemoveRunner API call (to actually unregister the runner).
+    // This needs to be longer than 60 seconds because the ListRunners API appears to return a Cache-Control header of max-age=60,
+    // which instructs our cache library, httpcache, to cache responses for 60 seconds. As a result, ARC can be unable to see the runner
+    // in the ListRunners response for up to 60 seconds (or even longer, depending on the situation).
+    DefaultUnregistrationTimeout = 60 * time.Second
+
+    // DefaultUnregistrationRetryDelay can be any value, but a larger value can make the effective unregistration timeout longer than configured.
+    DefaultUnregistrationRetryDelay = 30 * time.Second
+)
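
The helpers getAnnotation, setAnnotation, and annotatePodOnce are used against these keys throughout the diff, but their bodies are not shown in any hunk. Below is a minimal sketch of what they might look like, inferred purely from their call sites here; the actual implementations live elsewhere in the controllers package and may differ.

package controllers

import (
    "context"

    "github.com/go-logr/logr"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// getAnnotation returns the value of the given annotation key, if present.
func getAnnotation(meta *metav1.ObjectMeta, key string) (string, bool) {
    if meta.Annotations == nil {
        return "", false
    }
    v, ok := meta.Annotations[key]
    return v, ok
}

// setAnnotation sets the given annotation key to value, allocating the map if needed.
func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
    if meta.Annotations == nil {
        meta.Annotations = map[string]string{}
    }
    meta.Annotations[key] = value
}

// annotatePodOnce sets the annotation only if it is not already present and
// patches the pod, so repeated reconciliations never overwrite the original timestamp.
func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, key, value string) (*corev1.Pod, error) {
    if _, ok := getAnnotation(&pod.ObjectMeta, key); ok {
        return pod, nil
    }
    updated := pod.DeepCopy()
    setAnnotation(&updated.ObjectMeta, key, value)
    if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
        return nil, err
    }
    log.V(2).Info("Annotated pod once", "key", key)
    return updated, nil
}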

(changed file 2 of 4)

@@ -16,33 +16,6 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/client"
 )
-
-const (
-    // unregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has completed.
-    unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
-
-    // unregistrationStartTimestamp is the annotation that contains the time at which the requested unregistration process was started.
-    unregistrationStartTimestamp = "unregistration-start-timestamp"
-
-    // unregistrationRequestTimestamp is the annotation that contains the time at which the unregistration was requested.
-    // This doesn't immediately start the unregistration. Instead, ARC first checks whether the runner has already been registered.
-    // If not, ARC waits for the registration to complete, and only then starts the unregistration process.
-    // This is crucial to avoid a race in which ARC marks the runner pod for deletion while the actions runner is still registering itself to GitHub,
-    // which would leave the assigned job hanging forever.
-    unregistrationRequestTimestamp = "unregistration-request-timestamp"
-
-    annotationKeyRunnerID = "runner-id"
-
-    // DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of the ListRunners API call (to detect the runner ID by name)
-    // and the RemoveRunner API call (to actually unregister the runner).
-    // This needs to be longer than 60 seconds because the ListRunners API appears to return a Cache-Control header of max-age=60,
-    // which instructs our cache library, httpcache, to cache responses for 60 seconds. As a result, ARC can be unable to see the runner
-    // in the ListRunners response for up to 60 seconds (or even longer, depending on the situation).
-    DefaultUnregistrationTimeout = 60 * time.Second
-
-    // DefaultUnregistrationRetryDelay can be any value, but a larger value can make the effective unregistration timeout longer than configured.
-    DefaultUnregistrationRetryDelay = 30 * time.Second
-)
 
 // tickRunnerGracefulStop reconciles the runner and the runner pod in a way that
 // we can delete the runner pod without disrupting a workflow job.
 //
@@ -54,7 +27,7 @@ const (
 // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, and it may or may not populate the error in the third return value.
 // The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
 func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
-    pod, err := annotatePodOnce(ctx, c, log, pod, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
+    pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
     if err != nil {
         return nil, &ctrl.Result{}, err
     }
@@ -63,7 +36,7 @@ func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Dura
         return nil, res, err
     }
 
-    pod, err = annotatePodOnce(ctx, c, log, pod, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
+    pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
     if err != nil {
         return nil, &ctrl.Result{}, err
     }
@@ -99,7 +72,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
 func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
     var runnerID *int64
 
-    if id, ok := getAnnotation(&pod.ObjectMeta, annotationKeyRunnerID); ok {
+    if id, ok := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID); ok {
         v, err := strconv.ParseInt(id, 10, 64)
         if err != nil {
             return &ctrl.Result{}, err
@@ -162,7 +135,7 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
         // In that case we can safely assume that the runner will never be registered.
         log.Info("Runner was not found on GitHub and the runner pod was not found on Kubernetes.")
-    } else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
+    } else if pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
         // If it was already unregistered in the previous reconciliation loop,
         // you can safely assume that it won't get registered again, so it's safe to delete the runner pod.
         log.Info("Runner pod is marked as already unregistered.")
@@ -174,7 +147,7 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
         // If the pod has ended up succeeded, we need to restart it.
         // This happens e.g. when dind is in the runner and the run completes.
         log.Info("Runner pod has been stopped with a successful status.")
-    } else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
+    } else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
         t, err := time.Parse(time.RFC3339, ts)
         if err != nil {
             return &ctrl.Result{RequeueAfter: retryDelay}, err
@@ -202,7 +175,7 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 }
 
 func ensureRunnerPodRegistered(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
-    _, hasRunnerID := getAnnotation(&pod.ObjectMeta, annotationKeyRunnerID)
+    _, hasRunnerID := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID)
     if runnerPodOrContainerIsStopped(pod) || hasRunnerID {
         return pod, nil, nil
     }
@@ -218,7 +191,7 @@ func ensureRunnerPodRegistered(ctx context.Context, log logr.Logger, ghClient *g
     id := *r.ID
 
-    updated, err := annotatePodOnce(ctx, c, log, pod, annotationKeyRunnerID, fmt.Sprintf("%d", id))
+    updated, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerID, fmt.Sprintf("%d", id))
     if err != nil {
         return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
     }
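
The branch above that reads AnnotationKeyUnregistrationStartTimestamp parses the RFC3339 timestamp and requeues on a parse error; the logic that compares the elapsed time against unregistrationTimeout is elided in this hunk. A hypothetical helper expressing that check, under the assumption that the reconciler simply compares time.Since against the timeout:

package controllers

import "time"

// unregistrationTimedOut reports whether an unregistration that started at the
// RFC3339 timestamp ts has run longer than the configured timeout. Illustrative
// only; the commit performs this check inline in the reconciler.
func unregistrationTimedOut(ts string, timeout time.Duration) (bool, error) {
    t, err := time.Parse(time.RFC3339, ts)
    if err != nil {
        return false, err
    }
    return time.Since(t) > timeout, nil
}

In practice this would be called with pod.Annotations[AnnotationKeyUnregistrationStartTimestamp] and DefaultUnregistrationTimeout, with retries scheduled every DefaultUnregistrationRetryDelay until the timeout is exceeded.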

(changed file 3 of 4)

@@ -52,14 +52,6 @@ type RunnerPodReconciler struct {
     UnregistrationRetryDelay time.Duration
 }
-
-const (
-    // This name requires at least one slash to work.
-    // See https://github.com/google/knative-gcp/issues/378
-    runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
-
-    AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
-)
 
 // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
 // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
@@ -173,7 +165,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
     runnerPod = *po
 
-    if _, unregistrationRequested := getAnnotation(&runnerPod.ObjectMeta, unregistrationRequestTimestamp); unregistrationRequested {
+    if _, unregistrationRequested := getAnnotation(&runnerPod.ObjectMeta, AnnotationKeyUnregistrationRequestTimestamp); unregistrationRequested {
         log.V(2).Info("Progressing unregistration because unregistration-request timestamp is set")
 
         // At this point we're sure that DeletionTimestamp is not set yet, but the unregistration process is triggered by an upstream controller like runnerset-controller.
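
The check above is how a runner pod learns that an upstream controller (such as the runnerset controller, changed later in this commit) has requested its unregistration before any DeletionTimestamp is set. A hypothetical one-line helper capturing that condition, for illustration only; the commit checks the annotation inline via getAnnotation:

package controllers

import corev1 "k8s.io/api/core/v1"

// unregistrationRequested reports whether an upstream controller has marked
// this runner pod for unregistration via the request-timestamp annotation.
func unregistrationRequested(pod *corev1.Pod) bool {
    _, ok := getAnnotation(&pod.ObjectMeta, AnnotationKeyUnregistrationRequestTimestamp)
    return ok
}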

(changed file 4 of 4)

@@ -39,10 +39,6 @@ import (
     "github.com/go-logr/logr"
 )
-
-const (
-    LabelKeyRunnerSetName = "runnerset-name"
-)
 
 // RunnerSetReconciler reconciles a Runner object
 type RunnerSetReconciler struct {
     Name string
@@ -161,7 +157,7 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
     }
 
     // Statefulset termination process 3/4: Set the deletionTimestamp to let Kubernetes start a cascade deletion of the statefulset and the pods.
-    if _, ok := getAnnotation(&res.statefulset.ObjectMeta, unregistrationCompleteTimestamp); ok {
+    if _, ok := getAnnotation(&res.statefulset.ObjectMeta, AnnotationKeyUnregistrationCompleteTimestamp); ok {
         if err := r.Client.Delete(ctx, res.statefulset); err != nil {
             log.Error(err, "Failed to delete statefulset")
             return ctrl.Result{}, err
@@ -171,10 +167,10 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
     // Statefulset termination process 2/4: Set unregistrationCompleteTimestamp only if all the pods managed by the statefulset
     // have either unregistered or are being deleted.
-    if _, ok := getAnnotation(&res.statefulset.ObjectMeta, unregistrationRequestTimestamp); ok {
+    if _, ok := getAnnotation(&res.statefulset.ObjectMeta, AnnotationKeyUnregistrationRequestTimestamp); ok {
         var deletionSafe int
         for _, po := range res.pods {
-            if _, ok := getAnnotation(&po.ObjectMeta, unregistrationCompleteTimestamp); ok {
+            if _, ok := getAnnotation(&po.ObjectMeta, AnnotationKeyUnregistrationCompleteTimestamp); ok {
                 deletionSafe++
             } else if !po.DeletionTimestamp.IsZero() {
                 deletionSafe++
@@ -184,12 +180,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
         log.V(2).Info("Marking statefulset for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)
 
         if deletionSafe == res.total {
-            if _, ok := getAnnotation(&res.statefulset.ObjectMeta, unregistrationCompleteTimestamp); !ok {
+            if _, ok := getAnnotation(&res.statefulset.ObjectMeta, AnnotationKeyUnregistrationCompleteTimestamp); !ok {
                 updated := res.statefulset.DeepCopy()
-                setAnnotation(&updated.ObjectMeta, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
+                setAnnotation(&updated.ObjectMeta, AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
 
                 if err := r.Client.Patch(ctx, updated, client.MergeFrom(res.statefulset)); err != nil {
-                    log.Error(err, fmt.Sprintf("Failed to patch statefulset to have %s annotation", unregistrationCompleteTimestamp))
+                    log.Error(err, fmt.Sprintf("Failed to patch statefulset to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp))
                     return ctrl.Result{}, err
                 }
@@ -320,17 +316,17 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
     // We'd better unregister first and then start a pod deletion process.
     // The annotation works as a marker to start our pod unregistration and deletion process.
     for _, po := range ss.pods {
-        if _, err := annotatePodOnce(ctx, r.Client, log, &po, unregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil {
+        if _, err := annotatePodOnce(ctx, r.Client, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil {
             return ctrl.Result{}, err
         }
     }
 
-    if _, ok := getAnnotation(&ss.statefulset.ObjectMeta, unregistrationRequestTimestamp); !ok {
+    if _, ok := getAnnotation(&ss.statefulset.ObjectMeta, AnnotationKeyUnregistrationRequestTimestamp); !ok {
         updated := ss.statefulset.DeepCopy()
-        setAnnotation(&updated.ObjectMeta, unregistrationRequestTimestamp, time.Now().Format(time.RFC3339))
+        setAnnotation(&updated.ObjectMeta, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339))
 
         if err := r.Client.Patch(ctx, updated, client.MergeFrom(ss.statefulset)); err != nil {
-            log.Error(err, fmt.Sprintf("Failed to patch statefulset to have %s annotation", unregistrationRequestTimestamp))
+            log.Error(err, fmt.Sprintf("Failed to patch statefulset to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp))
             return ctrl.Result{}, err
         }
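
Taken together, the runnerset hunks implement a staged statefulset termination protocol driven entirely by the annotation keys extracted in this commit: mark every pod (and the statefulset) with the request timestamp, wait until every pod is safe to delete, mark the statefulset complete, then delete it. A condensed, hypothetical view of the "all pods are deletion-safe" condition from phase 2/4 above; the commit computes this inline against res.total, which may be tracked separately from len(res.pods):

package controllers

import corev1 "k8s.io/api/core/v1"

// allPodsDeletionSafe reports whether every pod has either finished
// unregistering (complete-timestamp annotation set) or is already being
// deleted, mirroring the deletionSafe counting loop in phase 2/4.
func allPodsDeletionSafe(pods []corev1.Pod) bool {
    deletionSafe := 0
    for _, po := range pods {
        if _, ok := getAnnotation(&po.ObjectMeta, AnnotationKeyUnregistrationCompleteTimestamp); ok {
            deletionSafe++
        } else if !po.DeletionTimestamp.IsZero() {
            deletionSafe++
        }
    }
    return deletionSafe == len(pods)
}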