Merge branch 'master' into improve-logs

Felipe Galindo Sanchez authored on 2022-02-28 09:25:30 -08:00, committed by GitHub
commit eff0c7364f
36 changed files with 725 additions and 211 deletions

View File

@@ -15,7 +15,7 @@ on:
 - '!**.md'
 env:
-RUNNER_VERSION: 2.287.1
+RUNNER_VERSION: 2.288.0
 DOCKER_VERSION: 20.10.12
 DOCKERHUB_USERNAME: summerwind

View File

@@ -37,7 +37,10 @@ fi
 tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL}
+TEST_ID=${TEST_ID:-default}
 if [ "${tool}" == "helm" ]; then
+set -v
 helm upgrade --install actions-runner-controller \
 charts/actions-runner-controller \
 -n actions-runner-system \
@@ -46,7 +49,10 @@ if [ "${tool}" == "helm" ]; then
 --set authSecret.create=false \
 --set image.repository=${NAME} \
 --set image.tag=${VERSION} \
+--set podAnnotations.test-id=${TEST_ID} \
+--set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \
 -f ${VALUES_FILE}
+set +v
 # To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes`
 # errors
 kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds
@@ -77,7 +83,7 @@ else
 fi
 if [ -n "${TEST_ORG}" ]; then
-cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f -
+cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl apply -f -
 if [ -n "${TEST_ORG_GROUP}" ]; then
 cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f -

View File

@@ -28,6 +28,7 @@ spec:
 ## Replace `mumoshu/actions-runner-dind:dev` with your dind image
 #dockerdWithinRunnerContainer: true
 #image: mumoshu/actions-runner-dind:dev
+dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER}
 #
 # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
@@ -56,6 +57,7 @@ spec:
 scaleUpTriggers:
 - githubEvent: {}
 amount: 1
-duration: "1m"
+duration: "10m"
-minReplicas: 0
+minReplicas: ${RUNNER_MIN_REPLICAS}
 maxReplicas: 10
+scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}

View File

@@ -1,7 +1,8 @@
 # Set actions-runner-controller settings for testing
 githubAPICacheDuration: 10s
+logLevel: "-3"
 githubWebhookServer:
-logLevel: debug
+logLevel: "-3"
 enabled: true
 labels: {}
 replicaCount: 1

View File

@@ -107,6 +107,9 @@ type CapacityReservation struct {
 Name string `json:"name,omitempty"`
 ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
 Replicas int `json:"replicas,omitempty"`
+
+// +optional
+EffectiveTime metav1.Time `json:"effectiveTime,omitempty"`
 }
 type ScaleTargetRef struct {

View File

@@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct {
 // +nullable
 Replicas *int `json:"replicas,omitempty"`
+
+// EffectiveTime is the time the upstream controller requested to sync Replicas.
+// It is usually populated by the webhook-based autoscaler via HRA.
+// The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+//
+// +optional
+// +nullable
+EffectiveTime *metav1.Time `json:"effectiveTime"`
 // +optional
 // +nullable
 Selector *metav1.LabelSelector `json:"selector"`

View File

@@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
 // +nullable
 Replicas *int `json:"replicas,omitempty"`
+
+// EffectiveTime is the time the upstream controller requested to sync Replicas.
+// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
+// The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners
+// based on a potentially outdated Replicas value.
+//
+// +optional
+// +nullable
+EffectiveTime *metav1.Time `json:"effectiveTime"`
 // +optional
 // +nullable
 Selector *metav1.LabelSelector `json:"selector"`

View File

@@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
 func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
 *out = *in
 in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
+in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
@@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
 *out = new(int)
 **out = **in
 }
+if in.EffectiveTime != nil {
+in, out := &in.EffectiveTime, &out.EffectiveTime
+*out = (*in).DeepCopy()
+}
 if in.Selector != nil {
 in, out := &in.Selector, &out.Selector
 *out = new(metav1.LabelSelector)
@@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
 *out = new(int)
 **out = **in
 }
+if in.EffectiveTime != nil {
+in, out := &in.EffectiveTime, &out.EffectiveTime
+*out = (*in).DeepCopy()
+}
 if in.Selector != nil {
 in, out := &in.Selector, &out.Selector
 *out = new(metav1.LabelSelector)

View File

@@ -33,6 +33,7 @@ All additional docs are kept in the `docs/` folder, this README is solely for do
 | `authSecret.github_basicauth_username` | Username for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
 | `authSecret.github_basicauth_password` | Password for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
 | `dockerRegistryMirror` | The default Docker Registry Mirror used by runners. | |
+| `hostNetwork` | The "hostNetwork" of the controller container | false |
 | `image.repository` | The "repository/image" of the controller container | summerwind/actions-runner-controller |
 | `image.tag` | The tag of the controller container | |
 | `image.actionsRunnerRepositoryAndTag` | The "repository/image" of the actions runner container | summerwind/actions-runner:latest |

View File

@@ -49,6 +49,9 @@ spec:
 items:
 description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
 properties:
+effectiveTime:
+format: date-time
+type: string
 expirationTime:
 format: date-time
 type: string

View File

@@ -48,6 +48,11 @@ spec:
 spec:
 description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
 properties:
+effectiveTime:
+description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+format: date-time
+nullable: true
+type: string
 replicas:
 nullable: true
 type: integer

View File

@@ -45,6 +45,11 @@ spec:
 spec:
 description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
 properties:
+effectiveTime:
+description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners based on a potentially outdated Replicas value.
+format: date-time
+nullable: true
+type: string
 replicas:
 nullable: true
 type: integer

View File

@@ -14,6 +14,7 @@ spec:
 metadata:
 {{- with .Values.podAnnotations }}
 annotations:
+kubectl.kubernetes.io/default-logs-container: "manager"
 {{- toYaml . | nindent 8 }}
 {{- end }}
 labels:
@@ -199,3 +200,6 @@ spec:
 topologySpreadConstraints:
 {{- toYaml . | nindent 8 }}
 {{- end }}
+{{- if .Values.hostNetwork }}
+hostNetwork: {{ .Values.hostNetwork }}
+{{- end }}

View File

@@ -15,6 +15,7 @@ spec:
 metadata:
 {{- with .Values.githubWebhookServer.podAnnotations }}
 annotations:
+kubectl.kubernetes.io/default-logs-container: "github-webhook-server"
 {{- toYaml . | nindent 8 }}
 {{- end }}
 labels:

View File

@@ -165,6 +165,10 @@ admissionWebHooks:
 {}
 #caBundle: "Ci0tLS0tQk...<base64-encoded PEM bundle containing the CA that signed the webhook's serving certificate>...tLS0K"
+
+# There may be alternatives to setting `hostNetwork: true`, see
+# https://github.com/actions-runner-controller/actions-runner-controller/issues/1005#issuecomment-993097155
+#hostNetwork: true
 githubWebhookServer:
 enabled: false
 replicaCount: 1

View File

@@ -29,15 +29,14 @@ import (
 actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
 "github.com/actions-runner-controller/actions-runner-controller/controllers"
 "github.com/actions-runner-controller/actions-runner-controller/github"
+"github.com/actions-runner-controller/actions-runner-controller/logging"
 "github.com/kelseyhightower/envconfig"
-zaplib "go.uber.org/zap"
 "k8s.io/apimachinery/pkg/runtime"
 clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 _ "k8s.io/client-go/plugin/pkg/client/auth/exec"
 _ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
 _ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
 ctrl "sigs.k8s.io/controller-runtime"
-"sigs.k8s.io/controller-runtime/pkg/log/zap"
 // +kubebuilder:scaffold:imports
 )
@@ -47,11 +46,6 @@ var (
 )
 const (
-logLevelDebug = "debug"
-logLevelInfo = "info"
-logLevelWarn = "warn"
-logLevelError = "error"
 webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN"
 )
@@ -97,7 +91,7 @@ func main() {
 flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
 "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
 flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change")
-flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
+flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
 flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.")
 flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.")
 flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.")
@@ -126,23 +120,7 @@ func main() {
 setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.")
 }
-logger := zap.New(func(o *zap.Options) {
-switch logLevel {
-case logLevelDebug:
-o.Development = true
-lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2)
-o.Level = &lvl
-case logLevelInfo:
-lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
-o.Level = &lvl
-case logLevelWarn:
-lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
-o.Level = &lvl
-case logLevelError:
-lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
-o.Level = &lvl
-}
-})
+logger := logging.NewLogger(logLevel)
 ctrl.SetLogger(logger)
@@ -152,6 +130,8 @@ func main() {
 // That is, all runner groups managed by ARC are assumed to be visible to any repositories,
 // which is wrong when you have one or more non-default runner groups in your organization or enterprise.
 if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) {
+c.Log = &logger
 ghClient, err = c.NewClient()
 if err != nil {
 fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err)
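
The refactor above moves the zap level selection into a `logging.NewLogger` helper that this diff does not show. A minimal sketch of what such a helper might contain, reconstructed from the removed switch statement (the package layout and the exported `LogLevel*` constants beyond `LogLevelDebug` are assumptions, not confirmed by the diff):

```go
package logging

import (
	"github.com/go-logr/logr"
	zaplib "go.uber.org/zap"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

const (
	LogLevelDebug = "debug"
	LogLevelInfo  = "info"
	LogLevelWarn  = "warn"
	LogLevelError = "error"
)

// NewLogger reproduces the level mapping that main.go previously inlined:
// "debug" enables development mode at a verbosity equivalent to logr's V(2),
// while the other names map directly to the corresponding zap levels.
func NewLogger(logLevel string) logr.Logger {
	return zap.New(func(o *zap.Options) {
		switch logLevel {
		case LogLevelDebug:
			o.Development = true
			lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2)
			o.Level = &lvl
		case LogLevelInfo:
			lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
			o.Level = &lvl
		case LogLevelWarn:
			lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
			o.Level = &lvl
		case LogLevelError:
			lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
			o.Level = &lvl
		}
	})
}
```

Note that the acceptance-test values in this PR set `logLevel: "-3"`, so the real helper presumably also accepts raw numeric zap levels; this sketch handles only the four named levels.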

View File

@@ -49,6 +49,9 @@ spec:
 items:
 description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
 properties:
+effectiveTime:
+format: date-time
+type: string
 expirationTime:
 format: date-time
 type: string

View File

@@ -48,6 +48,11 @@ spec:
 spec:
 description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
 properties:
+effectiveTime:
+description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+format: date-time
+nullable: true
+type: string
 replicas:
 nullable: true
 type: integer

View File

@@ -45,6 +45,11 @@ spec:
 spec:
 description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
 properties:
+effectiveTime:
+description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners based on a potentially outdated Replicas value.
+format: date-time
+nullable: true
+type: string
 replicas:
 nullable: true
 type: integer

View File

@@ -768,8 +768,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
 capacityReservations := getValidCapacityReservations(copy)
 if amount > 0 {
+now := time.Now()
 copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
-ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
+EffectiveTime: metav1.Time{Time: now},
+ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
 Replicas: amount,
 })
 } else if amount < 0 {
@@ -788,10 +790,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
 copy.Spec.CapacityReservations = reservations
 }
-autoscaler.Log.Info(
+before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
+expired := before - len(capacityReservations)
+after := len(copy.Spec.CapacityReservations)
+autoscaler.Log.V(1).Info(
 fmt.Sprintf("Patching hra %s for capacityReservations update", target.HorizontalRunnerAutoscaler.Name),
-"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
+"before", before,
-"after", copy.Spec.CapacityReservations,
+"expired", expired,
+"amount", amount,
+"after", after,
 )
 if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {

View File

@@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
 return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
 currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
+ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
+var effectiveTime *time.Time
+for _, r := range hra.Spec.CapacityReservations {
+t := r.EffectiveTime
+if effectiveTime == nil || effectiveTime.Before(t.Time) {
+effectiveTime = &t.Time
+}
+}
 // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
 if currentDesiredReplicas != newDesiredReplicas {
 copy := rd.DeepCopy()
 copy.Spec.Replicas = &newDesiredReplicas
+if ephemeral && effectiveTime != nil {
+copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
+}
+if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
+return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
+}
+} else if ephemeral && effectiveTime != nil {
+copy := rd.DeepCopy()
+copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
 if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
 return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
 }

View File

@@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
 Name: controllerName("runner"),
 RegistrationRecheckInterval: time.Millisecond,
 RegistrationRecheckJitter: time.Millisecond,
+UnregistrationTimeout: 1 * time.Second,
+UnregistrationRetryDelay: 1 * time.Second,
 }
 err = runnerController.SetupWithManager(mgr)
 Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller")

View File

@@ -72,6 +72,9 @@ type RunnerReconciler struct {
 Name string
 RegistrationRecheckInterval time.Duration
 RegistrationRecheckJitter time.Duration
+
+UnregistrationTimeout time.Duration
+UnregistrationRetryDelay time.Duration
 }
 // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
@@ -110,8 +113,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 return ctrl.Result{}, nil
 }
 } else {
+var p *corev1.Pod
+{
+var pod corev1.Pod
+if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+if !kerrors.IsNotFound(err) {
+log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
+return ctrl.Result{Requeue: true}, nil
+}
+} else {
+p = &pod
+}
+}
 // Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
-return r.processRunnerDeletion(runner, ctx, log)
+return r.processRunnerDeletion(runner, ctx, log, p)
 }
 registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
@@ -159,20 +177,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 // If pod has ended up succeeded we need to restart it
 // Happens e.g. when dind is in runner and run completes
-stopped := pod.Status.Phase == corev1.PodSucceeded
-if !stopped {
-if pod.Status.Phase == corev1.PodRunning {
-for _, status := range pod.Status.ContainerStatuses {
-if status.Name != containerName {
-continue
-}
-if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-stopped = true
-}
-}
-}
+stopped := runnerPodOrContainerIsStopped(&pod)
+ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
+if stopped && ephemeral {
+log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
+// This is the key to making ephemeral runners work reliably with webhook-based autoscaling.
+// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
+//
+// In the next reconciliation loop, this triggers a runner unregistration.
+// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub, but we do it just for confirmation.)
+//
+// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
+// which handles the unregistration, the removal of the completed pod, and so on.
+if err := r.Delete(ctx, &runner); err != nil {
+log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
+return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
+}
+return ctrl.Result{Requeue: true}, nil
 }
 restart := stopped
@@ -404,64 +429,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 return ctrl.Result{}, nil
 }
-// Try to delete current pod if recreation is needed
-safeToDeletePod := false
-ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-if err != nil {
-log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
-} else {
-// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
-if !ok {
-log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
-}
-safeToDeletePod = true
-}
-if safeToDeletePod {
+updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
+if res != nil {
+return *res, err
+}
 // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
 // This should help us avoid race condition between runner pickup job after we think the runner is not busy.
-if err := r.Delete(ctx, &pod); err != nil {
+if err := r.Delete(ctx, updatedPod); err != nil {
 log.Error(err, "Failed to delete pod resource")
 return ctrl.Result{}, err
 }
 r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
 log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
-}
 return ctrl.Result{}, nil
 }
-func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
+func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
+// If pod has ended up succeeded we need to restart it
+// Happens e.g. when dind is in runner and run completes
+stopped := pod.Status.Phase == corev1.PodSucceeded
+if !stopped {
+if pod.Status.Phase == corev1.PodRunning {
+for _, status := range pod.Status.ContainerStatuses {
+if status.Name != containerName {
+continue
+}
+if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
+stopped = true
+}
+}
+}
+}
+return stopped
+}
+func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
 finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
 if removed {
-if len(runner.Status.Registration.Token) > 0 {
-ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-if err != nil {
-if errors.Is(err, &gogithub.RateLimitError{}) {
-// We log the underlying error when we failed calling GitHub API to list or unregisters,
-// or the runner is still busy.
-log.Error(
-err,
-fmt.Sprintf(
-"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
-retryDelayOnGitHubAPIRateLimitError,
-),
-)
-return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-}
-return ctrl.Result{}, err
-}
-if !ok {
-log.V(1).Info("Runner no longer exists on GitHub")
-}
-} else {
-log.V(1).Info("Runner was never registered on GitHub")
-}
+_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
+if res != nil {
+return *res, err
+}
 newRunner := runner.DeepCopy()
@@ -478,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
 return ctrl.Result{}, nil
 }
+func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
+unregistrationTimeout := DefaultUnregistrationTimeout
+if r.UnregistrationTimeout > 0 {
+unregistrationTimeout = r.UnregistrationTimeout
+}
+return unregistrationTimeout
+}
+func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
+retryDelay := DefaultUnregistrationRetryDelay
+if r.UnregistrationRetryDelay > 0 {
+retryDelay = r.UnregistrationRetryDelay
+}
+return retryDelay
+}
 func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
 deletionTimeout := 1 * time.Minute
 currentTime := time.Now()

View File

@@ -0,0 +1,249 @@
package controllers
import (
"context"
"errors"
"fmt"
"time"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
corev1 "k8s.io/api/core/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
unregistrationStartTimestamp = "unregistration-start-timestamp"
// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
// and RemoveRunner API (to actually unregister the runner) calls.
// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
// up to 60 seconds (or even more depending on the situation).
DefaultUnregistrationTimeout = 60 * time.Second
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
DefaultUnregistrationRetryDelay = 30 * time.Second
)
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
//
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, with or without an accompanying error.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has started unregistration")
} else {
log.Info("Runner has already started unregistration")
}
}
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has completed unregistration")
} else {
log.Info("Runner has already completed unregistration")
}
}
return pod, nil, nil
}
// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
log.Error(err, "Failed to unregister runner before deleting the pod.")
return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered. Removing the runner pod.")
} else if pod == nil {
// `r.unregisterRunner()` will return `false, nil` if the runner is not found on GitHub.
// However, that doesn't always mean the pod can be safely removed.
//
// If the pod does not exist for the runner,
// it may be due to that the runner pod has never been created.
// In that case we can safely assume that the runner will never be registered.
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconciliation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
} else {
// A runner and a runner pod that are created by this version of ARC should match
// any of the above branches.
//
// But we leave this match all branch for potential backward-compatibility.
// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
// and retry later.
log.V(1).Info("Runner unregistration is being retried later.")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}
return nil, nil
}
func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
if pod.Annotations == nil {
return "", false
}
v, ok := pod.Annotations[key]
return v, ok
}
func setAnnotation(pod *corev1.Pod, key, value string) {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Annotations[key] = value
}
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has already been unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered (hence we couldn't see its existence from the GitHub Actions API yet)
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
//
// When the returned value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, all those three sub-cases cannot be distinguished by this function alone.
//
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
//
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// as long as you recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You shall introduce a "grace period" mechanism, similar or equal to the one required for "Case 2-2.", so that you only ever
// start the runner pod deletion after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
// The reason ARC did so was to let a runner running a job to not stop prematurely.
//
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
// so ARC doesn't need to do anything special for a graceful runner stop.
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
// the runner pod is safe for deletion.
//
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// # NOTES
//
// - It can be "status=offline" at the same time but that's another story.
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
// determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
// change from 60 seconds.
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}
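
The long comment above leaves the "grace period" workaround for cases 2-2 and 2-3 to the caller. A minimal sketch of that idea under stated assumptions — the `registrationGracePeriod` value and the helper name are hypothetical, not part of this change:

```go
package controllers

import (
	"time"

	corev1 "k8s.io/api/core/v1"
)

// Hypothetical grace period: how long to tolerate "runner not found on GitHub"
// for a pod that may simply not have finished registering itself yet.
const registrationGracePeriod = 3 * time.Minute // assumed value; tune per environment

// withinRegistrationGracePeriod reports whether the runner pod is still young
// enough that a (false, nil) result from unregisterRunner should not yet be
// treated as "this runner will never come up".
func withinRegistrationGracePeriod(pod *corev1.Pod, now time.Time) bool {
	if pod == nil {
		return false
	}
	return now.Sub(pod.CreationTimestamp.Time) < registrationGracePeriod
}
```

A caller could consult such a helper before deleting the pod, requeueing with the usual retry delay while the grace period has not yet elapsed.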

View File

@@ -47,6 +47,9 @@ type RunnerPodReconciler struct {
 Name string
 RegistrationRecheckInterval time.Duration
 RegistrationRecheckJitter time.Duration
+
+UnregistrationTimeout time.Duration
+UnregistrationRetryDelay time.Duration
 }
 const (
@@ -105,33 +108,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
 if removed {
-ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
-if err != nil {
-if errors.Is(err, &gogithub.RateLimitError{}) {
-// We log the underlying error when we failed calling GitHub API to list or unregisters,
-// or the runner is still busy.
-log.Error(
-err,
-fmt.Sprintf(
-"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
-retryDelayOnGitHubAPIRateLimitError,
-),
-)
-return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-}
-return ctrl.Result{}, err
-}
-if !ok {
-log.V(1).Info("Runner no longer exists on GitHub")
-}
-newRunner := runnerPod.DeepCopy()
-newRunner.ObjectMeta.Finalizers = finalizers
-if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
+updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+if res != nil {
+return *res, err
+}
+patchedPod := updatedPod.DeepCopy()
+patchedPod.ObjectMeta.Finalizers = finalizers
+// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
+if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
 log.Error(err, "Failed to update runner for finalizer removal")
 return ctrl.Result{}, err
 }
@@ -365,8 +351,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 return ctrl.Result{}, nil
 }
+updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+if res != nil {
+return *res, err
+}
 // Delete current pod if recreation is needed
-if err := r.Delete(ctx, &runnerPod); err != nil {
+if err := r.Delete(ctx, updated); err != nil {
 log.Error(err, "Failed to delete pod resource")
 return ctrl.Result{}, err
 }
@@ -377,8 +368,22 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 return ctrl.Result{}, nil
 }
-func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
-return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
+func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
+unregistrationTimeout := DefaultUnregistrationTimeout
+if r.UnregistrationTimeout > 0 {
+unregistrationTimeout = r.UnregistrationTimeout
+}
+return unregistrationTimeout
+}
+func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
+retryDelay := DefaultUnregistrationRetryDelay
+if r.UnregistrationRetryDelay > 0 {
+retryDelay = r.UnregistrationRetryDelay
+}
+return retryDelay
 }
 func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {

View File

@@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
 if currentDesiredReplicas != newDesiredReplicas {
 newestSet.Spec.Replicas = &newDesiredReplicas
+newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime
 if err := r.Client.Update(ctx, newestSet); err != nil {
 log.Error(err, "Failed to update runnerreplicaset resource")
@@ -420,6 +421,7 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str
 Replicas: rd.Spec.Replicas,
 Selector: newRSSelector,
 Template: newRSTemplate,
+EffectiveTime: rd.Spec.EffectiveTime,
 },
 }

View File

@@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct {
 Name string
 }
+const (
+SyncTimeAnnotationKey = "sync-time"
+)
 // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
@@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
 }
 }
-var myRunners []v1alpha1.Runner
 var (
 current int
 ready int
 available int
+lastSyncTime *time.Time
 )
 for _, r := range allRunners.Items {
 // This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
 // to not treat all the runners in the namespace as its children.
 if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
-myRunners = append(myRunners, r)
+// If the runner is already marked for deletion (=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
+// or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop),
+// we don't need to bother calling the GitHub API to re-mark the runner for deletion.
+// Just hold on, and runners will disappear as long as the runner controller is up and running.
+if !r.DeletionTimestamp.IsZero() {
+continue
+}
+if r.Annotations != nil {
+if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
+t, err := time.Parse(time.RFC3339, a)
+if err == nil {
+if lastSyncTime == nil || lastSyncTime.Before(t) {
+lastSyncTime = &t
+}
+}
+}
+}
 current += 1
@@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
 }
 }
-if current > desired {
+effectiveTime := rs.Spec.EffectiveTime
+ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
+if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
+log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to ephemeral runner completions, so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
+} else if current > desired {
+// If you use ephemeral runners with the webhook-based autoscaler and the runner controller is working normally,
+// you're unlikely to fall into this branch.
+//
+// That's because all the stakeholders work like this:
+//
+// 1. A runner pod completes with the runner container exiting with code 0
+// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
+// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
+// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA, removing the oldest capacity reservation
+// 5. ARC horizontalrunnerautoscaler controller updates RunnerDeployment's desired replicas based on capacity reservations
+// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
+// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
+//
+// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
+// marked for deletion by the runner controller.
+// This runnerreplicaset controller doesn't count marked runners toward the `current` value, hence you're unlikely to
+// fall into this branch when you're using ephemeral runners with the webhook-based autoscaler.
 n := current - desired
 log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
@@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1
 objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
 objectMeta.Namespace = rs.ObjectMeta.Namespace
+if objectMeta.Annotations == nil {
+objectMeta.Annotations = map[string]string{}
+}
+objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339)
 runner := v1alpha1.Runner{
 TypeMeta: metav1.TypeMeta{},

View File

@@ -1,49 +0,0 @@
package controllers
import (
"context"
"fmt"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
// - (true, nil) when it has successfully unregistered the runner.
// - (false, nil) when the runner has been already unregistered.
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Note that sometimes a runner can stuck "busy" even though it is already "offline".
// But we assume that it's not actually offline and still running a job.
if runner.GetBusy() {
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}

View File

@ -11,8 +11,11 @@ import (
"time" "time"
"github.com/actions-runner-controller/actions-runner-controller/github/metrics" "github.com/actions-runner-controller/actions-runner-controller/github/metrics"
"github.com/actions-runner-controller/actions-runner-controller/logging"
"github.com/bradleyfalzon/ghinstallation" "github.com/bradleyfalzon/ghinstallation"
"github.com/go-logr/logr"
"github.com/google/go-github/v39/github" "github.com/google/go-github/v39/github"
"github.com/gregjones/httpcache"
"golang.org/x/oauth2" "golang.org/x/oauth2"
) )
@ -28,6 +31,8 @@ type Config struct {
BasicauthUsername string `split_words:"true"`
BasicauthPassword string `split_words:"true"`
RunnerGitHubURL string `split_words:"true"`
Log *logr.Logger
}
// Client wraps GitHub client with some additional
@ -46,7 +51,6 @@ type BasicAuthTransport struct {
func (p BasicAuthTransport) RoundTrip(req *http.Request) (*http.Response, error) {
req.SetBasicAuth(p.Username, p.Password)
req.Header.Set("User-Agent", "actions-runner-controller")
return http.DefaultTransport.RoundTrip(req)
}
@ -82,8 +86,11 @@ func (c *Config) NewClient() (*Client, error) {
transport = tr
}
transport = metrics.Transport{Transport: transport}
httpClient := &http.Client{Transport: transport}
cached := httpcache.NewTransport(httpcache.NewMemoryCache())
cached.Transport = transport
loggingTransport := logging.Transport{Transport: cached, Log: c.Log}
metricsTransport := metrics.Transport{Transport: loggingTransport}
httpClient := &http.Client{Transport: metricsTransport}
var client *github.Client
var githubBaseURL string
@ -128,6 +135,8 @@ func (c *Config) NewClient() (*Client, error) {
}
}
client.UserAgent = "actions-runner-controller"
return &Client{
Client: client,
regTokens: map[string]*github.RegistrationToken{},
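Putting the pieces of this hunk together, the HTTP transport now layers an in-memory cache, then the logging transport, then the metrics transport around the base transport. The sketch below condenses that wiring and deliberately omits the authentication transports and GHES base-URL handling that NewClient also performs, so it is not a drop-in replacement:

package main

import (
	"net/http"

	"github.com/actions-runner-controller/actions-runner-controller/github/metrics"
	"github.com/actions-runner-controller/actions-runner-controller/logging"
	"github.com/go-logr/logr"
	"github.com/gregjones/httpcache"
)

// newHTTPClient mirrors the layering introduced above: base transport wrapped
// by an in-memory HTTP cache, then by the logging transport, then by the
// metrics transport, so every request is cached, logged, and counted.
func newHTTPClient(base http.RoundTripper, log *logr.Logger) *http.Client {
	cached := httpcache.NewTransport(httpcache.NewMemoryCache())
	cached.Transport = base
	logged := logging.Transport{Transport: cached, Log: log}
	counted := metrics.Transport{Transport: logged}
	return &http.Client{Transport: counted}
}

func main() {
	// A nil Log keeps the logging transport silent, as its log method checks for nil.
	client := newHTTPClient(http.DefaultTransport, nil)
	_ = client
}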


@ -152,3 +152,10 @@ func TestCleanup(t *testing.T) {
t.Errorf("expired token still exists") t.Errorf("expired token still exists")
} }
} }
func TestUserAgent(t *testing.T) {
client := newTestClient()
if client.UserAgent != "actions-runner-controller" {
t.Errorf("UserAgent should be set to actions-runner-controller")
}
}

go.mod

@ -41,6 +41,7 @@ require (
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/uuid v1.1.2 // indirect
github.com/googleapis/gnostic v0.5.5 // indirect
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
github.com/imdario/mergo v0.3.12 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect

go.sum

@ -259,6 +259,8 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=

logging/logger.go (new file)

@ -0,0 +1,54 @@
package logging
import (
"fmt"
"os"
"strconv"
"github.com/go-logr/logr"
zaplib "go.uber.org/zap"
"go.uber.org/zap/zapcore"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
)
const (
LogLevelDebug = "debug"
LogLevelInfo = "info"
LogLevelWarn = "warn"
LogLevelError = "error"
)
func NewLogger(logLevel string) logr.Logger {
log := zap.New(func(o *zap.Options) {
switch logLevel {
case LogLevelDebug:
o.Development = true
lvl := zaplib.NewAtomicLevelAt(zaplib.DebugLevel) // maps to logr's V(1)
o.Level = &lvl
case LogLevelInfo:
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
o.Level = &lvl
case LogLevelWarn:
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
o.Level = &lvl
case LogLevelError:
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
o.Level = &lvl
default:
// We parse with a bit size of 8 because zapcore.Level is an int8 under the hood
levelInt, err := strconv.ParseInt(logLevel, 10, 8)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to parse --log-level=%s: %v\n", logLevel, err)
os.Exit(1)
}
// For example, --log-level=debug (a.k.a. --log-level=-1) maps to zaplib.DebugLevel, which corresponds to logr's V(1),
// while --log-level=-2 maps to the custom log level that corresponds to logr's V(2).
level := zapcore.Level(levelInt)
atomicLevel := zaplib.NewAtomicLevelAt(level)
o.Level = &atomicLevel
}
})
return log
}
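A short usage sketch of the numeric log levels handled by the default branch above, assuming the standard zap-to-logr mapping in which zap level -n corresponds to logr's V(n), as the comments in this file describe:

package main

import "github.com/actions-runner-controller/actions-runner-controller/logging"

func main() {
	// --log-level=-3 style values enable logr verbosity up to V(3), which is
	// the level used by the per-request HTTP logging in logging/transport.go.
	log := logging.NewLogger("-3")
	log.V(3).Info("this is emitted", "at", "verbosity 3")
	log.V(4).Info("this is suppressed")

	// The named levels keep working as before.
	info := logging.NewLogger(logging.LogLevelInfo)
	info.Info("info-level message")
}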

logging/transport.go (new file)

@ -0,0 +1,51 @@
// Package logging provides various logging helpers for ARC
package logging
import (
"net/http"
"github.com/go-logr/logr"
"github.com/gregjones/httpcache"
)
const (
// https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting
headerRateLimitRemaining = "X-RateLimit-Remaining"
)
// Transport wraps a transport with logging of each HTTP request and response
type Transport struct {
Transport http.RoundTripper
Log *logr.Logger
}
func (t Transport) RoundTrip(req *http.Request) (*http.Response, error) {
resp, err := t.Transport.RoundTrip(req)
if resp != nil {
t.log(req, resp)
}
return resp, err
}
func (t Transport) log(req *http.Request, resp *http.Response) {
if t.Log == nil {
return
}
var args []interface{}
marked := resp.Header.Get(httpcache.XFromCache) == "1"
args = append(args, "from_cache", marked, "method", req.Method, "url", req.URL.String())
if !marked {
// Do not log outdated rate limit remaining value
remaining := resp.Header.Get(headerRateLimitRemaining)
args = append(args, "ratelimit_remaining", remaining)
}
t.Log.V(3).Info("Seen HTTP response", args...)
}
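A minimal sketch of using this transport on its own, outside the GitHub client wrapper. The URL is only an example, and V(3) verbosity has to be enabled for the "Seen HTTP response" line to appear:

package main

import (
	"net/http"

	"github.com/actions-runner-controller/actions-runner-controller/logging"
)

func main() {
	// Build a logger at numeric level -3 so that V(3) messages are emitted.
	log := logging.NewLogger("-3")

	client := &http.Client{
		Transport: logging.Transport{Transport: http.DefaultTransport, Log: &log},
	}

	// Each response is logged with from_cache, method, and url; for non-cached
	// responses the X-RateLimit-Remaining header is included when present.
	resp, err := client.Get("https://api.github.com/rate_limit")
	if err == nil {
		resp.Body.Close()
	}
}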

main.go

@ -26,24 +26,18 @@ import (
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
"github.com/actions-runner-controller/actions-runner-controller/controllers"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/actions-runner-controller/actions-runner-controller/logging"
"github.com/kelseyhightower/envconfig" "github.com/kelseyhightower/envconfig"
zaplib "go.uber.org/zap"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme" clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
ctrl "sigs.k8s.io/controller-runtime" ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
// +kubebuilder:scaffold:imports
)
const (
defaultRunnerImage = "summerwind/actions-runner:latest"
defaultDockerImage = "docker:dind"
logLevelDebug = "debug"
logLevelInfo = "info"
logLevelWarn = "warn"
logLevelError = "error"
)
var (
@ -120,24 +114,12 @@ func main() {
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change. . If you're tweaking this in order to make autoscaling more responsive, you'll probably want to tweak github-api-cache-duration, too")
flag.Var(&commonRunnerLabels, "common-runner-labels", "Runner labels in the K1=V1,K2=V2,... format that are inherited all the runners created by the controller. See https://github.com/actions-runner-controller/actions-runner-controller/issues/321 for more information")
flag.StringVar(&namespace, "watch-namespace", "", "The namespace to watch for custom resources. Set to empty for letting it watch for all namespaces.")
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.Parse()
logger := zap.New(func(o *zap.Options) {
switch logLevel {
case logLevelDebug:
o.Development = true
case logLevelInfo:
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
o.Level = &lvl
case logLevelWarn:
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
o.Level = &lvl
case logLevelError:
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
o.Level = &lvl
}
})
logger := logging.NewLogger(logLevel)
c.Log = &logger
ghClient, err = c.NewClient()
if err != nil {
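For orientation, the wiring main.go ends up with after this change can be condensed as below. The envconfig prefix "github" and the panic-based error handling are assumptions for this sketch; the real controller registers many more flags and wires the logger into controller-runtime as well:

package main

import (
	"github.com/actions-runner-controller/actions-runner-controller/github"
	"github.com/actions-runner-controller/actions-runner-controller/logging"
	"github.com/kelseyhightower/envconfig"
)

func main() {
	// Populate the GitHub client config from environment variables.
	var c github.Config
	if err := envconfig.Process("github", &c); err != nil {
		panic(err)
	}

	// Hand the shared logger to the client config so its HTTP transport chain
	// (cache -> logging -> metrics) can emit V(3) request/response logs.
	logger := logging.NewLogger(logging.LogLevelDebug)
	c.Log = &logger

	ghClient, err := c.NewClient()
	if err != nil {
		panic(err)
	}
	_ = ghClient
}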


@ -17,12 +17,15 @@ var (
controllerImageTag = "e2e"
controllerImage = testing.Img(controllerImageRepo, controllerImageTag)
runnerImageRepo = "actionsrunnercontrollere2e/actions-runner"
runnerDindImageRepo = "actionsrunnercontrollere2e/actions-runner-dind"
runnerImageTag = "e2e"
runnerImage = testing.Img(runnerImageRepo, runnerImageTag)
runnerDindImage = testing.Img(runnerDindImageRepo, runnerImageTag)
prebuildImages = []testing.ContainerImage{
controllerImage,
runnerImage,
runnerDindImage,
}
builds = []testing.DockerBuild{
@ -36,6 +39,11 @@ var (
Args: []testing.BuildArg{},
Image: runnerImage,
},
{
Dockerfile: "../../runner/Dockerfile.dindrunner",
Args: []testing.BuildArg{},
Image: runnerDindImage,
},
}
certManagerVersion = "v1.1.1"
@ -52,7 +60,6 @@ var (
"SYNC_PERIOD=" + "10s", "SYNC_PERIOD=" + "10s",
"NAME=" + controllerImageRepo, "NAME=" + controllerImageRepo,
"VERSION=" + controllerImageTag, "VERSION=" + controllerImageTag,
"RUNNER_NAME=" + runnerImageRepo,
"RUNNER_TAG=" + runnerImageTag, "RUNNER_TAG=" + runnerImageTag,
} }
@ -167,11 +174,15 @@ type env struct {
useRunnerSet bool
testID string
testName string
repoToCommit string
runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
githubTokenWebhook string
testEnterprise string
featureFlagEphemeral bool
scaleDownDelaySecondsAfterScaleOut int64
minReplicas int64
dockerdWithinRunnerContainer bool
testJobs []job
}
@ -184,11 +195,12 @@ func initTestEnv(t *testing.T) *env {
id := e.ID()
testID := t.Name() + " " + id
t.Logf("Using test id %s", testID)
e.testID = testID
testName := t.Name() + " " + id
t.Logf("Initializing test with name %s", testName)
e.testID = id
e.testName = testName
e.runnerLabel = "test-" + id
e.githubToken = testing.Getenv(t, "GITHUB_TOKEN")
e.githubTokenWebhook = testing.Getenv(t, "WEBHOOK_GITHUB_TOKEN")
@ -197,9 +209,17 @@ func initTestEnv(t *testing.T) *env {
e.testOrg = testing.Getenv(t, "TEST_ORG", "")
e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 10)
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL"))
e.featureFlagEphemeral = ephemeral
e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32)
e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32)
var err error
e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false"))
if err != nil {
panic(fmt.Sprintf("unable to parse bool from TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER: %v", err))
}
return e
}
@ -254,7 +274,22 @@ func (e *env) installActionsRunnerController(t *testing.T) {
"GITHUB_TOKEN=" + e.githubToken, "GITHUB_TOKEN=" + e.githubToken,
"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook, "WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
"RUNNER_LABEL=" + e.runnerLabel, "RUNNER_LABEL=" + e.runnerLabel,
"TEST_ID=" + e.testID,
fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral), fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral),
fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
}
if e.dockerdWithinRunnerContainer {
varEnv = append(varEnv,
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=true",
"RUNNER_NAME="+runnerDindImageRepo,
)
} else {
varEnv = append(varEnv,
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=false",
"RUNNER_NAME="+runnerImageRepo,
)
}
scriptEnv = append(scriptEnv, varEnv...)
@ -273,7 +308,7 @@ func (e *env) createControllerNamespaceAndServiceAccount(t *testing.T) {
func (e *env) installActionsWorkflow(t *testing.T) {
t.Helper()
installActionsWorkflow(t, e.testID, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
installActionsWorkflow(t, e.testName, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
}
func (e *env) verifyActionsWorkflowRun(t *testing.T) {
@ -302,13 +337,13 @@ func createTestJobs(id, testResultCMNamePrefix string, numJobs int) []job {
const Branch = "main"
func installActionsWorkflow(t *testing.T, testID, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
wfName := "E2E " + testID
wfName := "E2E " + testName
wf := testing.Workflow{
Name: wfName,
On: testing.On{