From 4c53e3aa75bb7a16a68b70ce08f5d78eecc6480b Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Thu, 17 Feb 2022 02:00:47 +0000 Subject: [PATCH 01/12] Add GitHub API cache to avoid rate limit This will cache any GitHub API responses with correct Cache-Control header. `gregjones/httpcache` has been chosen as a library to implement this feature, as it is as recommended in `go-github`'s documentation: https://github.com/google/go-github#conditional-requests `gregjones/httpcache` supports a number of cache backends like `diskcache`, `s3cache`, and so on: https://github.com/gregjones/httpcache#cache-backends We stick to the built-in in-memory cache as a starter. Probably this will never becomes an issue as long as various HTTP responses for all the GitHub API calls that ARC makes, list-runners, list-workflow-jobs, list-runner-groups, etc., doesn't overflow the in-memory cache. `httpcache` has an known unfixed issue that it doesn't update cache on chunked responses. But we assume that the APIs that we call doesn't use chunked responses. See #1503 for more information on that. Ref #920 --- github/github.go | 7 +++++-- go.mod | 1 + go.sum | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/github/github.go b/github/github.go index 89a49913..146130d9 100644 --- a/github/github.go +++ b/github/github.go @@ -13,6 +13,7 @@ import ( "github.com/actions-runner-controller/actions-runner-controller/github/metrics" "github.com/bradleyfalzon/ghinstallation" "github.com/google/go-github/v39/github" + "github.com/gregjones/httpcache" "golang.org/x/oauth2" ) @@ -82,8 +83,10 @@ func (c *Config) NewClient() (*Client, error) { transport = tr } - transport = metrics.Transport{Transport: transport} - httpClient := &http.Client{Transport: transport} + cached := httpcache.NewTransport(httpcache.NewMemoryCache()) + cached.Transport = transport + metricsTransport := metrics.Transport{Transport: cached} + httpClient := &http.Client{Transport: metricsTransport} var client *github.Client var githubBaseURL string diff --git a/go.mod b/go.mod index f9627220..25380078 100644 --- a/go.mod +++ b/go.mod @@ -41,6 +41,7 @@ require ( github.com/google/gofuzz v1.1.0 // indirect github.com/google/uuid v1.1.2 // indirect github.com/googleapis/gnostic v0.5.5 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect diff --git a/go.sum b/go.sum index d9d4388c..65d55022 100644 --- a/go.sum +++ b/go.sum @@ -259,6 +259,8 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod 
h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= From 4b557dc54cac5f302ea28b4344488bdd60f7bb9e Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sat, 19 Feb 2022 09:21:14 +0000 Subject: [PATCH 02/12] Add logging transport to log HTTP requests in log level -3 The log level -3 is the minimum log level that is supported today, smaller than debug(-1) and -2(used to log some HRA related logs). This commit adds a logging HTTP transport to log HTTP requests and responses to that log level. It implements http.RoundTripper so that it can log each HTTP request with useful metadata like `from_cache` and `ratelimit_remaining`. The former is set to `true` only when the logged request's response was served from ARC's in-memory cache. The latter is set to X-RateLimit-Remaining response header value if and only if the response was served by GitHub, not by ARC's cache. --- acceptance/values.yaml | 3 +- cmd/githubwebhookserver/main.go | 30 +++--------------- github/github.go | 7 ++++- logging/logger.go | 54 +++++++++++++++++++++++++++++++++ logging/transport.go | 51 +++++++++++++++++++++++++++++++ main.go | 28 +++-------------- 6 files changed, 123 insertions(+), 50 deletions(-) create mode 100644 logging/logger.go create mode 100644 logging/transport.go diff --git a/acceptance/values.yaml b/acceptance/values.yaml index 496589b9..1a86610e 100644 --- a/acceptance/values.yaml +++ b/acceptance/values.yaml @@ -1,7 +1,8 @@ # Set actions-runner-controller settings for testing githubAPICacheDuration: 10s +logLevel: "-3" githubWebhookServer: - logLevel: debug + logLevel: "-3" enabled: true labels: {} replicaCount: 1 diff --git a/cmd/githubwebhookserver/main.go b/cmd/githubwebhookserver/main.go index 436e8b4a..4e3e604b 100644 --- a/cmd/githubwebhookserver/main.go +++ b/cmd/githubwebhookserver/main.go @@ -29,15 +29,14 @@ import ( actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" "github.com/actions-runner-controller/actions-runner-controller/controllers" "github.com/actions-runner-controller/actions-runner-controller/github" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/kelseyhightower/envconfig" - zaplib "go.uber.org/zap" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/exec" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" _ "k8s.io/client-go/plugin/pkg/client/auth/oidc" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" // +kubebuilder:scaffold:imports ) @@ -47,11 +46,6 @@ var ( ) const ( - logLevelDebug = "debug" - logLevelInfo = "info" - logLevelWarn = "warn" - logLevelError = "error" - webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN" ) @@ -97,7 +91,7 @@ func main() { flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change") - flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) + flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. 
Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.") flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.") flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.") @@ -126,23 +120,7 @@ func main() { setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.") } - logger := zap.New(func(o *zap.Options) { - switch logLevel { - case logLevelDebug: - o.Development = true - lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2) - o.Level = &lvl - case logLevelInfo: - lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel) - o.Level = &lvl - case logLevelWarn: - lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel) - o.Level = &lvl - case logLevelError: - lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel) - o.Level = &lvl - } - }) + logger := logging.NewLogger(logLevel) ctrl.SetLogger(logger) @@ -152,6 +130,8 @@ func main() { // That is, all runner groups managed by ARC are assumed to be visible to any repositories, // which is wrong when you have one or more non-default runner groups in your organization or enterprise. if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) { + c.Log = &logger + ghClient, err = c.NewClient() if err != nil { fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err) diff --git a/github/github.go b/github/github.go index 146130d9..d8d0acf6 100644 --- a/github/github.go +++ b/github/github.go @@ -11,7 +11,9 @@ import ( "time" "github.com/actions-runner-controller/actions-runner-controller/github/metrics" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/bradleyfalzon/ghinstallation" + "github.com/go-logr/logr" "github.com/google/go-github/v39/github" "github.com/gregjones/httpcache" "golang.org/x/oauth2" @@ -29,6 +31,8 @@ type Config struct { BasicauthUsername string `split_words:"true"` BasicauthPassword string `split_words:"true"` RunnerGitHubURL string `split_words:"true"` + + Log *logr.Logger } // Client wraps GitHub client with some additional @@ -85,7 +89,8 @@ func (c *Config) NewClient() (*Client, error) { cached := httpcache.NewTransport(httpcache.NewMemoryCache()) cached.Transport = transport - metricsTransport := metrics.Transport{Transport: cached} + loggingTransport := logging.Transport{Transport: cached, Log: c.Log} + metricsTransport := metrics.Transport{Transport: loggingTransport} httpClient := &http.Client{Transport: metricsTransport} var client *github.Client diff --git a/logging/logger.go b/logging/logger.go new file mode 100644 index 00000000..463726b9 --- /dev/null +++ b/logging/logger.go @@ -0,0 +1,54 @@ +package logging + +import ( + "fmt" + "os" + "strconv" + + "github.com/go-logr/logr" + zaplib "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +const ( + LogLevelDebug = "debug" + LogLevelInfo = "info" + LogLevelWarn = "warn" + LogLevelError = "error" +) + +func NewLogger(logLevel string) logr.Logger { + log := zap.New(func(o *zap.Options) { + switch logLevel { + case LogLevelDebug: + o.Development = true + lvl := zaplib.NewAtomicLevelAt(zaplib.DebugLevel) // maps to logr's V(1) + o.Level = &lvl + case LogLevelInfo: + lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel) + o.Level = &lvl + case 
LogLevelWarn: + lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel) + o.Level = &lvl + case LogLevelError: + lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel) + o.Level = &lvl + default: + // We use bitsize of 8 as zapcore.Level is a type alias to int8 + levelInt, err := strconv.ParseInt(logLevel, 10, 8) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to parse --log-level=%s: %v", logLevel, err) + os.Exit(1) + } + + // For example, --log-level=debug a.k.a --log-level=-1 maps to zaplib.DebugLevel, which is associated to logr's V(1) + // --log-level=-2 maps the specific custom log level that is associated to logr's V(2). + level := zapcore.Level(levelInt) + atomicLevel := zaplib.NewAtomicLevelAt(level) + o.Level = &atomicLevel + } + }) + + return log +} diff --git a/logging/transport.go b/logging/transport.go new file mode 100644 index 00000000..9d61a318 --- /dev/null +++ b/logging/transport.go @@ -0,0 +1,51 @@ +// Package logging provides various logging helpers for ARC +package logging + +import ( + "net/http" + + "github.com/go-logr/logr" + "github.com/gregjones/httpcache" +) + +const ( + // https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting + headerRateLimitRemaining = "X-RateLimit-Remaining" +) + +// Transport wraps a transport with metrics monitoring +type Transport struct { + Transport http.RoundTripper + + Log *logr.Logger +} + +func (t Transport) RoundTrip(req *http.Request) (*http.Response, error) { + resp, err := t.Transport.RoundTrip(req) + if resp != nil { + t.log(req, resp) + } + return resp, err +} + +func (t Transport) log(req *http.Request, resp *http.Response) { + if t.Log == nil { + return + } + + var args []interface{} + + marked := resp.Header.Get(httpcache.XFromCache) == "1" + + args = append(args, "from_cache", marked, "method", req.Method, "url", req.URL.String()) + + if !marked { + // Do not log outdated rate limit remaining value + + remaining := resp.Header.Get(headerRateLimitRemaining) + + args = append(args, "ratelimit_remaining", remaining) + } + + t.Log.V(3).Info("Seen HTTP response", args...) +} diff --git a/main.go b/main.go index 43c55961..7383fc5f 100644 --- a/main.go +++ b/main.go @@ -26,24 +26,18 @@ import ( actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" "github.com/actions-runner-controller/actions-runner-controller/controllers" "github.com/actions-runner-controller/actions-runner-controller/github" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/kelseyhightower/envconfig" - zaplib "go.uber.org/zap" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" // +kubebuilder:scaffold:imports ) const ( defaultRunnerImage = "summerwind/actions-runner:latest" defaultDockerImage = "docker:dind" - - logLevelDebug = "debug" - logLevelInfo = "info" - logLevelWarn = "warn" - logLevelError = "error" ) var ( @@ -120,24 +114,12 @@ func main() { flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change. . 
If you're tweaking this in order to make autoscaling more responsive, you'll probably want to tweak github-api-cache-duration, too") flag.Var(&commonRunnerLabels, "common-runner-labels", "Runner labels in the K1=V1,K2=V2,... format that are inherited all the runners created by the controller. See https://github.com/actions-runner-controller/actions-runner-controller/issues/321 for more information") flag.StringVar(&namespace, "watch-namespace", "", "The namespace to watch for custom resources. Set to empty for letting it watch for all namespaces.") - flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) + flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) flag.Parse() - logger := zap.New(func(o *zap.Options) { - switch logLevel { - case logLevelDebug: - o.Development = true - case logLevelInfo: - lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel) - o.Level = &lvl - case logLevelWarn: - lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel) - o.Level = &lvl - case logLevelError: - lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel) - o.Level = &lvl - } - }) + logger := logging.NewLogger(logLevel) + + c.Log = &logger ghClient, err = c.NewClient() if err != nil { From f3ceccd904ef3ffc5346d1c87814d8caac9f1d93 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sat, 19 Feb 2022 09:24:12 +0000 Subject: [PATCH 03/12] acceptance: Improve deploy.sh to recreate ARC (not runner) pods on new test id So that one does not need to manually recreate ARC pods frequently. --- acceptance/deploy.sh | 6 ++++++ test/e2e/e2e_test.go | 17 ++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/acceptance/deploy.sh b/acceptance/deploy.sh index 2922a396..ece1b6d7 100755 --- a/acceptance/deploy.sh +++ b/acceptance/deploy.sh @@ -37,7 +37,10 @@ fi tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL} +TEST_ID=${TEST_ID:-default} + if [ "${tool}" == "helm" ]; then + set -v helm upgrade --install actions-runner-controller \ charts/actions-runner-controller \ -n actions-runner-system \ @@ -46,7 +49,10 @@ if [ "${tool}" == "helm" ]; then --set authSecret.create=false \ --set image.repository=${NAME} \ --set image.tag=${VERSION} \ + --set podAnnotations.test-id=${TEST_ID} \ + --set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \ -f ${VALUES_FILE} + set +v # To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes` # errors kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 0d93821a..e6d25f12 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -167,6 +167,7 @@ type env struct { useRunnerSet bool testID string + testName string repoToCommit string runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string githubTokenWebhook string @@ -184,11 +185,12 @@ func initTestEnv(t *testing.T) *env { id := e.ID() - testID := t.Name() + " " + id + testName := t.Name() + " " + id - t.Logf("Using test id %s", testID) + t.Logf("Initializing test with name %s", testName) - e.testID = testID + e.testID = id + e.testName = testName e.runnerLabel = "test-" + id e.githubToken = testing.Getenv(t, "GITHUB_TOKEN") e.githubTokenWebhook = testing.Getenv(t, 
"WEBHOOK_GITHUB_TOKEN") @@ -197,7 +199,7 @@ func initTestEnv(t *testing.T) *env { e.testOrg = testing.Getenv(t, "TEST_ORG", "") e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "") e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE") - e.testJobs = createTestJobs(id, testResultCMNamePrefix, 10) + e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100) ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL")) e.featureFlagEphemeral = ephemeral @@ -254,6 +256,7 @@ func (e *env) installActionsRunnerController(t *testing.T) { "GITHUB_TOKEN=" + e.githubToken, "WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook, "RUNNER_LABEL=" + e.runnerLabel, + "TEST_ID=" + e.testID, fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral), } @@ -273,7 +276,7 @@ func (e *env) createControllerNamespaceAndServiceAccount(t *testing.T) { func (e *env) installActionsWorkflow(t *testing.T) { t.Helper() - installActionsWorkflow(t, e.testID, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs) + installActionsWorkflow(t, e.testName, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs) } func (e *env) verifyActionsWorkflowRun(t *testing.T) { @@ -302,13 +305,13 @@ func createTestJobs(id, testResultCMNamePrefix string, numJobs int) []job { const Branch = "main" -func installActionsWorkflow(t *testing.T, testID, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) { +func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - wfName := "E2E " + testID + wfName := "E2E " + testName wf := testing.Workflow{ Name: wfName, On: testing.On{ From 9e356b419e9032292e28000eb040612c4d65ad1e Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sat, 19 Feb 2022 09:27:41 +0000 Subject: [PATCH 04/12] chart: Add default-logs-container annotation to controller pods so that you can run `kubectl logs` on controller pods without the specifying the container name. It is especially useful when you want to run kubectl-logs on all ARC pods across controller-manager and github-webhook-server like: ``` kubectl -n actions-runner-system logs -l app.kubernetes.io/name=actions-runner-controller ``` That was previously impossible due to that the selector matches pods from both controller-manager and github-webhook-server and kubectl does not provide a way to specify container names for respective pods. --- charts/actions-runner-controller/templates/deployment.yaml | 1 + .../templates/githubwebhook.deployment.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/charts/actions-runner-controller/templates/deployment.yaml b/charts/actions-runner-controller/templates/deployment.yaml index 8885a6a5..0b9ed7d4 100644 --- a/charts/actions-runner-controller/templates/deployment.yaml +++ b/charts/actions-runner-controller/templates/deployment.yaml @@ -14,6 +14,7 @@ spec: metadata: {{- with .Values.podAnnotations }} annotations: + kubectl.kubernetes.io/default-logs-container: "manager" {{- toYaml . 
| nindent 8 }} {{- end }} labels: diff --git a/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml b/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml index 3776fb09..6dbb428c 100644 --- a/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml +++ b/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml @@ -15,6 +15,7 @@ spec: metadata: {{- with .Values.githubWebhookServer.podAnnotations }} annotations: + kubectl.kubernetes.io/default-logs-container: "github-webhook-server" {{- toYaml . | nindent 8 }} {{- end }} labels: From 3c1618837108e7e8db28db43038cc5cc9aef1048 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sat, 19 Feb 2022 16:12:39 +0000 Subject: [PATCH 05/12] Introduce consistent timeouts for runner unregistration and runner pod deletion Enhances runner controller and runner pod controller to have consistent timeouts for runner unregistration and runner pod deletion, so that we are very much unlikely to terminate pods that are running any jobs. --- controllers/runner_controller.go | 118 ++++++------- controllers/runner_graceful_stop.go | 239 +++++++++++++++++++++++++++ controllers/runner_pod_controller.go | 42 ++--- controllers/unregister.go | 49 ------ 4 files changed, 306 insertions(+), 142 deletions(-) create mode 100644 controllers/runner_graceful_stop.go delete mode 100644 controllers/unregister.go diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 1bfa8bcc..d007e9fa 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -110,8 +110,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } } else { + var p *corev1.Pod + + { + var pod corev1.Pod + if err := r.Get(ctx, req.NamespacedName, &pod); err != nil { + if !kerrors.IsNotFound(err) { + log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err)) + + return ctrl.Result{Requeue: true}, nil + } + } else { + p = &pod + } + } + // Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner - return r.processRunnerDeletion(runner, ctx, log) + return r.processRunnerDeletion(runner, ctx, log, p) } registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly) @@ -159,21 +174,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr // If pod has ended up succeeded we need to restart it // Happens e.g. when dind is in runner and run completes - stopped := pod.Status.Phase == corev1.PodSucceeded - - if !stopped { - if pod.Status.Phase == corev1.PodRunning { - for _, status := range pod.Status.ContainerStatuses { - if status.Name != containerName { - continue - } - - if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { - stopped = true - } - } - } - } + stopped := runnerPodOrContainerIsStopped(&pod) restart := stopped @@ -404,64 +405,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } - // Try to delete current pod if recreation is needed - safeToDeletePod := false - ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - if err != nil { - log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name) - } else { - // `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub. 
- if !ok { - log.Info("Runner no longer exists on GitHub", "runner", runner.Name) - } - - safeToDeletePod = true + updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) + if res != nil { + return *res, err } - if safeToDeletePod { - // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service. - // This should help us avoid race condition between runner pickup job after we think the runner is not busy. - if err := r.Delete(ctx, &pod); err != nil { - log.Error(err, "Failed to delete pod resource") - return ctrl.Result{}, err - } - - r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) - log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service. + // This should help us avoid race condition between runner pickup job after we think the runner is not busy. + if err := r.Delete(ctx, updatedPod); err != nil { + log.Error(err, "Failed to delete pod resource") + return ctrl.Result{}, err } + r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) + log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + return ctrl.Result{}, nil } -func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) { +func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + stopped := pod.Status.Phase == corev1.PodSucceeded + + if !stopped { + if pod.Status.Phase == corev1.PodRunning { + for _, status := range pod.Status.ContainerStatuses { + if status.Name != containerName { + continue + } + + if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { + stopped = true + } + } + } + } + + return stopped +} + +func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) { finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { - if len(runner.Status.Registration.Token) > 0 { - ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - if err != nil { - if errors.Is(err, &gogithub.RateLimitError{}) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to unregister runner due to GitHub API rate limits. 
Delaying retry for %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) - - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err - } - - return ctrl.Result{}, err - } - - if !ok { - log.V(1).Info("Runner no longer exists on GitHub") - } - } else { - log.V(1).Info("Runner was never registered on GitHub") + _, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) + if res != nil { + return *res, err } newRunner := runner.DeepCopy() diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go new file mode 100644 index 00000000..ce242d53 --- /dev/null +++ b/controllers/runner_graceful_stop.go @@ -0,0 +1,239 @@ +package controllers + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/actions-runner-controller/actions-runner-controller/github" + "github.com/go-logr/logr" + gogithub "github.com/google/go-github/v39/github" + corev1 "k8s.io/api/core/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + unregistrationCompleteTimestamp = "unregistration-complete-timestamp" + unregistrationStartTimestamp = "unregistration-start-timestamp" +) + +// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that +// we can delete the runner pod without disrupting a workflow job. +// +// This function returns a non-nil pointer to corev1.Pod as the first return value +// if the runner is considered to have gracefully stopped, hence it's pod is safe for deletion. +// +// It's a "tick" operation so a graceful stop can take multiple calls to complete. +// This function is designed to complete a length graceful stop process in a unblocking way. +// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value. +// The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry. 
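+//
+// A typical caller therefore looks roughly like the following sketch (mirroring the callers added to
+// runner_controller.go and runner_pod_controller.go in this patch; variable names are illustrative):
+//
+//	updatedPod, res, err := tickRunnerGracefulStop(ctx, log, ghClient, c, enterprise, org, repo, name, &pod)
+//	if res != nil {
+//		return *res, err // postpone this reconciliation and retry the graceful stop later
+//	}
+//	// updatedPod can now be deleted safely, e.g. via c.Delete(ctx, updatedPod)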
+func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) { + if pod != nil { + if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok { + updated := pod.DeepCopy() + setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339)) + if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp)) + return nil, &ctrl.Result{}, err + } + pod = updated + + log.Info("Runner has started unregistration") + } else { + log.Info("Runner has already started unregistration") + } + } + + if res, err := ensureRunnerUnregistration(ctx, 10*time.Minute, 30*time.Second, log, ghClient, enterprise, organization, repository, runner, pod); res != nil { + return nil, res, err + } + + if pod != nil { + if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok { + updated := pod.DeepCopy() + setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339)) + if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp)) + return nil, &ctrl.Result{}, err + } + pod = updated + + log.Info("Runner has completed unregistration") + } else { + log.Info("Runner has already completed unregistration") + } + } + + return pod, nil, nil +} + +// If the first return value is nil, it's safe to delete the runner pod. +func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) { + ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner) + if err != nil { + if errors.Is(err, &gogithub.RateLimitError{}) { + // We log the underlying error when we failed calling GitHub API to list or unregisters, + // or the runner is still busy. + log.Error( + err, + fmt.Sprintf( + "Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls", + retryDelayOnGitHubAPIRateLimitError, + ), + ) + + return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err + } + + log.Error(err, "Failed to unregister runner before deleting the pod.") + + return &ctrl.Result{}, err + } else if ok { + log.Info("Runner has just been unregistered. Removing the runner pod.") + } else if pod == nil { + // `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub. + // However, that doesn't always mean the pod can be safely removed. + // + // If the pod does not exist for the runner, + // it may be due to that the runner pod has never been created. + // In that case we can safely assume that the runner will never be registered. + + log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.") + } else if pod.Annotations[unregistrationCompleteTimestamp] != "" { + // If it's already unregistered in the previous reconcilation loop, + // you can safely assume that it won't get registered again so it's safe to delete the runner pod. 
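+		// (This annotation is written by tickRunnerGracefulStop right after a successful unregistration,
+		// so seeing it here means an earlier reconciliation loop already completed the process.)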
+ log.Info("Runner pod is marked as already unregistered.") + } else if runnerPodOrContainerIsStopped(pod) { + // If it's an ephemeral runner with the actions/runner container exited with 0, + // we can safely assume that it has unregistered itself from GitHub Actions + // so it's natural that RemoveRunner fails due to 404. + + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + log.Info("Runner pod has been stopped with a successful status.") + } else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" { + t, err := time.Parse(time.RFC3339, ts) + if err != nil { + return &ctrl.Result{RequeueAfter: retryDelay}, err + } + + if r := time.Until(t.Add(unregistrationTimeout)); r > 0 { + log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r) + return &ctrl.Result{RequeueAfter: retryDelay}, err + } + + log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout) + } else { + // A runner and a runner pod that is created by this version of ARC should match + // any of the above branches. + // + // But we leave this match all branch for potential backward-compatibility. + // The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process, + // and retry later. + log.V(1).Info("Runner unregistration is being retried later.") + + return &ctrl.Result{RequeueAfter: retryDelay}, nil + } + + return nil, nil +} + +func getAnnotation(pod *corev1.Pod, key string) (string, bool) { + if pod.Annotations == nil { + return "", false + } + + v, ok := pod.Annotations[key] + + return v, ok +} + +func setAnnotation(pod *corev1.Pod, key, value string) { + if pod.Annotations == nil { + pod.Annotations = map[string]string{} + } + + pod.Annotations[key] = value +} + +// unregisterRunner unregisters the runner from GitHub Actions by name. +// +// This function returns: +// +// Case 1. (true, nil) when it has successfully unregistered the runner. +// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered(hence we couldn't see it's existence from GitHub Actions API yet) +// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to +// an error returned by GitHub API. +// +// When the returned values is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately. +// In other words, all those three sub-cases cannot be distinguished by this function alone. +// +// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run). +// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere, +// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil). +// +// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed. 
+// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever. +// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side. +// +// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up. +// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner. +// +// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back +// as long as you recreate the runner pod. +// +// If it was "2-3.", you need a workaround to avoid the race condition. +// +// You shall introduce a "grace period" mechanism, similar or equal to that is required for "Case 2-2.", so that you ever +// start the runner pod deletion only after it's more and more likely that the runner pod is not coming up. +// +// Beware though, you need extra care to set an appropriate grace period depending on your environment. +// There isn't a single right grace period that works for everyone. +// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions, +// while the shorter the grace period is, the more likely you may encounter the race issue. +func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { + runners, err := client.ListRunners(ctx, enterprise, org, repo) + if err != nil { + return false, err + } + + id := int64(0) + for _, runner := range runners { + if runner.GetName() == name { + id = runner.GetID() + break + } + } + + if id == int64(0) { + return false, nil + } + + // For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true. + // The reason ARC did so was to let a runner running a job to not stop prematurely. + // + // However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner, + // so ARC doesn't need to do anything special for a graceful runner stop. + // It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and + // the runner pod is safe for deletion. + // + // Trying to remove a busy runner can result in errors like the following: + // failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" [] + // + // # NOTES + // + // - It can be "status=offline" at the same time but that's another story. + // - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to + // determine if the runner is busy can be more outdated than before, as those responeses are now cached for 60 seconds. + // - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't + // change from 60 seconds. + // + // TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422? 
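+	// Reaching this point means ListRunners did find the runner (id != 0), so a successful RemoveRunner
+	// call below maps to "Case 1. (true, nil)" in the doc comment above, and a failure to "Case 3. (false, err)".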
+ if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil { + return false, err + } + + return true, nil +} diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index d43eea37..57851f16 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -105,33 +105,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) if removed { - ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name) - if err != nil { - if errors.Is(err, &gogithub.RateLimitError{}) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) - - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err - } - - return ctrl.Result{}, err + updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + if res != nil { + return *res, err } - if !ok { - log.V(1).Info("Runner no longer exists on GitHub") - } + patchedPod := updatedPod.DeepCopy() + patchedPod.ObjectMeta.Finalizers = finalizers - newRunner := runnerPod.DeepCopy() - newRunner.ObjectMeta.Finalizers = finalizers - - if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil { + // We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster. + if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil { log.Error(err, "Failed to update runner for finalizer removal") return ctrl.Result{}, err } @@ -365,8 +348,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } + updated, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + if res != nil { + return *res, err + } + // Delete current pod if recreation is needed - if err := r.Delete(ctx, &runnerPod); err != nil { + if err := r.Delete(ctx, updated); err != nil { log.Error(err, "Failed to delete pod resource") return ctrl.Result{}, err } @@ -377,10 +365,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } -func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) { - return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name) -} - func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error { name := "runnerpod-controller" if r.Name != "" { diff --git a/controllers/unregister.go b/controllers/unregister.go deleted file mode 100644 index 9c139be0..00000000 --- a/controllers/unregister.go +++ /dev/null @@ -1,49 +0,0 @@ -package controllers - -import ( - "context" - "fmt" - - "github.com/actions-runner-controller/actions-runner-controller/github" -) - -// unregisterRunner unregisters the runner from GitHub Actions by name. -// -// This function returns: -// - (true, nil) when it has successfully unregistered the runner. -// - (false, nil) when the runner has been already unregistered. 
-// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to -// an error returned by GitHub API. -func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { - runners, err := client.ListRunners(ctx, enterprise, org, repo) - if err != nil { - return false, err - } - - id := int64(0) - for _, runner := range runners { - if runner.GetName() == name { - // Note that sometimes a runner can stuck "busy" even though it is already "offline". - // But we assume that it's not actually offline and still running a job. - if runner.GetBusy() { - return false, fmt.Errorf("runner is busy") - } - id = runner.GetID() - break - } - } - - if id == int64(0) { - return false, nil - } - - // Trying to remove a busy runner can result in errors like the following: - // failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" [] - // - // TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422? - if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil { - return false, err - } - - return true, nil -} From 4e6bfd811491b11c4ca5a14ff0127947c2ef05b6 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 04:37:15 +0000 Subject: [PATCH 06/12] e2e: Add ability to toggle dockerdWithinRunnerContainer --- .../testdata/runnerdeploy.envsubst.yaml | 1 + test/e2e/e2e_test.go | 28 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml index ad935125..4ac0b704 100644 --- a/acceptance/testdata/runnerdeploy.envsubst.yaml +++ b/acceptance/testdata/runnerdeploy.envsubst.yaml @@ -28,6 +28,7 @@ spec: ## Replace `mumoshu/actions-runner-dind:dev` with your dind image #dockerdWithinRunnerContainer: true #image: mumoshu/actions-runner-dind:dev + dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER} # # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index e6d25f12..3b24ba91 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -17,12 +17,15 @@ var ( controllerImageTag = "e2e" controllerImage = testing.Img(controllerImageRepo, controllerImageTag) runnerImageRepo = "actionsrunnercontrollere2e/actions-runner" + runnerDindImageRepo = "actionsrunnercontrollere2e/actions-runner-dind" runnerImageTag = "e2e" runnerImage = testing.Img(runnerImageRepo, runnerImageTag) + runnerDindImage = testing.Img(runnerDindImageRepo, runnerImageTag) prebuildImages = []testing.ContainerImage{ controllerImage, runnerImage, + runnerDindImage, } builds = []testing.DockerBuild{ @@ -36,6 +39,11 @@ var ( Args: []testing.BuildArg{}, Image: runnerImage, }, + { + Dockerfile: "../../runner/Dockerfile.dindrunner", + Args: []testing.BuildArg{}, + Image: runnerDindImage, + }, } certManagerVersion = "v1.1.1" @@ -52,7 +60,6 @@ var ( "SYNC_PERIOD=" + "10s", "NAME=" + controllerImageRepo, "VERSION=" + controllerImageTag, - "RUNNER_NAME=" + runnerImageRepo, "RUNNER_TAG=" + runnerImageTag, } @@ -173,6 +180,7 @@ type env struct { githubTokenWebhook string testEnterprise string featureFlagEphemeral bool + dockerdWithinRunnerContainer bool 
testJobs []job } @@ -203,6 +211,12 @@ func initTestEnv(t *testing.T) *env { ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL")) e.featureFlagEphemeral = ephemeral + var err error + e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false")) + if err != nil { + panic(fmt.Sprintf("unable to parse bool from TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER: %v", err)) + } + return e } @@ -260,6 +274,18 @@ func (e *env) installActionsRunnerController(t *testing.T) { fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral), } + if e.dockerdWithinRunnerContainer { + varEnv = append(varEnv, + "RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=true", + "RUNNER_NAME="+runnerDindImageRepo, + ) + } else { + varEnv = append(varEnv, + "RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=false", + "RUNNER_NAME="+runnerImageRepo, + ) + } + scriptEnv = append(scriptEnv, varEnv...) scriptEnv = append(scriptEnv, commonScriptEnv...) From 79a31328a55bec31f6bf77a641734976b6e24551 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 04:42:19 +0000 Subject: [PATCH 07/12] Stop recreating ephemeral runner pod Ref https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 --- controllers/runner_controller.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index d007e9fa..efbb74bc 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -176,6 +176,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr // Happens e.g. when dind is in runner and run completes stopped := runnerPodOrContainerIsStopped(&pod) + ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral + + if stopped && ephemeral { + log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.") + + // This is the key to make ephemeral runners to work reliably with webhook-based autoscale. + // See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context. + // + // In the next reconcilation loop, this triggers a runner unregistration. + // (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation) + // + // See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true, + // which handles the unregistrationa the removal of the completed pod, and so on. 
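+		//
+		// Note that r.Delete below deletes the Runner resource itself, not the pod: with the controller's
+		// finalizer in place this only marks the Runner with a DeletionTimestamp, and the deletion branch
+		// referenced above then performs the unregistration and the removal of the completed pod.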
+ if err := r.Delete(ctx, &runner); err != nil { + log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.") + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + + return ctrl.Result{Requeue: true}, nil + } + restart := stopped if registrationOnly && stopped { From a6f0e0008f3c1e6c1504308657284cee87f53a36 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 07:45:49 +0000 Subject: [PATCH 08/12] Make unregistration timeout and retry delay configurable in integration tests --- controllers/integration_test.go | 2 ++ controllers/runner_controller.go | 25 +++++++++++++++++++++++-- controllers/runner_graceful_stop.go | 14 ++++++++++++-- controllers/runner_pod_controller.go | 25 +++++++++++++++++++++++-- 4 files changed, 60 insertions(+), 6 deletions(-) diff --git a/controllers/integration_test.go b/controllers/integration_test.go index d87bdb18..2e038e4d 100644 --- a/controllers/integration_test.go +++ b/controllers/integration_test.go @@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment { Name: controllerName("runner"), RegistrationRecheckInterval: time.Millisecond, RegistrationRecheckJitter: time.Millisecond, + UnregistrationTimeout: 1 * time.Second, + UnregistrationRetryDelay: 1 * time.Second, } err = runnerController.SetupWithManager(mgr) Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller") diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index efbb74bc..03c22b27 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -72,6 +72,9 @@ type RunnerReconciler struct { Name string RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration + + UnregistrationTimeout time.Duration + UnregistrationRetryDelay time.Duration } // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete @@ -426,7 +429,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } - updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) + updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) if res != nil { return *res, err } @@ -470,7 +473,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { - _, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) + _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) if res != nil { return *res, err } @@ -489,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con return ctrl.Result{}, nil } +func (r *RunnerReconciler) unregistrationTimeout() time.Duration { + unregistrationTimeout := DefaultUnregistrationTimeout + + if r.UnregistrationTimeout > 0 { + unregistrationTimeout = r.UnregistrationTimeout + } + return unregistrationTimeout +} + +func (r *RunnerReconciler) 
unregistrationRetryDelay() time.Duration { + retryDelay := DefaultUnregistrationRetryDelay + + if r.UnregistrationRetryDelay > 0 { + retryDelay = r.UnregistrationRetryDelay + } + return retryDelay +} + func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) { deletionTimeout := 1 * time.Minute currentTime := time.Now() diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index ce242d53..83e4b41e 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -17,6 +17,16 @@ import ( const ( unregistrationCompleteTimestamp = "unregistration-complete-timestamp" unregistrationStartTimestamp = "unregistration-start-timestamp" + + // DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name) + // and RemoveRunner API (to actually unregister the runner) calls. + // This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s + // and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response + // up to 60 seconds (or even more depending on the situation). + DefaultUnregistrationTimeout = 60 * time.Second + + // This can be any value but a larger value can make an unregistration timeout longer than configured in practice. + DefaultUnregistrationRetryDelay = 30 * time.Second ) // tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that @@ -29,7 +39,7 @@ const ( // This function is designed to complete a length graceful stop process in a unblocking way. // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value. // The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry. 
-func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) { +func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) { if pod != nil { if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok { updated := pod.DeepCopy() @@ -46,7 +56,7 @@ func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *gith } } - if res, err := ensureRunnerUnregistration(ctx, 10*time.Minute, 30*time.Second, log, ghClient, enterprise, organization, repository, runner, pod); res != nil { + if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil { return nil, res, err } diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index 57851f16..e93f53c8 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -47,6 +47,9 @@ type RunnerPodReconciler struct { Name string RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration + + UnregistrationTimeout time.Duration + UnregistrationRetryDelay time.Duration } const ( @@ -105,7 +108,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) if removed { - updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -348,7 +351,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } - updated, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -365,6 +368,24 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } +func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration { + unregistrationTimeout := DefaultUnregistrationTimeout + + if r.UnregistrationTimeout > 0 { + unregistrationTimeout = r.UnregistrationTimeout + } + return unregistrationTimeout +} + +func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration { + retryDelay := DefaultUnregistrationRetryDelay + + if r.UnregistrationRetryDelay > 0 { + retryDelay = r.UnregistrationRetryDelay + } + return retryDelay +} + func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error { name := "runnerpod-controller" if r.Name != "" { From d4a9750e2031ece5f6ee65bb4e30e841a125e159 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 13:40:03 +0000 Subject: [PATCH 09/12] acceptance,e2e: Enhance E2E test and deploy.sh to support scaleDownDelaySeconds~ and minReplicas for 
HRA --- acceptance/deploy.sh | 2 +- acceptance/testdata/runnerdeploy.envsubst.yaml | 3 ++- test/e2e/e2e_test.go | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/acceptance/deploy.sh b/acceptance/deploy.sh index ece1b6d7..953293e8 100755 --- a/acceptance/deploy.sh +++ b/acceptance/deploy.sh @@ -83,7 +83,7 @@ else fi if [ -n "${TEST_ORG}" ]; then - cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f - + cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl apply -f - if [ -n "${TEST_ORG_GROUP}" ]; then cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f - diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml index 4ac0b704..3ecea4f6 100644 --- a/acceptance/testdata/runnerdeploy.envsubst.yaml +++ b/acceptance/testdata/runnerdeploy.envsubst.yaml @@ -58,5 +58,6 @@ spec: - githubEvent: {} amount: 1 duration: "1m" - minReplicas: 0 + minReplicas: ${RUNNER_MIN_REPLICAS} maxReplicas: 10 + scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT} diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 3b24ba91..728ba4ff 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -180,6 +180,8 @@ type env struct { githubTokenWebhook string testEnterprise string featureFlagEphemeral bool + scaleDownDelaySecondsAfterScaleOut int64 + minReplicas int64 dockerdWithinRunnerContainer bool testJobs []job } @@ -210,6 +212,8 @@ func initTestEnv(t *testing.T) *env { e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100) ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL")) e.featureFlagEphemeral = ephemeral + e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32) + e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32) var err error e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false")) @@ -272,6 +276,8 @@ func (e *env) installActionsRunnerController(t *testing.T) { "RUNNER_LABEL=" + e.runnerLabel, "TEST_ID=" + e.testID, fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral), + fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut), + fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas), } if e.dockerdWithinRunnerContainer { From b8e65aa857cf37a4cd829cb3ba61a1edf11ab468 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 06:59:56 +0000 Subject: [PATCH 10/12] Prevent unnecessary ephemeral runner recreations --- .../horizontalrunnerautoscaler_types.go | 3 + api/v1alpha1/runnerdeployment_types.go | 8 +++ api/v1alpha1/runnerreplicaset_types.go | 9 +++ api/v1alpha1/zz_generated.deepcopy.go | 9 +++ ...rwind.dev_horizontalrunnerautoscalers.yaml | 3 + ...ions.summerwind.dev_runnerdeployments.yaml | 5 ++ ...ions.summerwind.dev_runnerreplicasets.yaml | 5 ++ ...rwind.dev_horizontalrunnerautoscalers.yaml | 3 + ...ions.summerwind.dev_runnerdeployments.yaml | 5 ++ ...ions.summerwind.dev_runnerreplicasets.yaml | 5 ++ .../horizontal_runner_autoscaler_webhook.go | 4 +- 
.../horizontalrunnerautoscaler_controller.go | 22 ++++++++ controllers/runnerdeployment_controller.go | 8 ++- controllers/runnerreplicaset_controller.go | 56 +++++++++++++++++-- 14 files changed, 137 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/horizontalrunnerautoscaler_types.go b/api/v1alpha1/horizontalrunnerautoscaler_types.go index 350583e7..b5da8275 100644 --- a/api/v1alpha1/horizontalrunnerautoscaler_types.go +++ b/api/v1alpha1/horizontalrunnerautoscaler_types.go @@ -107,6 +107,9 @@ type CapacityReservation struct { Name string `json:"name,omitempty"` ExpirationTime metav1.Time `json:"expirationTime,omitempty"` Replicas int `json:"replicas,omitempty"` + + // +optional + EffectiveTime metav1.Time `json:"effectiveTime,omitempty"` } type ScaleTargetRef struct { diff --git a/api/v1alpha1/runnerdeployment_types.go b/api/v1alpha1/runnerdeployment_types.go index 23b0d0d2..588a6855 100644 --- a/api/v1alpha1/runnerdeployment_types.go +++ b/api/v1alpha1/runnerdeployment_types.go @@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct { // +nullable Replicas *int `json:"replicas,omitempty"` + // EffectiveTime is the time the upstream controller requested to sync Replicas. + // It is usually populated by the webhook-based autoscaler via HRA. + // The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + // + // +optional + // +nullable + EffectiveTime *metav1.Time `json:"effectiveTime"` + // +optional // +nullable Selector *metav1.LabelSelector `json:"selector"` diff --git a/api/v1alpha1/runnerreplicaset_types.go b/api/v1alpha1/runnerreplicaset_types.go index 4c698feb..9ecf1349 100644 --- a/api/v1alpha1/runnerreplicaset_types.go +++ b/api/v1alpha1/runnerreplicaset_types.go @@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct { // +nullable Replicas *int `json:"replicas,omitempty"` + // EffectiveTime is the time the upstream controller requested to sync Replicas. + // It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. + // The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners + // based on potentially outdated Replicas value. + // + // +optional + // +nullable + EffectiveTime *metav1.Time `json:"effectiveTime"` + // +optional // +nullable Selector *metav1.LabelSelector `json:"selector"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d040cc78..d37ce954 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry { func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) { *out = *in in.ExpirationTime.DeepCopyInto(&out.ExpirationTime) + in.EffectiveTime.DeepCopyInto(&out.EffectiveTime) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation. 
@@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) { *out = new(int) **out = **in } + if in.EffectiveTime != nil { + in, out := &in.EffectiveTime, &out.EffectiveTime + *out = (*in).DeepCopy() + } if in.Selector != nil { in, out := &in.Selector, &out.Selector *out = new(metav1.LabelSelector) @@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) { *out = new(int) **out = **in } + if in.EffectiveTime != nil { + in, out := &in.EffectiveTime, &out.EffectiveTime + *out = (*in).DeepCopy() + } if in.Selector != nil { in, out := &in.Selector, &out.Selector *out = new(metav1.LabelSelector) diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml index 0a34c042..2f1719fd 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml @@ -49,6 +49,9 @@ spec: items: description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime. properties: + effectiveTime: + format: date-time + type: string expirationTime: format: date-time type: string diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml index 1883f901..9a693f9b 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml @@ -48,6 +48,11 @@ spec: spec: description: RunnerDeploymentSpec defines the desired state of RunnerDeployment properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml index 1bd5051d..7527c088 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml @@ -45,6 +45,11 @@ spec: spec: description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value. 
+ format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml index 0a34c042..2f1719fd 100644 --- a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml +++ b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml @@ -49,6 +49,9 @@ spec: items: description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime. properties: + effectiveTime: + format: date-time + type: string expirationTime: format: date-time type: string diff --git a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml index 1883f901..9a693f9b 100644 --- a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml +++ b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml @@ -48,6 +48,11 @@ spec: spec: description: RunnerDeploymentSpec defines the desired state of RunnerDeployment properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml index 1bd5051d..7527c088 100644 --- a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml +++ b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml @@ -45,6 +45,11 @@ spec: spec: description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/controllers/horizontal_runner_autoscaler_webhook.go b/controllers/horizontal_runner_autoscaler_webhook.go index ea407fdb..d097b7e9 100644 --- a/controllers/horizontal_runner_autoscaler_webhook.go +++ b/controllers/horizontal_runner_autoscaler_webhook.go @@ -770,8 +770,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context. 
capacityReservations := getValidCapacityReservations(copy) if amount > 0 { + now := time.Now() copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{ - ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)}, + EffectiveTime: metav1.Time{Time: now}, + ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)}, Replicas: amount, }) } else if amount < 0 { diff --git a/controllers/horizontalrunnerautoscaler_controller.go b/controllers/horizontalrunnerautoscaler_controller.go index 2fd949bb..140a8f51 100644 --- a/controllers/horizontalrunnerautoscaler_controller.go +++ b/controllers/horizontalrunnerautoscaler_controller.go @@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error { currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas) + ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral + + var effectiveTime *time.Time + + for _, r := range hra.Spec.CapacityReservations { + t := r.EffectiveTime + if effectiveTime == nil || effectiveTime.Before(t.Time) { + effectiveTime = &t.Time + } + } + // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { copy := rd.DeepCopy() copy.Spec.Replicas = &newDesiredReplicas + if ephemeral && effectiveTime != nil { + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + } + + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { + return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) + } + } else if ephemeral && effectiveTime != nil { + copy := rd.DeepCopy() + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) } diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 6725bc64..9a55ae3d 100644 --- a/controllers/runnerdeployment_controller.go +++ b/controllers/runnerdeployment_controller.go @@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { newestSet.Spec.Replicas = &newDesiredReplicas + newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime if err := r.Client.Update(ctx, newestSet); err != nil { log.Error(err, "Failed to update runnerreplicaset resource") @@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str Labels: newRSTemplate.ObjectMeta.Labels, }, Spec: v1alpha1.RunnerReplicaSetSpec{ - Replicas: rd.Spec.Replicas, - Selector: newRSSelector, - Template: newRSTemplate, + Replicas: rd.Spec.Replicas, + Selector: newRSSelector, + Template: newRSTemplate, + EffectiveTime: rd.Spec.EffectiveTime, }, } diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go index 2acc93aa..e4ac040f 100644 --- a/controllers/runnerreplicaset_controller.go +++ b/controllers/runnerreplicaset_controller.go @@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct { Name string } +const ( + SyncTimeAnnotationKey = "sync-time" +) + // 
+kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch @@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - var myRunners []v1alpha1.Runner - var ( current int ready int available int + + lastSyncTime *time.Time ) for _, r := range allRunners.Items { // This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before // to not treat all the runners in the namespace as its children. if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) { - myRunners = append(myRunners, r) + // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (which can be caused by an ephemeral runner completion) + // or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop), + // we don't need to bother calling the GitHub API to re-mark the runner for deletion. + // Just hold on, and runners will disappear as long as the runner controller is up and running. + if !r.DeletionTimestamp.IsZero() { + continue + } + + if r.Annotations != nil { + if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok { + t, err := time.Parse(time.RFC3339, a) + if err == nil { + if lastSyncTime == nil || lastSyncTime.Before(t) { + lastSyncTime = &t + } + } + } + } current += 1 @@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - if current > desired { + effectiveTime := rs.Spec.EffectiveTime + ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral + + if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { + log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to ephemeral runner completions, so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready) + } else if current > desired { + // If you use ephemeral runners with the webhook-based autoscaler and the runner controller is working normally, + // you're unlikely to fall into this branch. + // + // That's because all the stakeholders work like this: + // + // 1. A runner pod completes with the runner container exiting with code 0 + // 2. The ARC runner controller detects the pod completion and marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero) + // 3. GitHub triggers a corresponding workflow_job "completed" webhook event + // 4. The ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA by removing the oldest capacity reservation + // 5. The ARC horizontalrunnerautoscaler controller updates the RunnerDeployment's desired replicas based on capacity reservations + // 6. The ARC runnerdeployment controller updates the RunnerReplicaSet's desired replicas + // 7. 
(We're here) The ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet + // + // In a normally working ARC installation, the runner that was used to run the workflow job should already have been + // marked for deletion by the runner controller. + // This runnerreplicaset controller doesn't count runners marked for deletion toward the `current` value, hence you're unlikely to + // fall into this branch when you're using ephemeral runners with the webhook-based autoscaler. + n := current - desired log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready) @@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1 objectMeta.GenerateName = rs.ObjectMeta.Name + "-" objectMeta.Namespace = rs.ObjectMeta.Namespace + if objectMeta.Annotations == nil { + objectMeta.Annotations = map[string]string{} + } + objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339) runner := v1alpha1.Runner{ TypeMeta: metav1.TypeMeta{}, From 5bc16f2619d3cdf3eb5e2ea714d97fffab4bc8c3 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Mon, 21 Feb 2022 00:06:26 +0000 Subject: [PATCH 11/12] Enhance HRA capacity reservation update log --- controllers/horizontal_runner_autoscaler_webhook.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/controllers/horizontal_runner_autoscaler_webhook.go b/controllers/horizontal_runner_autoscaler_webhook.go index d097b7e9..9553c33c 100644 --- a/controllers/horizontal_runner_autoscaler_webhook.go +++ b/controllers/horizontal_runner_autoscaler_webhook.go @@ -792,10 +792,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context. copy.Spec.CapacityReservations = reservations } - autoscaler.Log.Info( + before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations) + expired := before - len(capacityReservations) + after := len(copy.Spec.CapacityReservations) + + autoscaler.Log.V(1).Info( "Patching hra for capacityReservations update", - "before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations, - "after", copy.Spec.CapacityReservations, + "before", before, + "expired", expired, + "amount", amount, + "after", after, ) if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil { From 1463d4927f013b023c36351fd3ca3d90811846e4 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Mon, 21 Feb 2022 00:07:49 +0000 Subject: [PATCH 12/12] acceptance,e2e: Let capacity reservations expire later --- acceptance/testdata/runnerdeploy.envsubst.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml index 3ecea4f6..e0862f60 100644 --- a/acceptance/testdata/runnerdeploy.envsubst.yaml +++ b/acceptance/testdata/runnerdeploy.envsubst.yaml @@ -57,7 +57,7 @@ spec: scaleUpTriggers: - githubEvent: {} amount: 1 - duration: "1m" + duration: "10m" minReplicas: ${RUNNER_MIN_REPLICAS} maxReplicas: 10 scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}
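The defaults introduced in PATCH 08 are sized around the 60-second Cache-Control window described in the runner_graceful_stop.go comment above: the unregistration timeout has to outlast at least one cached ListRunners response, and the retry delay has to leave room for at least one retry within that timeout. The snippet below is a minimal sketch of how the reconcilers fall back to those defaults when no override is configured; the function name and the standalone packaging are illustrative and not part of the patch.

```go
package main

import (
	"fmt"
	"time"
)

// Defaults mirroring the constants added in the patch: the timeout covers one full
// 60s cache window of the ListRunners response, and the retry delay fits at least
// one retry inside that window.
const (
	defaultUnregistrationTimeout    = 60 * time.Second
	defaultUnregistrationRetryDelay = 30 * time.Second
)

// effectiveUnregistrationSettings sketches the unregistrationTimeout() and
// unregistrationRetryDelay() getters on the reconcilers: a zero (unset) value
// means "use the default".
func effectiveUnregistrationSettings(configuredTimeout, configuredRetryDelay time.Duration) (time.Duration, time.Duration) {
	timeout := defaultUnregistrationTimeout
	if configuredTimeout > 0 {
		timeout = configuredTimeout
	}

	retryDelay := defaultUnregistrationRetryDelay
	if configuredRetryDelay > 0 {
		retryDelay = configuredRetryDelay
	}

	return timeout, retryDelay
}

func main() {
	// No overrides configured on the reconciler: both values fall back to the defaults.
	timeout, retryDelay := effectiveUnregistrationSettings(0, 0)
	fmt.Println(timeout, retryDelay) // 1m0s 30s
}
```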
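The recreation guard added to the runnerreplicaset controller in PATCH 10 boils down to comparing two timestamps: the newest sync-time annotation among the child runners, and the EffectiveTime propagated from the HRA's capacity reservations through the RunnerDeployment and RunnerReplicaSet specs. A condensed sketch of that comparison follows; the helper name is illustrative, and plain *time.Time stands in for metav1.Time for brevity.

```go
package main

import (
	"fmt"
	"time"
)

// shouldHoldOffRunnerCreation sketches the guard in the runnerreplicaset controller:
// when ephemeral runners have disappeared (current < desired) but the newest runner
// was created after the last time the HRA requested this replica count (EffectiveTime),
// the disappearance is most likely due to ephemeral runner completions, so the
// controller waits for a newer EffectiveTime instead of recreating runners.
func shouldHoldOffRunnerCreation(current, desired int, ephemeral bool, lastSyncTime, effectiveTime *time.Time) bool {
	return current < desired &&
		ephemeral &&
		lastSyncTime != nil &&
		effectiveTime != nil &&
		lastSyncTime.After(*effectiveTime)
}

func main() {
	effectiveTime := time.Now().Add(-2 * time.Minute) // when the HRA last requested this replica count
	lastSyncTime := time.Now().Add(-30 * time.Second) // newest "sync-time" annotation among child runners

	// Two runners exist while three are desired, but the runners were created after
	// EffectiveTime, so the controller logs and waits rather than creating a new runner.
	fmt.Println(shouldHoldOffRunnerCreation(2, 3, true, &lastSyncTime, &effectiveTime)) // true
}
```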