From 39fbbbc2a6576ff213f62b0ac4612a2335b7a3a6 Mon Sep 17 00:00:00 2001
From: Nikolay Edigaryev
Date: Wed, 16 Jul 2025 22:58:13 +0200
Subject: [PATCH] Disable Prometheus metrics by default (#331)

---
 internal/command/controller/run.go         |  9 ++++++++-
 internal/controller/controller.go          |  3 ++-
 internal/controller/option.go              |  6 ++++++
 internal/controller/scheduler/scheduler.go | 19 +++++++++++++------
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/internal/command/controller/run.go b/internal/command/controller/run.go
index 482a282..38e46d9 100644
--- a/internal/command/controller/run.go
+++ b/internal/command/controller/run.go
@@ -31,6 +31,7 @@ var sshNoClientAuth bool
 var experimentalRPCV2 bool
 var noExperimentalRPCV2 bool
 var experimentalPingInterval time.Duration
+var deprecatedPrometheusMetrics bool
 
 func newRunCommand() *cobra.Command {
 	cmd := &cobra.Command{
@@ -69,10 +70,12 @@ func newRunCommand() *cobra.Command {
 	_ = cmd.Flags().MarkHidden("experimental-rpc-v2")
 	cmd.Flags().BoolVar(&noExperimentalRPCV2, "no-experimental-rpc-v2", false,
 		"disable experimental RPC v2 (https://github.com/cirruslabs/orchard/issues/235)")
-	cmd.PersistentFlags().DurationVar(&experimentalPingInterval, "experimental-ping-interval", 0,
+	cmd.Flags().DurationVar(&experimentalPingInterval, "experimental-ping-interval", 0,
 		"interval between WebSocket PING's sent by the controller to workers and clients, "+
 			"useful when facing intermediate load balancers/proxies that have timeouts "+
 			"smaller than the controller's default 30 second interval")
+	cmd.Flags().BoolVar(&deprecatedPrometheusMetrics, "deprecated-prometheus-metrics", false,
+		"enable Prometheus metrics, which will soon be deprecated in favor of OpenTelemetry")
 
 	return cmd
 }
@@ -180,6 +183,10 @@ func runController(cmd *cobra.Command, args []string) (err error) {
 		controllerOpts = append(controllerOpts, controller.WithPingInterval(experimentalPingInterval))
 	}
 
+	if deprecatedPrometheusMetrics {
+		controllerOpts = append(controllerOpts, controller.WithPrometheusMetrics())
+	}
+
 	controllerInstance, err := controller.New(controllerOpts...)
 	if err != nil {
 		return err
diff --git a/internal/controller/controller.go b/internal/controller/controller.go
index 59538f7..a32d444 100644
--- a/internal/controller/controller.go
+++ b/internal/controller/controller.go
@@ -61,6 +61,7 @@ type Controller struct {
 	maxWorkersPerLicense uint
 	experimentalRPCV2 bool
 	pingInterval time.Duration
+	prometheusMetrics bool
 
 	sshListenAddr string
 	sshSigner ssh.Signer
@@ -122,7 +123,7 @@ func New(opts ...Option) (*Controller, error) {
 
 	// Instantiate the scheduler
 	controller.scheduler, err = scheduler.NewScheduler(store, controller.workerNotifier,
-		controller.workerOfflineTimeout, controller.logger)
+		controller.workerOfflineTimeout, controller.prometheusMetrics, controller.logger)
 	if err != nil {
 		return nil, err
 	}
diff --git a/internal/controller/option.go b/internal/controller/option.go
index 23687a9..d2e8941 100644
--- a/internal/controller/option.go
+++ b/internal/controller/option.go
@@ -65,6 +65,12 @@ func WithPingInterval(pingInterval time.Duration) Option {
 	}
 }
 
+func WithPrometheusMetrics() Option {
+	return func(controller *Controller) {
+		controller.prometheusMetrics = true
+	}
+}
+
 func WithLogger(logger *zap.Logger) Option {
 	return func(controller *Controller) {
 		controller.logger = logger.Sugar()
diff --git a/internal/controller/scheduler/scheduler.go b/internal/controller/scheduler/scheduler.go
index 85995d6..8fa3560 100644
--- a/internal/controller/scheduler/scheduler.go
+++ b/internal/controller/scheduler/scheduler.go
@@ -49,6 +49,7 @@ type Scheduler struct {
 	workerOfflineTimeout time.Duration
 	logger *zap.SugaredLogger
 	schedulingRequested chan bool
+	prometheusMetrics bool
 
 	schedulingTimeHistogram metric.Float64Histogram
 }
@@ -57,6 +58,7 @@ func NewScheduler(
 	store storepkg.Store,
 	notifier *notifier.Notifier,
 	workerOfflineTimeout time.Duration,
+	prometheusMetrics bool,
 	logger *zap.SugaredLogger,
 ) (*Scheduler, error) {
 	scheduler := &Scheduler{
@@ -65,6 +67,7 @@ func NewScheduler(
 		workerOfflineTimeout: workerOfflineTimeout,
 		logger: logger,
 		schedulingRequested: make(chan bool, 1),
+		prometheusMetrics: prometheusMetrics,
 	}
 
 	// Metrics
@@ -104,7 +107,9 @@ func (scheduler *Scheduler) Run() {
 
 		if err != nil {
 			scheduler.logger.Errorf("Failed to schedule VMs: %v", err)
-		} else {
+		}
+
+		if scheduler.prometheusMetrics {
 			schedulerLoopIterationStat.Inc()
 		}
 	}
@@ -395,12 +400,14 @@ func (scheduler *Scheduler) healthCheckingLoopIteration() error {
 		}
 
 		// Update metrics
-		workers, err := txn.ListWorkers()
-		if err != nil {
-			return err
-		}
+		if scheduler.prometheusMetrics {
+			workers, err := txn.ListWorkers()
+			if err != nil {
+				return err
+			}
 
-		scheduler.reportStats(workers, vms)
+			scheduler.reportStats(workers, vms)
+		}
 
 		return nil
 	}); err != nil {