diff --git a/.golangci.yml b/.golangci.yml index 1598c1d..0734e54 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -54,15 +54,6 @@ linters: - whitespace disable: - # Messages like "struct of size 104 bytes could be of size 96 bytes" from a package - # that was last updated 2 years ago[1] are barely helpful. - # - # After all, we're writing the code for other people, so let's trust the compiler here (that's - # constantly evolving compared to this linter) and revisit this if memory usage becomes a problem. - # - # [1]: https://github.com/mdempsky/maligned/commit/6e39bd26a8c8b58c5a22129593044655a9e25959 - - maligned - # We don't have high-performance requirements at this moment, so sacrificing # the code readability for marginal performance gains is not worth it. - prealloc @@ -75,19 +66,12 @@ linters: # Unfortunately, we use globals due to how spf13/cobra works. - gochecknoglobals - # That's fine that some Proto objects don't have all fields initialized - - exhaustivestruct - # Style linters that are total nuts. - wsl - gofumpt - goimports - funlen - # This conflicts with the Protocol Buffers Version 3 design, - # which is largely based on default values for struct fields. - - exhaustivestruct - # Enough parallelism for now. - paralleltest @@ -115,9 +99,6 @@ linters: # Needs package whitelists - depguard - # This is mostly a CLI tool, not a package, so it's OK to have dynamic errors - - goerr113 - issues: # Don't hide multiple issues that belong to one class since GitHub annotations can handle them all nicely. 
max-issues-per-linter: 0 diff --git a/DeploymentGuide.md b/DeploymentGuide.md index b507896..1f8b4ad 100644 --- a/DeploymentGuide.md +++ b/DeploymentGuide.md @@ -126,13 +126,36 @@ orchard context default production ## Configuring Orchard Workers +First, create a service account limited to a minimal set of roles required for proper worker functioning: + ```bash orchard create service-account worker-pool-m1 --roles "compute:read" --roles "compute:write" +``` + +Then, generate a bootstrap token: + +```shell orchard get bootstrap-token worker-pool-m1 ``` -## Configuring Orchard Workers +Then, for each worker machine, start the worker as follows: + +```shell +orchard worker run --bootstrap-token +``` + +### Automation If you have a set of machines that you want to use as Orchard Workers, you can use Ansible to configure them. Please refer a [separate repository](https://github.com/cirruslabs/ansible-orchard) where we prepared a basic Ansible playbook for convenient setup. + +## Observability + +Both the controller and worker produce some useful OpenTelemetry metrics. Metrics are scoped with the `org.cirruslabs.orchard` prefix and include information about resource utilization, statuses of workers, scheduling/pull time and much more. + +By default, the telemetry is sent to https://localhost:4317 using the gRPC protocol and to http://localhost:4318 using the HTTP protocol. + +You can override this by setting the [standard OpenTelemetry environment variable](https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/) `OTEL_EXPORTER_OTLP_ENDPOINT`. + +Please refer to [OTEL Collector documentation](https://opentelemetry.io/docs/collector/) for instructions on how to set up a sidecar for metrics collection or find out if your SaaS monitoring has an available OTEL endpoint (see [Honeycomb](https://docs.honeycomb.io/send-data/opentelemetry/) as an example). 
diff --git a/go.mod b/go.mod index fae13cd..56a057b 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,8 @@ module github.com/cirruslabs/orchard -go 1.19 +go 1.21 + +toolchain go1.22.4 require ( github.com/avast/retry-go v3.0.0+incompatible @@ -11,8 +13,8 @@ require ( github.com/gin-gonic/gin v1.9.0 github.com/go-openapi/runtime v0.25.0 github.com/gofrs/flock v0.8.1 - github.com/golang/protobuf v1.5.3 - github.com/google/uuid v1.3.0 + github.com/golang/protobuf v1.5.4 + github.com/google/uuid v1.6.0 github.com/gosuri/uitable v0.0.4 github.com/hashicorp/go-multierror v1.1.1 github.com/hashicorp/go-version v1.6.0 @@ -24,14 +26,19 @@ require ( github.com/samber/lo v1.38.1 github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966 github.com/spf13/cobra v1.6.0 - github.com/stretchr/testify v1.8.1 + github.com/stretchr/testify v1.9.0 + go.opentelemetry.io/otel v1.27.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.27.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.27.0 + go.opentelemetry.io/otel/metric v1.27.0 + go.opentelemetry.io/otel/sdk/metric v1.27.0 go.uber.org/zap v1.24.0 golang.org/x/crypto v0.23.0 golang.org/x/exp v0.0.0-20230321023759-10a507213a29 - golang.org/x/net v0.23.0 + golang.org/x/net v0.25.0 golang.org/x/term v0.20.0 - google.golang.org/grpc v1.56.3 - google.golang.org/protobuf v1.30.0 + google.golang.org/grpc v1.64.0 + google.golang.org/protobuf v1.34.1 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/yaml.v3 v3.0.1 howett.net/plist v1.0.0 @@ -45,6 +52,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.2.0 // indirect github.com/bytedance/sonic v1.8.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash v1.1.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect @@ -53,6 +61,8 @@ require ( github.com/dgraph-io/ristretto v0.1.1 // 
indirect github.com/fatih/color v1.13.0 // indirect github.com/gin-contrib/sse v0.1.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/analysis v0.21.2 // indirect github.com/go-openapi/errors v0.20.2 // indirect github.com/go-openapi/jsonpointer v0.19.5 // indirect @@ -68,10 +78,11 @@ require ( github.com/go-stack/stack v1.8.1 // indirect github.com/goccy/go-json v0.10.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/glog v1.1.0 // indirect + github.com/golang/glog v1.2.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.3 // indirect github.com/google/flatbuffers v1.12.1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect github.com/hashicorp/errwrap v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -99,6 +110,9 @@ require ( github.com/ugorji/go/codec v1.2.9 // indirect go.mongodb.org/mongo-driver v1.8.3 // indirect go.opencensus.io v0.22.5 // indirect + go.opentelemetry.io/otel/sdk v1.27.0 // indirect + go.opentelemetry.io/otel/trace v1.27.0 // indirect + go.opentelemetry.io/proto/otlp v1.2.0 // indirect go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect diff --git a/go.sum b/go.sum index a7f8b54..6cff380 100644 --- a/go.sum +++ b/go.sum @@ -53,6 +53,7 @@ github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevB github.com/avast/retry-go/v4 v4.3.3 h1:G56Bp6mU0b5HE1SkaoVjscZjlQb0oy4mezwY/cGH19w= github.com/avast/retry-go/v4 v4.3.3/go.mod h1:rg6XFaiuFYII0Xu3RDbZQkxCofFwruZKW8oEF1jpWiU= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= +github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks 
v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -62,6 +63,8 @@ github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edY github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM= github.com/bytedance/sonic v1.8.0 h1:ea0Xadu+sHlu7x5O3gKhRpQ1IKiMrSiHttPF0ybECuA= github.com/bytedance/sonic v1.8.0/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= @@ -122,6 +125,11 @@ github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vb github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-openapi/analysis v0.21.2 h1:hXFrOYFHUAMQdu6zwAiKKJHJQ8kqZs1ux/ru1P1wLJU= github.com/go-openapi/analysis 
v0.21.2/go.mod h1:HZwRk4RRisyG8vx2Oe6aqeSQcoxRp47Xkp3+K6q+LdY= github.com/go-openapi/errors v0.19.8/go.mod h1:cM//ZKUKyO06HSwqAelJ5NsEMMcpa6VpXe8DOa1Mi1M= @@ -151,6 +159,7 @@ github.com/go-openapi/validate v0.21.0 h1:+Wqk39yKOhfpLqNLEC0/eViCkzM5FVXVqrvt52 github.com/go-openapi/validate v0.21.0/go.mod h1:rjnrwK57VJ7A8xqfpAOEKRH8yQSGUriMu5/zuPSQ1hg= github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= @@ -202,8 +211,8 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= -github.com/golang/glog v1.1.0/go.mod h1:pfYeQZ3JWZoXTV5sFc986z3HTpwQs9At6P4ImfuP3NQ= +github.com/golang/glog v1.2.0 h1:uCdmnmatrKCgMBlM4rMuJZWOkPDqdbZPnrMXDY4gI68= +github.com/golang/glog v1.2.0/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -232,8 
+241,8 @@ github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA= github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= @@ -251,7 +260,8 @@ github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= @@ -264,14 +274,16 @@ github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hf github.com/google/pprof 
v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/gorilla/websocket v1.4.1 h1:q7AeDBpnBk8AogcD4DSag/Ukw/KV+YhzLj2bP5HvKCM= github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= @@ -317,6 +329,7 @@ github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxv github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty 
v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -409,7 +422,8 @@ github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/samber/lo v1.38.1 h1:j2XEAqXKb09Am4ebOg31SpvzUTTs6EN3VfgeLUhPdXM= @@ -447,8 +461,9 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tidwall/pretty v1.0.0 
h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= @@ -478,9 +493,26 @@ go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5 h1:dntmOdLpSpHlVqbW5Eay97DelsZHe+55D+xC6i0dDS0= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= +go.opentelemetry.io/otel v1.27.0 h1:9BZoF3yMK/O1AafMiQTVu0YDj5Ea4hPhxCs7sGva+cg= +go.opentelemetry.io/otel v1.27.0/go.mod h1:DMpAK8fzYRzs+bi3rS5REupisuqTheUlSZJ1WnZaPAQ= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.27.0 h1:bFgvUr3/O4PHj3VQcFEuYKvRZJX1SJDQ+11JXuSB3/w= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.27.0/go.mod h1:xJntEd2KL6Qdg5lwp97HMLQDVeAhrYxmzFseAMDPQ8I= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.27.0 h1:CIHWikMsN3wO+wq1Tp5VGdVRTcON+DmOJSfDjXypKOc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.27.0/go.mod h1:TNupZ6cxqyFEpLXAZW7On+mLFL0/g0TE3unIYL91xWc= +go.opentelemetry.io/otel/metric v1.27.0 h1:hvj3vdEKyeCi4YaYfNjv2NUje8FqKqUY8IlF0FxV/ik= +go.opentelemetry.io/otel/metric v1.27.0/go.mod h1:mVFgmRlhljgBiuk/MP/oKylr4hs85GZAylncepAX/ak= +go.opentelemetry.io/otel/sdk v1.27.0 h1:mlk+/Y1gLPLn84U4tI8d3GNJmGT/eXe3ZuOXN9kTWmI= +go.opentelemetry.io/otel/sdk v1.27.0/go.mod h1:Ha9vbLwJE6W86YstIywK2xFfPjbWlCuwPtMkKdz/Y4A= +go.opentelemetry.io/otel/sdk/metric v1.27.0 h1:5uGNOlpXi+Hbo/DRoI31BSb1v+OGcpv2NemcCrOL8gI= +go.opentelemetry.io/otel/sdk/metric v1.27.0/go.mod h1:we7jJVrYN2kh3mVBlswtPU22K0SA+769l93J6bsyvqw= +go.opentelemetry.io/otel/trace v1.27.0 h1:IqYb813p7cmbHk0a5y6pD5JPakbVfftRXABGt5/Rscw= +go.opentelemetry.io/otel/trace v1.27.0/go.mod h1:6RiD1hkAprV4/q+yd2ln1HG9GoPx39SuvvstaLBl+l4= 
+go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= +go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.1.11 h1:wy28qYRKZgnJTxGxvye5/wgWr1EKjmUDGYox5mGlRlI= +go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= @@ -562,8 +594,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= -golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -764,8 +796,8 @@ google.golang.org/grpc v1.28.1/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKa google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc 
v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/grpc v1.56.3 h1:8I4C0Yq1EjstUzUJzpcRVbuYA2mODtEmpWiQoN/b2nc= -google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -778,14 +810,15 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= -google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 
v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= diff --git a/internal/command/root.go b/internal/command/root.go index 59ff820..35c81f0 100644 --- a/internal/command/root.go +++ b/internal/command/root.go @@ -16,6 +16,7 @@ import ( "github.com/cirruslabs/orchard/internal/command/ssh" "github.com/cirruslabs/orchard/internal/command/vnc" "github.com/cirruslabs/orchard/internal/command/worker" + "github.com/cirruslabs/orchard/internal/opentelemetry" "github.com/cirruslabs/orchard/internal/version" "github.com/spf13/cobra" ) @@ -26,6 +27,14 @@ func NewRootCmd() *cobra.Command { SilenceUsage: true, SilenceErrors: true, Version: version.FullVersion, + PersistentPreRunE: func(cmd *cobra.Command, _ []string) error { + // Configure OpenTelemetry + if err := opentelemetry.Configure(cmd.Context()); err != nil { + return err + } + + return nil + }, } addGroupedCommands(command, "Working With Resources:", diff --git a/internal/controller/controller.go b/internal/controller/controller.go index ed7a7ef..1c19567 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -12,8 +12,12 @@ import ( storepkg "github.com/cirruslabs/orchard/internal/controller/store" "github.com/cirruslabs/orchard/internal/controller/store/badger" "github.com/cirruslabs/orchard/internal/netconstants" + "github.com/cirruslabs/orchard/internal/opentelemetry" v1 "github.com/cirruslabs/orchard/pkg/resource/v1" "github.com/cirruslabs/orchard/rpc" + "github.com/samber/lo" + 
"go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "golang.org/x/crypto/ssh" "golang.org/x/net/http2" @@ -111,8 +115,11 @@ func New(opts ...Option) (*Controller, error) { controller.workerNotifier = notifier.NewNotifier(controller.logger.With("component", "rpc")) // Instantiate the scheduler - controller.scheduler = scheduler.NewScheduler(store, controller.workerNotifier, + controller.scheduler, err = scheduler.NewScheduler(store, controller.workerNotifier, controller.workerOfflineTimeout, controller.logger) + if err != nil { + return nil, err + } // Instantiate the SSH server (if configured) if controller.sshListenAddr != "" && controller.sshSigner != nil { @@ -169,6 +176,11 @@ func New(opts ...Option) (*Controller, error) { ErrInitFailed, err) } + // Metrics + if err := controller.initializeMetrics(); err != nil { + return nil, err + } + return controller, nil } @@ -239,3 +251,123 @@ func (controller *Controller) SSHAddress() (string, bool) { return controller.sshServer.Address(), true } + +// initializeMetrics registers the controller's observable OpenTelemetry gauges: VM counts per +// worker and status, worker counts per online/offline status, and used/available resources +// per worker. Every callback reads its data through controller.store.View. +// +//nolint:gocognit // looks OK for now +func (controller *Controller) initializeMetrics() error { + _, err := opentelemetry.DefaultMeter.Int64ObservableGauge("org.cirruslabs.orchard.controller.vm_status", + metric.WithInt64Callback(func(ctx context.Context, observer metric.Int64Observer) error { + return controller.store.View(func(txn storepkg.Transaction) error { + vms, err := txn.ListVMs() + if err != nil { + return err + } + + type Key struct { + Worker string + Status v1.VMStatus + } + + groups := lo.CountValuesBy(vms, func(vm v1.VM) Key { + return Key{ + Worker: vm.Worker, + Status: vm.Status, + } + }) + + for key, count := range groups { + observer.Observe(int64(count), metric.WithAttributes( + attribute.String("worker", key.Worker), + attribute.String("status", key.Status.String()), + )) + } + + return nil + }) + }), + ) + if err != nil { + return err + } + + _, err = 
opentelemetry.DefaultMeter.Int64ObservableGauge("org.cirruslabs.orchard.controller.worker_status", + metric.WithInt64Callback(func(ctx context.Context, observer metric.Int64Observer) error { + return controller.store.View(func(txn storepkg.Transaction) error { + workers, err := txn.ListWorkers() + if err != nil { + return err + } + + groups := lo.CountValuesBy(workers, func(worker v1.Worker) string { + // Use the configured offline timeout so that this gauge agrees with the scheduler + if worker.Offline(controller.workerOfflineTimeout) { + return "offline" + } + + return "online" + }) + + for status, count := range groups { + observer.Observe(int64(count), metric.WithAttributes( + attribute.String("status", status), + )) + } + + return nil + }) + }), + ) + if err != nil { + return err + } + + _, err = opentelemetry.DefaultMeter.Int64ObservableGauge("org.cirruslabs.orchard.controller.worker_resource", + metric.WithInt64Callback(func(ctx context.Context, observer metric.Int64Observer) error { + return controller.store.View(func(txn storepkg.Transaction) error { + workers, err := txn.ListWorkers() + if err != nil { + return err + } + + vms, err := txn.ListVMs() + if err != nil { + return err + } + + _, workerToResources := scheduler.ProcessVMs(vms) + + for _, worker := range workers { + resourcesUsed := workerToResources.Get(worker.Name) + + for key, value := range resourcesUsed { + observer.Observe(int64(value), metric.WithAttributes( + attribute.String("worker", worker.Name), + attribute.String("resource", key), + attribute.String("type", "used"), + )) + } + + resourcesAvailable := worker.Resources.Subtracted(resourcesUsed) + + for key, value := range resourcesAvailable { + observer.Observe(int64(value), metric.WithAttributes( + attribute.String("worker", worker.Name), + attribute.String("resource", key), + attribute.String("type", "available"), + )) + } + } + + return nil + }) + }), + ) + if err != nil { + return err + } + + return nil +} diff --git a/internal/controller/scheduler/scheduler.go b/internal/controller/scheduler/scheduler.go index dbcf7a8..cb97752 100644 
--- a/internal/controller/scheduler/scheduler.go +++ b/internal/controller/scheduler/scheduler.go @@ -4,10 +4,12 @@ import ( "context" "github.com/cirruslabs/orchard/internal/controller/notifier" storepkg "github.com/cirruslabs/orchard/internal/controller/store" + "github.com/cirruslabs/orchard/internal/opentelemetry" "github.com/cirruslabs/orchard/pkg/resource/v1" "github.com/cirruslabs/orchard/rpc" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "sort" "time" @@ -37,6 +39,8 @@ type Scheduler struct { workerOfflineTimeout time.Duration logger *zap.SugaredLogger schedulingRequested chan bool + + schedulingTimeHistogram metric.Float64Histogram } func NewScheduler( @@ -44,14 +48,25 @@ func NewScheduler( notifier *notifier.Notifier, workerOfflineTimeout time.Duration, logger *zap.SugaredLogger, -) *Scheduler { - return &Scheduler{ +) (*Scheduler, error) { + scheduler := &Scheduler{ store: store, notifier: notifier, workerOfflineTimeout: workerOfflineTimeout, logger: logger, schedulingRequested: make(chan bool, 1), } + + // Metrics + var err error + + scheduler.schedulingTimeHistogram, err = opentelemetry.DefaultMeter. 
+ Float64Histogram("org.cirruslabs.orchard.controller.scheduling_time") + if err != nil { + return nil, err + } + + return scheduler, nil } func (scheduler *Scheduler) Run() { @@ -103,7 +118,7 @@ func (scheduler *Scheduler) schedulingLoopIteration() error { if err != nil { return err } - unscheduledVMs, workerToResources := processVMs(vms) + unscheduledVMs, workerToResources := ProcessVMs(vms) workers, err := txn.ListWorkers() if err != nil { @@ -119,6 +134,10 @@ func (scheduler *Scheduler) schedulingLoopIteration() error { if resourcesRemaining.CanFit(unscheduledVM.Resources) && !worker.Offline(scheduler.workerOfflineTimeout) && !worker.SchedulingPaused { + // Metrics + scheduler.schedulingTimeHistogram.Record(context.Background(), + time.Since(unscheduledVM.CreatedAt).Seconds()) + unscheduledVM.Worker = worker.Name if err := txn.SetVM(unscheduledVM); err != nil { @@ -148,7 +167,7 @@ func (scheduler *Scheduler) schedulingLoopIteration() error { return err } -func processVMs(vms []v1.VM) ([]v1.VM, WorkerToResources) { +func ProcessVMs(vms []v1.VM) ([]v1.VM, WorkerToResources) { var unscheduledVMs []v1.VM workerToResources := make(WorkerToResources) diff --git a/internal/opentelemetry/opentelemetry.go b/internal/opentelemetry/opentelemetry.go new file mode 100644 index 0000000..520e0d8 --- /dev/null +++ b/internal/opentelemetry/opentelemetry.go @@ -0,0 +1,60 @@ +package opentelemetry + +import ( + "context" + "errors" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/sdk/metric" + "os" +) + +var ( + // DefaultMeter is the meter that Orchard components create their instruments from + DefaultMeter = otel.Meter("") +) + +// Configure silences the global OpenTelemetry error handler, defaults +// OTEL_EXPORTER_OTLP_ENDPOINT to http://localhost:4318 when it is unset and then +// installs the global meter provider. +func Configure(ctx context.Context) error { + // Avoid logging errors when local OpenTelemetry Collector is not available, for example: + // "failed to upload metrics: [...]: dial tcp 127.0.0.1:4318: connect: connection refused" + otel.SetErrorHandler(otel.ErrorHandlerFunc(func(err error) { + // do 
nothing + })) + + // Work around https://github.com/open-telemetry/opentelemetry-go/issues/4834 + if _, ok := os.LookupEnv("OTEL_EXPORTER_OTLP_ENDPOINT"); !ok { + if err := os.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318"); err != nil { + return err + } + } + + return setupMeterProvider(ctx) +} + +// setupMeterProvider creates the OTLP HTTP and gRPC metric exporters and installs a global +// meter provider that reads into both of them periodically. +func setupMeterProvider(ctx context.Context) error { + httpExporter, err := otlpmetrichttp.New(ctx) + if err != nil { + return err + } + + grpcExporter, err := otlpmetricgrpc.New(ctx) + if err != nil { + // Release the already-created HTTP exporter instead of leaking it + return errors.Join(err, httpExporter.Shutdown(ctx)) + } + + meterProvider := metric.NewMeterProvider( + metric.WithReader(metric.NewPeriodicReader(httpExporter)), + metric.WithReader(metric.NewPeriodicReader(grpcExporter)), + ) + + otel.SetMeterProvider(meterProvider) + + return nil +} diff --git a/internal/worker/rpc.go b/internal/worker/rpc.go index 71d7cea..7da5a86 100644 --- a/internal/worker/rpc.go +++ b/internal/worker/rpc.go @@ -23,7 +23,7 @@ import ( func (worker *Worker) watchRPC(ctx context.Context) error { worker.logger.Infof("connecting to %s over gRPC", worker.client.GRPCTarget()) - conn, err := grpc.Dial(worker.client.GRPCTarget(), + conn, err := grpc.NewClient(worker.client.GRPCTarget(), grpc.WithTransportCredentials(worker.client.GRPCTransportCredentials()), grpc.WithKeepaliveParams(keepalive.ClientParameters{ Time: 30 * time.Second, diff --git a/internal/worker/vmmanager/vm.go b/internal/worker/vmmanager/vm.go index 0b933e5..602390a 100644 --- a/internal/worker/vmmanager/vm.go +++ b/internal/worker/vmmanager/vm.go @@ -10,6 +10,8 @@ import ( "github.com/cirruslabs/orchard/internal/worker/tart" "github.com/cirruslabs/orchard/pkg/client" "github.com/cirruslabs/orchard/pkg/resource/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "golang.org/x/crypto/ssh" "io" @@ -53,6 +55,7 @@ type VM struct { func NewVM( vmResource v1.VM, eventStreamer *client.EventStreamer, + vmPullTimeHistogram 
metric.Float64Histogram, logger *zap.SugaredLogger, ) *VM { vmContext, vmContextCancel := context.WithCancel(context.Background()) @@ -80,6 +83,8 @@ func NewVM( if vmResource.ImagePullPolicy == v1.ImagePullPolicyAlways { vm.logger.Debugf("pulling VM") + pullStartedAt := time.Now() + _, _, err := tart.Tart(vm.ctx, vm.logger, "pull", vm.Resource.Image) if err != nil { select { @@ -91,6 +96,11 @@ func NewVM( return } + + vmPullTimeHistogram.Record(vm.ctx, time.Since(pullStartedAt).Seconds(), metric.WithAttributes( + attribute.String("worker", vm.Resource.Worker), + attribute.String("image", vm.Resource.Image), + )) } vm.logger.Debugf("creating VM") diff --git a/internal/worker/worker.go b/internal/worker/worker.go index 314c509..2f4599c 100644 --- a/internal/worker/worker.go +++ b/internal/worker/worker.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "github.com/avast/retry-go/v4" + "github.com/cirruslabs/orchard/internal/opentelemetry" "github.com/cirruslabs/orchard/internal/worker/iokitregistry" "github.com/cirruslabs/orchard/internal/worker/ondiskname" "github.com/cirruslabs/orchard/internal/worker/tart" @@ -13,6 +14,7 @@ import ( v1 "github.com/cirruslabs/orchard/pkg/resource/v1" "github.com/cirruslabs/orchard/rpc" "github.com/hashicorp/go-multierror" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "google.golang.org/grpc/metadata" "os" @@ -30,7 +32,10 @@ type Worker struct { client *client.Client pollTicker *time.Ticker resources v1.Resources - logger *zap.SugaredLogger + + vmPullTimeHistogram metric.Float64Histogram + + logger *zap.SugaredLogger } func New(client *client.Client, opts ...Option) (*Worker, error) { @@ -61,6 +66,16 @@ func New(client *client.Client, opts ...Option) (*Worker, error) { } worker.resources = defaultResources.Merged(worker.resources) + // Worker, VMs and images-related metrics + var err error + + worker.vmPullTimeHistogram, err = opentelemetry.DefaultMeter.Float64Histogram( + "org.cirruslabs.orchard.worker.vm.pull_time", + ) + if err 
!= nil { + return nil, err + } + if worker.logger == nil { worker.logger = zap.NewNop().Sugar() } @@ -353,7 +368,7 @@ func (worker *Worker) deleteVM(vm *vmmanager.VM) error { func (worker *Worker) createVM(odn ondiskname.OnDiskName, vmResource v1.VM) { eventStreamer := worker.client.VMs().StreamEvents(vmResource.Name) - vm := vmmanager.NewVM(vmResource, eventStreamer, worker.logger) + vm := vmmanager.NewVM(vmResource, eventStreamer, worker.vmPullTimeHistogram, worker.logger) worker.vmm.Put(odn, vm) }