From c60d08bf432df73915835976a0ea0f9030a0d8df Mon Sep 17 00:00:00 2001
From: Fedor Korotkov
Date: Mon, 15 Sep 2025 07:07:47 -0400
Subject: [PATCH] feat(telemetry): migrate from Sentry to OpenTelemetry

- Initialize OTEL only when OTEL_EXPORTER_OTLP_ENDPOINT is set
- Support both OTLP/gRPC (host:port) and OTLP/HTTP (http[s]://...)
- Map CIRRUS_SENTRY_TAGS to OTEL resource attributes
- Replace Sentry transactions/events with OTEL spans + attributes
- Add Telemetry wrapper and update code to record errors + flush
- Remove sentry-cocoa dependency; add opentelemetry-swift + NIO

This preserves existing tags and measurements under OTEL and keeps
telemetry disabled by default.
---
Review notes (Sources/tart/Telemetry.swift only, everything else is as submitted):
- never crash on a malformed OTEL_EXPORTER_OTLP_ENDPOINT, telemetry is best-effort
- re-bind the enclosing span when a nested transaction (pull -> prune) finishes
- end the still-bound span in flush() so that errors recorded on it get exported
- emit events that arrive while no span is bound instead of dropping them

 Package.resolved                   |  47 +++++--
 Package.swift                      |   9 +-
 Sources/tart/Commands/IP.swift     |   1 -
 Sources/tart/Commands/Prune.swift  |  37 +++---
 Sources/tart/Commands/Run.swift    |   8 +-
 Sources/tart/Root.swift            |  43 ++-----
 Sources/tart/Telemetry.swift       | 220 +++++++++++++++++++++++++++++
 Sources/tart/VMDirectory+OCI.swift |   4 +-
 Sources/tart/VMStorageOCI.swift    |  19 +--
 9 files changed, 307 insertions(+), 81 deletions(-)
 create mode 100644 Sources/tart/Telemetry.swift

diff --git a/Package.resolved b/Package.resolved
index 0b9655d..c8adc48 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -1,5 +1,5 @@
 {
-  "originHash" : "668bad809d4882f75f097e66a12a6dbc8e61ec998f1800a7e09439c854fadda1",
+  "originHash" : "aa0a5df26b9e35d1908d6876d045af7ce1899086d641507e5faa9d1f9bd29787",
   "pins" : [
     {
       "identity" : "antlr4",
@@ -46,6 +46,24 @@
         "version" : "1.24.2"
       }
     },
+    {
+      "identity" : "opentelemetry-swift",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/open-telemetry/opentelemetry-swift",
+      "state" : {
+        "revision" : "6a2c29d53ff0b543b551b2221538bd3d0206c6d6",
+        "version" : "1.15.0"
+      }
+    },
+    {
+      "identity" : "opentracing-objc",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/undefinedlabs/opentracing-objc",
+      "state" : {
+        "revision" : "18c1a35ca966236cee0c5a714a51a73ff33384c1",
+        "version" : "0.5.2"
+      }
+    },
     {
       "identity" : "semaphore",
       "kind" : "remoteSourceControl",
@@ -55,15 +73,6 @@
         "version" : "0.1.0"
       }
     },
-    {
-      "identity" : "sentry-cocoa",
-      "kind" : "remoteSourceControl",
-      "location" : "https://github.com/getsentry/sentry-cocoa",
-      "state" : {
-        "revision" : "65b3d2a7608685e8d4a37c68fa2c64f28d0b537e",
-        "version" : "8.51.1"
-      }
-    },
     {
       "identity" : "swift-algorithms",
       "kind" : "remoteSourceControl",
@@ -127,6 +136,15 @@
         "version" : "1.6.1"
       }
     },
+    {
+      "identity" : "swift-metrics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-metrics.git",
+      "state" : {
+        "revision" : "4c83e1cdf4ba538ef6e43a9bbd0bcc33a0ca46e3",
+        "version" : "2.7.0"
+      }
+    },
     {
       "identity" : "swift-nio",
       "kind" : "remoteSourceControl",
@@ -261,6 +279,15 @@
         "branch" : "master",
         "revision" : "e03289289155b4e7aa565e32862f9cb42140596a"
       }
+    },
+    {
+      "identity" : "thrift-swift",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/undefinedlabs/Thrift-Swift",
+      "state" : {
+        "revision" : "18ff09e6b30e589ed38f90a1af23e193b8ecef8e",
+        "version" : "1.1.2"
+      }
     }
   ],
   "version" : 3
diff --git a/Package.swift b/Package.swift
index 69ce3cf..0d39a5b 100644
--- a/Package.swift
+++ b/Package.swift
@@ -17,7 +17,7 @@ let package = Package(
     .package(url: "https://github.com/antlr/antlr4", exact: "4.13.2"),
     .package(url: "https://github.com/apple/swift-atomics.git", .upToNextMajor(from: "1.2.0")),
     .package(url: "https://github.com/nicklockwood/SwiftFormat", from: "0.53.6"),
-    .package(url: "https://github.com/getsentry/sentry-cocoa", from: "8.51.1"),
+    .package(url: "https://github.com/open-telemetry/opentelemetry-swift", from: "1.7.0"),
     .package(url: "https://github.com/cfilipov/TextTable", branch: "master"),
     .package(url: "https://github.com/sersoft-gmbh/swift-sysctl.git", from: "1.8.0"),
     .package(url: "https://github.com/orchetect/SwiftRadix", from: "1.3.1"),
@@ -25,6 +25,7 @@ let package = Package(
     .package(url: "https://github.com/fumoboy007/swift-retry", from: "0.2.3"),
     .package(url: "https://github.com/jozefizso/swift-xattr", from: "3.0.0"),
     .package(url: "https://github.com/grpc/grpc-swift.git", .upToNextMajor(from: "1.24.2")),
+    .package(url: "https://github.com/apple/swift-nio.git", from: "2.83.0"),
     .package(url: "https://buf.build/gen/swift/git/1.24.2-00000000000000-17d7dedafb88.1/cirruslabs_tart-guest-agent_grpc_swift.git", revision: "1.24.2-00000000000000-17d7dedafb88.1"),
   ],
   targets: [
@@ -35,7 +36,11 @@ let package = Package(
       .product(name: "SwiftDate", package: "SwiftDate"),
       .product(name: "Antlr4Static", package: "Antlr4"),
       .product(name: "Atomics", package: "swift-atomics"),
-      .product(name: "Sentry", package: "sentry-cocoa"),
+      .product(name: "OpenTelemetryApi", package: "opentelemetry-swift"),
+      .product(name: "OpenTelemetrySdk", package: "opentelemetry-swift"),
+      .product(name: "OpenTelemetryProtocolExporter", package: "opentelemetry-swift"),
+      .product(name: "OpenTelemetryProtocolExporterHTTP", package: "opentelemetry-swift"),
+      .product(name: "NIO", package: "swift-nio"),
       .product(name: "TextTable", package: "TextTable"),
       .product(name: "Sysctl", package: "swift-sysctl"),
       .product(name: "SwiftRadix", package: "SwiftRadix"),
diff --git a/Sources/tart/Commands/IP.swift b/Sources/tart/Commands/IP.swift
index 51e74ab..ab142a2 100644
--- a/Sources/tart/Commands/IP.swift
+++ b/Sources/tart/Commands/IP.swift
@@ -2,7 +2,6 @@ import ArgumentParser
 import Foundation
 import Network
 import SystemConfiguration
-import Sentry
 
 enum IPResolutionStrategy: String, ExpressibleByArgument, CaseIterable {
   case dhcp, arp, agent
diff --git a/Sources/tart/Commands/Prune.swift b/Sources/tart/Commands/Prune.swift
index 5e41e03..fb18530 100644
--- a/Sources/tart/Commands/Prune.swift
+++ b/Sources/tart/Commands/Prune.swift
@@ -1,6 +1,6 @@
 import ArgumentParser
 import Dispatch
-import Sentry
+import OpenTelemetryApi
 import SwiftUI
 import SwiftDate
 
@@ -109,9 +109,10 @@ struct Prune: AsyncParsableCommand {
       return
     }
 
-    SentrySDK.configureScope { scope in
-      scope.setContext(value: ["requiredBytes": requiredBytes], key: "Prune")
-    }
+    // Record desired reclaim size as an event context
+    Telemetry.addEvent("Prune.required", attributes: [
+      "requiredBytes": .int(Int(requiredBytes))
+    ])
 
     // Figure out how much disk space is available
     let attrs = try Config().tartCacheDir.resourceValues(forKeys: [
@@ -123,18 +124,17 @@ struct Prune: AsyncParsableCommand {
       UInt64(attrs.volumeAvailableCapacityForImportantUsage!)
     )
 
-    SentrySDK.configureScope { scope in
-      scope.setContext(value: [
-        "volumeAvailableCapacity": attrs.volumeAvailableCapacity!,
-        "volumeAvailableCapacityForImportantUsage": attrs.volumeAvailableCapacityForImportantUsage!,
-        "volumeAvailableCapacityCalculated": volumeAvailableCapacityCalculated
-      ], key: "Prune")
-    }
+    Telemetry.addEvent("Prune.capacity", attributes: [
+      "volumeAvailableCapacity": .int(Int(attrs.volumeAvailableCapacity!)),
+      "volumeAvailableCapacityForImportantUsage": .int(Int(attrs.volumeAvailableCapacityForImportantUsage!)),
+      "volumeAvailableCapacityCalculated": .int(Int(volumeAvailableCapacityCalculated))
+    ])
 
     if volumeAvailableCapacityCalculated <= 0 {
-      SentrySDK.capture(message: "Zero volume capacity reported") { scope in
-        scope.setLevel(.warning)
-      }
+      Telemetry.addEvent("Prune.warning", attributes: [
+        "message": .string("Zero volume capacity reported"),
+        "level": .string("warning")
+      ])
       return
     }
 
@@ -149,7 +149,7 @@ struct Prune: AsyncParsableCommand {
   }
 
   private static func reclaimIfPossible(_ reclaimBytes: UInt64, _ initiator: Prunable? = nil) throws {
-    let transaction = SentrySDK.startTransaction(name: "Pruning cache", operation: "prune", bindToScope: true)
+    let transaction = Telemetry.startTransaction(name: "Pruning cache", operation: "prune", bindToScope: true)
     defer { transaction.finish() }
 
     let prunableStorages: [PrunableStorage] = [VMStorageOCI(), try IPSWCache()]
@@ -177,13 +177,16 @@ struct Prune: AsyncParsableCommand {
         continue
       }
 
-      try SentrySDK.span?.setData(value: prunable.allocatedSizeBytes(), key: prunable.url.path)
+      Telemetry.addEvent("Prune.prunable", attributes: [
+        "path": .string(prunable.url.path),
+        "size_bytes": .int(try prunable.allocatedSizeBytes())
+      ])
 
       cacheReclaimedBytes += try prunable.allocatedSizeBytes()
 
       try prunable.delete()
     }
 
-    SentrySDK.span?.setMeasurement(name: "gc_disk_reclaimed", value: cacheReclaimedBytes as NSNumber, unit: MeasurementUnitInformation.byte);
+    Telemetry.setAttribute("gc_disk_reclaimed", .int(cacheReclaimedBytes))
   }
 }
diff --git a/Sources/tart/Commands/Run.swift b/Sources/tart/Commands/Run.swift
index fd7a851..24442eb 100644
--- a/Sources/tart/Commands/Run.swift
+++ b/Sources/tart/Commands/Run.swift
@@ -4,7 +4,7 @@ import Darwin
 import Dispatch
 import SwiftUI
 import Virtualization
-import Sentry
+import OpenTelemetryApi
 import System
 
 var vm: VM?
@@ -498,9 +498,9 @@ struct Run: AsyncParsableCommand {
 
         Foundation.exit(0)
       } catch {
-        // Capture the error into Sentry
-        SentrySDK.capture(error: error)
-        SentrySDK.flush(timeout: 2.seconds.timeInterval)
+        // Record the error into OpenTelemetry
+        Telemetry.recordError(error)
+        Telemetry.flush()
 
         fputs("\(error)\n", stderr)
 
diff --git a/Sources/tart/Root.swift b/Sources/tart/Root.swift
index 43a5721..85bba58 100644
--- a/Sources/tart/Root.swift
+++ b/Sources/tart/Root.swift
@@ -1,7 +1,7 @@
 import ArgumentParser
 import Darwin
 import Foundation
-import Sentry
+import OpenTelemetryApi
 
 @main
 struct Root: AsyncParsableCommand {
@@ -54,38 +54,9 @@ struct Root: AsyncParsableCommand {
       // Parse command
       var command = try parseAsRoot()
 
-      // Initialize Sentry
-      if let dsn = ProcessInfo.processInfo.environment["SENTRY_DSN"] {
-        SentrySDK.start { options in
-          options.dsn = dsn
-          options.releaseName = CI.release
-          options.tracesSampleRate = Float(
-            ProcessInfo.processInfo.environment["SENTRY_TRACES_SAMPLE_RATE"] ?? "1.0"
-          ) as NSNumber?
-
-          // By default only 5XX are captured
-          // Let's capture everything but 401 (unauthorized)
-          options.enableCaptureFailedRequests = true
-          options.failedRequestStatusCodes = [
-            HttpStatusCodeRange(min: 400, max: 400),
-            HttpStatusCodeRange(min: 402, max: 599)
-          ]
-        }
-      }
-      defer { SentrySDK.flush(timeout: 2.seconds.timeInterval) }
-
-      SentrySDK.configureScope { scope in
-        scope.setExtra(value: ProcessInfo.processInfo.arguments, key: "Command-line arguments")
-      }
-
-      // Enrich future events with Cirrus CI-specific tags
-      if let tags = ProcessInfo.processInfo.environment["CIRRUS_SENTRY_TAGS"] {
-        SentrySDK.configureScope { scope in
-          for (key, value) in tags.split(separator: ",").compactMap({ parseCirrusSentryTag($0) }) {
-            scope.setTag(value: value, key: key)
-          }
-        }
-      }
+      // Initialize OpenTelemetry if configured
+      Telemetry.bootstrapFromEnv()
+      defer { Telemetry.flush() }
 
       // Run garbage-collection before each command (shouldn't take too long)
       if type(of: command) != type(of: Pull()) && type(of: command) != type(of: Clone()){
@@ -108,9 +79,9 @@ struct Root: AsyncParsableCommand {
         Foundation.exit(execCustomExitCodeError.exitCode)
       }
 
-      // Capture the error into Sentry
-      SentrySDK.capture(error: error)
-      SentrySDK.flush(timeout: 2.seconds.timeInterval)
+      // Record the error into OpenTelemetry
+      Telemetry.recordError(error)
+      Telemetry.flush()
 
       // Handle a non-ArgumentParser's exception that requires a specific exit code to be set
       if let errorWithExitCode = error as? HasExitCode {
diff --git a/Sources/tart/Telemetry.swift b/Sources/tart/Telemetry.swift
new file mode 100644
index 0000000..f1137ef
--- /dev/null
+++ b/Sources/tart/Telemetry.swift
@@ -0,0 +1,220 @@
+import Foundation
+
+#if canImport(OpenTelemetryApi)
+import OpenTelemetryApi
+import OpenTelemetrySdk
+import OpenTelemetryProtocolExporterCommon
+import OpenTelemetryProtocolExporterGrpc
+import OpenTelemetryProtocolExporterHttp
+import GRPC
+import NIO
+#endif
+
+enum TelemetrySpanStatus {
+  case cancelled
+}
+
+// Handle for a span started by Telemetry.startTransaction()
+final class TelemetrySpan {
+  #if canImport(OpenTelemetryApi)
+  private let span: Span?
+  // Span that was bound before this one took over, re-bound by finish()
+  private let previous: Span?
+  #else
+  private let span: Any? = nil
+  #endif
+
+  init(_ span: Any?, previous: Any? = nil) {
+    #if canImport(OpenTelemetryApi)
+    self.span = span as? Span
+    self.previous = previous as? Span
+    #endif
+  }
+
+  func finish(status: TelemetrySpanStatus? = nil) {
+    #if canImport(OpenTelemetryApi)
+    guard let span = span else {
+      return
+    }
+    if let status = status {
+      switch status {
+      case .cancelled:
+        span.status = .error(description: "cancelled")
+      }
+    }
+    span.end()
+    // Hand the scope back to the enclosing transaction (e.g. pull -> prune)
+    // so that attributes set afterwards are not silently dropped
+    if Telemetry.currentSpan === span {
+      Telemetry.currentSpan = previous
+    }
+    #endif
+  }
+}
+
+enum Telemetry {
+  #if canImport(OpenTelemetryApi)
+  static var tracer: Tracer = OpenTelemetry.instance.tracerProvider.get(instrumentationName: "tart", instrumentationVersion: CI.version)
+  static var currentSpan: Span?
+  private static var eventLoopGroup: EventLoopGroup?
+  private static var providerSdk: TracerProviderSdk?
+  #else
+  static var currentSpan: Any?
+  #endif
+
+  // Configure OpenTelemetry when OTEL_EXPORTER_OTLP_ENDPOINT is set.
+  // Telemetry is best-effort: a malformed endpoint disables it instead of crashing.
+  static func bootstrapFromEnv() {
+    guard let endpoint = ProcessInfo.processInfo.environment["OTEL_EXPORTER_OTLP_ENDPOINT"], !endpoint.isEmpty else {
+      return
+    }
+
+    #if canImport(OpenTelemetryApi)
+    let resource = buildResource()
+
+    // Build exporter configuration
+    let headerList = parseHeaders(ProcessInfo.processInfo.environment["OTEL_EXPORTER_OTLP_HEADERS"]) // [(k,v)]
+    let config = OtlpConfiguration(timeout: 10, headers: headerList, exportAsJson: false)
+
+    // Build exporter based on endpoint scheme
+    let exporter: SpanExporter
+    let scheme = endpoint.lowercased()
+    if scheme.hasPrefix("http://") || scheme.hasPrefix("https://") {
+      guard let url = URL(string: endpoint) else {
+        fputs("ignoring malformed OTEL_EXPORTER_OTLP_ENDPOINT: \(endpoint)\n", stderr)
+        return
+      }
+      exporter = OtlpHttpTraceExporter(endpoint: url, config: config, envVarHeaders: nil)
+    } else {
+      // gRPC: parse host[:port]
+      let parts = endpoint.split(separator: ":", maxSplits: 1, omittingEmptySubsequences: true)
+      guard let hostPart = parts.first else {
+        fputs("ignoring malformed OTEL_EXPORTER_OTLP_ENDPOINT: \(endpoint)\n", stderr)
+        return
+      }
+      let host = String(hostPart)
+      let port = parts.count > 1 ? Int(parts[1]) ?? 4317 : 4317
+      let group = MultiThreadedEventLoopGroup(numberOfThreads: 1)
+      eventLoopGroup = group
+      let channel = ClientConnection.insecure(group: group).connect(host: host, port: port)
+      exporter = OtlpTraceExporter(channel: channel, config: config, envVarHeaders: nil)
+    }
+
+    let spanProcessor = BatchSpanProcessor(spanExporter: exporter)
+    let provider = TracerProviderBuilder()
+      .add(spanProcessor: spanProcessor)
+      .with(resource: resource)
+      .build()
+
+    providerSdk = provider
+    OpenTelemetry.registerTracerProvider(tracerProvider: provider)
+    tracer = OpenTelemetry.instance.tracerProvider.get(instrumentationName: "tart", instrumentationVersion: CI.version)
+    #endif
+  }
+
+  // Flush spans quickly on shutdown
+  static func flush() {
+    #if canImport(OpenTelemetryApi)
+    // A span that was never ended is never exported: end the bound one, if any,
+    // so that errors recorded on it by recordError() are not lost
+    currentSpan?.end()
+    currentSpan = nil
+    providerSdk?.forceFlush(timeout: 5)
+    if let group = eventLoopGroup {
+      try? group.syncShutdownGracefully()
+      eventLoopGroup = nil
+    }
+    #endif
+  }
+
+  static func startTransaction(name: String, operation: String? = nil, bindToScope: Bool = false) -> TelemetrySpan {
+    #if canImport(OpenTelemetryApi)
+    let builder = tracer.spanBuilder(spanName: name)
+    if let op = operation {
+      builder.setSpanKind(spanKind: .internal)
+      builder.setAttribute(key: "operation", value: op)
+    }
+    let span = builder.startSpan()
+    // Remember what was bound before so that finish() can restore it
+    let previous = bindToScope ? currentSpan : nil
+    if bindToScope {
+      currentSpan = span
+    }
+    return TelemetrySpan(span, previous: previous)
+    #else
+    return TelemetrySpan(nil)
+    #endif
+  }
+
+  static func recordError(_ error: Error) {
+    #if canImport(OpenTelemetryApi)
+    let span = currentSpan ?? tracer.spanBuilder(spanName: "error").startSpan()
+    span.recordException(error)
+    span.status = .error(description: String(describing: error))
+    if currentSpan == nil {
+      span.end()
+    }
+    #endif
+  }
+
+  static func addEvent(_ name: String, attributes: [String: AttributeValue] = [:]) {
+    #if canImport(OpenTelemetryApi)
+    if let span = currentSpan {
+      span.addEvent(name: name, attributes: attributes)
+      return
+    }
+    // Nothing is bound yet: emit a short-lived span, just like recordError() does,
+    // instead of silently dropping the event
+    let span = tracer.spanBuilder(spanName: name).startSpan()
+    span.addEvent(name: name, attributes: attributes)
+    span.end()
+    #endif
+  }
+
+  static func setAttribute(_ key: String, _ value: AttributeValue) {
+    #if canImport(OpenTelemetryApi)
+    currentSpan?.setAttribute(key: key, value: value)
+    #endif
+  }
+
+  // Build a Resource with service + environment tags
+  private static func buildResource() -> Resource {
+    #if canImport(OpenTelemetryApi)
+    var attributes: [String: AttributeValue] = [
+      "service.name": .string("tart"),
+      "service.version": .string(CI.version),
+      "process.command_args": AttributeValue(ProcessInfo.processInfo.arguments)
+    ]
+
+    // Migrate Sentry tags to resource attributes if present
+    if let tags = ProcessInfo.processInfo.environment["CIRRUS_SENTRY_TAGS"] {
+      for (k, v) in parseTags(tags) {
+        attributes[k] = .string(v)
+      }
+    }
+
+    return Resource(attributes: attributes)
+    #else
+    return Resource()
+    #endif
+  }
+
+  private static func parseTags(_ raw: String) -> [(String, String)] {
+    raw.split(separator: ",").compactMap { pair in
+      let parts = pair.split(separator: "=", maxSplits: 1)
+      guard parts.count == 2 else { return nil }
+      return (String(parts[0]), String(parts[1]))
+    }
+  }
+
+  private static func parseHeaders(_ raw: String?) -> [(String, String)] {
+    guard let raw = raw else { return [] }
+    var result: [(String, String)] = []
+    for part in raw.split(separator: ",") {
+      let kv = part.split(separator: "=", maxSplits: 1)
+      guard kv.count == 2 else { continue }
+      result.append((String(kv[0]), String(kv[1])))
+    }
+    return result
+  }
+}
diff --git a/Sources/tart/VMDirectory+OCI.swift b/Sources/tart/VMDirectory+OCI.swift
index 6240853..5bf161a 100644
--- a/Sources/tart/VMDirectory+OCI.swift
+++ b/Sources/tart/VMDirectory+OCI.swift
@@ -1,6 +1,6 @@
 import Compression
 import Foundation
-import Sentry
+import OpenTelemetryApi
 
 enum OCIError: Error {
   case ShouldBeExactlyOneLayer
@@ -43,7 +43,7 @@ extension VMDirectory {
     }
 
     let diskCompressedSize = layers.map { Int64($0.size) }.reduce(0, +)
-    SentrySDK.span?.setMeasurement(name: "compressed_disk_size", value: diskCompressedSize as NSNumber, unit: MeasurementUnitInformation.byte)
+    Telemetry.setAttribute("compressed_disk_size", .int(Int(diskCompressedSize)))
 
     let prettyDiskSize = String(format: "%.1f", Double(diskCompressedSize) / 1_000_000_000.0)
     defaultLogger.appendNewLine("pulling disk (\(prettyDiskSize) GB compressed)...")
diff --git a/Sources/tart/VMStorageOCI.swift b/Sources/tart/VMStorageOCI.swift
index f71c547..7cdd2e7 100644
--- a/Sources/tart/VMStorageOCI.swift
+++ b/Sources/tart/VMStorageOCI.swift
@@ -1,5 +1,5 @@
 import Foundation
-import Sentry
+import OpenTelemetryApi
 import Retry
 
 class VMStorageOCI: PrunableStorage {
@@ -141,9 +141,10 @@ class VMStorageOCI: PrunableStorage {
   }
 
   func pull(_ name: RemoteName, registry: Registry, concurrency: UInt, deduplicate: Bool) async throws {
-    SentrySDK.configureScope { scope in
-      scope.setContext(value: ["imageName": name.description], key: "OCI")
-    }
+    // Record image name for diagnostics
+    Telemetry.addEvent("OCI.pull.start", attributes: [
+      "imageName": .string(name.description)
+    ])
 
     defaultLogger.appendNewLine("pulling manifest...")
 
@@ -177,7 +178,7 @@ class VMStorageOCI: PrunableStorage {
     }
 
     if !exists(digestName) {
-      let transaction = SentrySDK.startTransaction(name: name.description, operation: "pull", bindToScope: true)
+      let transaction = Telemetry.startTransaction(name: name.description, operation: "pull", bindToScope: true)
       let tmpVMDir = try VMDirectory.temporaryDeterministic(key: name.description)
 
       // Open an existing VM directory corresponding to this name, if any,
@@ -190,9 +191,9 @@ class VMStorageOCI: PrunableStorage {
 
       // Try to reclaim some cache space if we know the VM size in advance
       if let uncompressedDiskSize = manifest.uncompressedDiskSize() {
-        SentrySDK.configureScope { scope in
-          scope.setContext(value: ["imageUncompressedDiskSize": uncompressedDiskSize], key: "OCI")
-        }
+        Telemetry.addEvent("OCI.pull.uncompressed_size", attributes: [
+          "bytes": .int(Int(uncompressedDiskSize))
+        ])
 
         let otherVMFilesSize: UInt64 = 128 * 1024 * 1024
 
@@ -227,7 +228,7 @@ class VMStorageOCI: PrunableStorage {
           try move(digestName, from: tmpVMDir)
           transaction.finish()
         }, onCancel: {
-          transaction.finish(status: SentrySpanStatus.cancelled)
+          transaction.finish(status: .cancelled)
           try? FileManager.default.removeItem(at: tmpVMDir.baseURL)
         })
       } else {