diff --git a/docs/_sidebar.md b/docs/_sidebar.md
index 09aa85a..4527911 100644
--- a/docs/_sidebar.md
+++ b/docs/_sidebar.md
@@ -4,6 +4,7 @@
 - [Configuration](/configuration)
 - [Strategies](/strategies)
 - [Themes](/themes)
+- [Tracing](/tracing)
 - [Versioning](/versioning)
 - **Providers**
   - [Overview](/providers/overview)
diff --git a/docs/tracing.md b/docs/tracing.md
new file mode 100644
index 0000000..f06a6e1
--- /dev/null
+++ b/docs/tracing.md
@@ -0,0 +1,217 @@
+# Observability with OpenTelemetry
+
+Sablier includes built-in support for OpenTelemetry, providing comprehensive observability through distributed tracing and metrics.
+
+## Configuration
+
+Enable OpenTelemetry by setting the following configuration. Sablier connects to the endpoint using OTLP over gRPC without TLS:
+
+### YAML Configuration
+
+```yaml
+tracing:
+  enabled: true
+  endpoint: localhost:4317 # OTLP gRPC endpoint
+```
+
+### Environment Variables
+
+```bash
+export TRACING_ENABLED=true
+export TRACING_ENDPOINT=localhost:4317
+```
+
+### Command-Line Flags
+
+```bash
+sablier start --tracing.enabled=true --tracing.endpoint=localhost:4317
+```
+
+## Traces
+
+Sablier automatically instruments:
+
+- **HTTP requests** - All incoming requests to the Sablier server
+- **Provider operations** - Instance start, stop, inspect, list, and group operations
+- **Session management** - Session creation and lifecycle
+
+### Trace Attributes
+
+Each trace includes relevant attributes such as:
+- `instance` - Instance name
+- `provider` - Provider type (docker, kubernetes, etc.)
+- `strategy` - Scaling strategy (dynamic, blocking)
+- `http.method` - HTTP method
+- `http.route` - HTTP route
+- `http.status_code` - HTTP response status code
+
+## Metrics
+
+Sablier exports the following metrics over OTLP to the configured endpoint:
+
+### Counters
+
+- `sablier.sessions.total` - Total number of sessions created
+  - Labels: `strategy`
+- `sablier.instances.started` - Total number of instances started
+  - Labels: `provider`
+- `sablier.instances.stopped` - Total number of instances stopped
+  - Labels: `provider`
+
+### Gauges
+
+- `sablier.sessions.active` - Number of currently active sessions
+  - Labels: `strategy`
+
+### Histograms
+
+- `sablier.requests.duration` - Request duration in milliseconds
+  - Labels: `strategy`, `status`
+
+## Using with Jaeger
+
+Example using Jaeger all-in-one:
+
+```bash
+# Start Jaeger
+docker run -d --name jaeger \
+  -p 16686:16686 \
+  -p 4317:4317 \
+  jaegertracing/all-in-one:latest
+
+# Start Sablier with tracing
+sablier start --tracing.enabled=true --tracing.endpoint=localhost:4317
+
+# View traces at http://localhost:16686
+```
+
+## Using with Prometheus + Grafana
+
+Example docker-compose setup (the `jaeger` service receives the traces forwarded by the collector):
+
+```yaml
+version: '3'
+services:
+  otel-collector:
+    image: otel/opentelemetry-collector:latest
+    command: ["--config=/etc/otel-config.yaml"]
+    volumes:
+      - ./otel-config.yaml:/etc/otel-config.yaml
+    ports:
+      - "4317:4317"
+      - "8889:8889"
+
+  jaeger:
+    image: jaegertracing/all-in-one:latest
+    ports:
+      - "16686:16686"
+
+  prometheus:
+    image: prom/prometheus:latest
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
+    ports:
+      - "9090:9090"
+
+  grafana:
+    image: grafana/grafana:latest
+    ports:
+      - "3000:3000"
+
+  sablier:
+    image: sablierapp/sablier:latest
+    environment:
+      - TRACING_ENABLED=true
+      - TRACING_ENDPOINT=otel-collector:4317
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+```
+
+Example OpenTelemetry Collector configuration (`otel-config.yaml`):
+
+```yaml
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+
+exporters:
+  prometheus:
+    endpoint: "0.0.0.0:8889"
+  otlp:
+    endpoint: jaeger:4317
+    tls:
+      insecure: true
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp]
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [prometheus]
+```
+
+Example Prometheus configuration (`prometheus.yaml`):
+
+```yaml
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: 'otel-collector'
+    static_configs:
+      - targets: ['otel-collector:8889']
+```
+
+## Custom Instrumentation
+
+If you're building custom integrations, you can use the global tracer:
+
+```go
+import (
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+)
+
+tracer := otel.Tracer("my-component")
+ctx, span := tracer.Start(ctx, "operation-name")
+defer span.End()
+
+span.SetAttributes(
+	attribute.String("key", "value"),
+)
+
+// Your code here
+```
+
+## Troubleshooting
+
+### Tracing not working
+
+1. Verify the OpenTelemetry collector is running and accessible
+2. Check the endpoint configuration matches your collector
+3. Ensure firewall rules allow connections to port 4317
+4. 
Check Sablier logs for tracing initialization errors
+
+### High memory usage
+
+If you experience high memory usage:
+- Reduce the batch size in the collector
+- Increase the export interval
+- Filter traces and metrics at the collector level
+
+### Missing traces
+
+- Ensure `tracing.enabled=true` is set
+- Verify the collector is configured to receive OTLP data
+- Check that the correct port (4317 for gRPC) is being used
diff --git a/go.mod b/go.mod
index 4b10ca3..10736bb 100644
--- a/go.mod
+++ b/go.mod
@@ -29,7 +29,18 @@ require (
 	k8s.io/client-go v0.34.2
 )
 
-require github.com/containers/image/v5 v5.36.2
+require (
+	github.com/containers/image/v5 v5.36.2
+	go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin v0.63.0
+	go.opentelemetry.io/otel v1.38.0
+	go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0
+	go.opentelemetry.io/otel/metric v1.38.0
+	go.opentelemetry.io/otel/sdk v1.38.0
+	go.opentelemetry.io/otel/sdk/metric v1.38.0
+	go.opentelemetry.io/otel/trace v1.38.0
+)
 
 require (
 	dario.cat/mergo v1.0.2 // indirect
@@ -51,6 +62,7 @@ require (
 	github.com/bytedance/sonic v1.14.0 // indirect
 	github.com/bytedance/sonic/loader v0.3.0 // indirect
 	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
+	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
 	github.com/charmbracelet/bubbles v0.20.0 // indirect
 	github.com/charmbracelet/bubbletea v1.3.3 // indirect
 	github.com/charmbracelet/lipgloss v1.0.0 // indirect
@@ -89,10 +101,10 @@ require (
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
-	github.com/gabriel-vasile/mimetype v1.4.8 // indirect
+	github.com/gabriel-vasile/mimetype v1.4.10 // indirect
 	github.com/gin-contrib/sse v1.1.0 // indirect
 	github.com/go-delve/delve v1.24.0 // indirect
-	github.com/go-jose/go-jose/v4 v4.0.5 // indirect
+	github.com/go-jose/go-jose/v4 v4.1.1 // indirect
 	github.com/go-json-experiment/json v0.0.0-20250213060926-925ba3f173fa // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
@@ -116,6 +128,7 @@ require (
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/gorilla/mux v1.8.1 // indirect
 	github.com/gorilla/schema v1.4.1 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/hashicorp/go-multierror v1.1.1 // indirect
 	github.com/hpcloud/tail v1.0.0 // indirect
@@ -211,10 +224,8 @@ require (
 	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	go.opencensus.io v0.24.0 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
-	go.opentelemetry.io/otel v1.35.0 // indirect
-	go.opentelemetry.io/otel/metric v1.35.0 // indirect
-	go.opentelemetry.io/otel/trace v1.35.0 // indirect
+	go.opentelemetry.io/proto/otlp v1.7.1 // indirect
 	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	golang.org/x/arch v0.20.0 // indirect
@@ -228,9 +239,9 @@ require (
 	golang.org/x/text v0.30.0 // indirect
 	golang.org/x/time v0.11.0 // indirect
 	golang.org/x/tools v0.37.0 // indirect
-	google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect
-	google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect
-	google.golang.org/grpc v1.72.2 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect
+	google.golang.org/grpc v1.75.0 // indirect
 	google.golang.org/protobuf v1.36.9 // indirect
 	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
 	gopkg.in/fsnotify.v1 v1.4.7 // indirect
diff --git a/go.sum b/go.sum
index 9fc2f12..78a1afa 
100644 --- a/go.sum +++ b/go.sum @@ -48,6 +48,8 @@ github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZw github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -164,14 +166,16 @@ github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sa github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= +github.com/gabriel-vasile/mimetype v1.4.10 h1:zyueNbySn/z8mJZHLt6IPw0KoZsiQNszIpU+bX4+ZK0= +github.com/gabriel-vasile/mimetype v1.4.10/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= github.com/gin-contrib/sse v1.1.0 h1:n0w2GMuUpWDVp7qSpvze6fAu9iRxJY4Hmj6AmBOU05w= github.com/gin-contrib/sse v1.1.0/go.mod h1:hxRZ5gVpWMT7Z0B0gSNYqqsSCNIJMjzvm6fqCz9vjwM= github.com/gin-gonic/gin v1.11.0 h1:OW/6PLjyusp2PPXtyxKHU0RbX6I/l28FTdDlae5ueWk= github.com/gin-gonic/gin v1.11.0/go.mod h1:+iq/FyxlGzII0KHiBGjuNn4UNENUlKbGlNmc+W50Dls= github.com/go-delve/delve v1.24.0 h1:M1auuI7kyfXZm5LMDQEqhqr4koKWOzGKhCgwMxsLQfo= github.com/go-delve/delve v1.24.0/go.mod h1:yNWXOuo4yslMOOj7O8gIRrf/trDBrFy5ZXwJL4ZzOos= -github.com/go-jose/go-jose/v4 v4.0.5 
h1:M6T8+mKZl/+fNNuFHvGIzDz7BTLQPIounk/b9dw3AaE= -github.com/go-jose/go-jose/v4 v4.0.5/go.mod h1:s3P1lRrkT8igV8D9OjyL4WRyHvjB6a4JSllnOrmmBOA= +github.com/go-jose/go-jose/v4 v4.1.1 h1:JYhSgy4mXXzAdF3nUx3ygx347LRXJRrpgyU3adRmkAI= +github.com/go-jose/go-jose/v4 v4.1.1/go.mod h1:BdsZGqgdO3b6tTc6LSE56wcDbMMLuPsw5d4ZD5f94kA= github.com/go-json-experiment/json v0.0.0-20250213060926-925ba3f173fa h1:Rpu6sKAzIeSWBkrFHD52g8yipagcPbY2Lmm70NL1Gzc= github.com/go-json-experiment/json v0.0.0-20250213060926-925ba3f173fa/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -254,8 +258,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/schema v1.4.1 h1:jUg5hUjCSDZpNGLuXQOgIWGdlgrIdYvgQ0wZtdK1M3E= github.com/gorilla/schema v1.4.1/go.mod h1:Dg5SSm5PV60mhF2NFaTV1xuYYj8tV8NOPRo4FggUMnM= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3ArSgIyScOAyMRqBxRg= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -535,24 +539,30 @@ go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod 
h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin v0.63.0 h1:5kSIJ0y8ckZZKoDhZHdVtcyjVi6rXyAwyaR8mp4zLbg= +go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin v0.63.0/go.mod h1:i+fIMHvcSQtsIY82/xgiVWRklrNt/O6QriHLjzGeY+s= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 h1:IJFEoHiytixx8cMiVAO+GmHR6Frwu+u5Ur8njpFO6Ac= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0/go.mod h1:3rHrKNtLIoS0oZwkY2vxi+oJcwFRWdtUyRII+so45p8= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0 h1:xJ2qHD0C1BeYVTLLR9sX12+Qb95kfeD/byKj6Ky1pXg= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0/go.mod h1:u5BF1xyjstDowA1R5QAO9JHzqK+ublenEW/dyqTjBVk= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= -go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= -go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= -go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= -go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/automaxprocs 
v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -693,22 +703,24 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY= 
+google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.72.2 h1:TdbGzwb82ty4OusHWepvFWGLgIbNo1/SUynEN0ssqv8= -google.golang.org/grpc v1.72.2/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= +google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= +google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/internal/server/server.go b/internal/server/server.go index 5f172bc..c051e96 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -11,6 +11,7 @@ import ( "github.com/gin-gonic/gin" "github.com/sablierapp/sablier/internal/api" "github.com/sablierapp/sablier/pkg/config" + "go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin" ) func setupRouter(ctx context.Context, logger *slog.Logger, serverConf config.Server, s *api.ServeStrategy) 
*gin.Engine {
@@ -18,6 +19,7 @@ func setupRouter(ctx context.Context, logger *slog.Logger, serverConf config.Ser
 
 	r.Use(StructuredLogger(logger))
 	r.Use(gin.Recovery())
+	r.Use(otelgin.Middleware("sablier"))
 
 	registerRoutes(ctx, r, serverConf, s)
 
diff --git a/pkg/config/configuration.go b/pkg/config/configuration.go
index d40bfc9..0bcc55f 100644
--- a/pkg/config/configuration.go
+++ b/pkg/config/configuration.go
@@ -7,6 +7,7 @@ type Config struct {
 	Sessions Sessions
 	Logging  Logging
 	Strategy Strategy
+	Tracing  Tracing
 }
 
 func NewConfig() Config {
@@ -17,5 +18,6 @@ func NewConfig() Config {
 		Sessions: NewSessionsConfig(),
 		Logging:  NewLoggingConfig(),
 		Strategy: NewStrategyConfig(),
+		Tracing:  NewTracingConfig(),
 	}
 }
diff --git a/pkg/config/tracing.go b/pkg/config/tracing.go
new file mode 100644
index 0000000..798f52a
--- /dev/null
+++ b/pkg/config/tracing.go
@@ -0,0 +1,17 @@
+package config
+
+// Tracing holds the OpenTelemetry settings: Enabled switches telemetry on and
+// Endpoint is the address of the collector to export to.
+type Tracing struct {
+	Enabled  bool   `mapstructure:"ENABLED" yaml:"enabled,omitempty" default:"false"`
+	Endpoint string `mapstructure:"ENDPOINT" yaml:"endpoint,omitempty" default:"localhost:4317"`
+}
+
+// NewTracingConfig returns the default tracing settings: telemetry disabled
+// and the endpoint set to localhost:4317.
+func NewTracingConfig() Tracing {
+	return Tracing{
+		Enabled:  false,
+		Endpoint: "localhost:4317",
+	}
+}
diff --git a/pkg/sabliercmd/root.go b/pkg/sabliercmd/root.go
index acf920d..8a28c1a 100644
--- a/pkg/sabliercmd/root.go
+++ b/pkg/sabliercmd/root.go
@@ -83,6 +83,12 @@ It provides integrations with multiple reverse proxies and different loading str
 	startCmd.Flags().DurationVar(&conf.Strategy.Blocking.DefaultRefreshFrequency, "strategy.blocking.default-refresh-frequency", 5*time.Second, "Default refresh frequency at which the instances status are checked for blocking strategy")
 	_ = viper.BindPFlag("strategy.blocking.default-refresh-frequency", startCmd.Flags().Lookup("strategy.blocking.default-refresh-frequency"))
 
+	// Tracing flags
+	startCmd.Flags().BoolVar(&conf.Tracing.Enabled, "tracing.enabled", false, "Enable OpenTelemetry tracing and metrics")
+	_ = viper.BindPFlag("tracing.enabled", startCmd.Flags().Lookup("tracing.enabled"))
+	startCmd.Flags().StringVar(&conf.Tracing.Endpoint, "tracing.endpoint", "localhost:4317", "OpenTelemetry collector endpoint")
+	_ = viper.BindPFlag("tracing.endpoint", startCmd.Flags().Lookup("tracing.endpoint"))
+
 	rootCmd.AddCommand(startCmd)
 	rootCmd.AddCommand(NewVersionCmd())
 
diff --git a/pkg/sabliercmd/start.go b/pkg/sabliercmd/start.go
index bdc2052..8386ca1 100644
--- a/pkg/sabliercmd/start.go
+++ b/pkg/sabliercmd/start.go
@@ -13,6 +13,7 @@ import (
 	"github.com/sablierapp/sablier/pkg/config"
 	"github.com/sablierapp/sablier/pkg/sablier"
 	"github.com/sablierapp/sablier/pkg/store/inmemory"
+	"github.com/sablierapp/sablier/pkg/tracing"
 	"github.com/sablierapp/sablier/pkg/version"
 	"github.com/spf13/cobra"
 	"github.com/spf13/viper"
@@ -45,6 +46,26 @@ func Start(ctx context.Context, conf config.Config) error {
 
 	logger.Info("running Sablier version " + version.Info())
 
+	// Initialize tracing
+	tracingConfig := tracing.Config{
+		ServiceName:    "sablier",
+		ServiceVersion: version.Version,
+		Endpoint:       conf.Tracing.Endpoint,
+		Enabled:        conf.Tracing.Enabled,
+	}
+
+	tracer, err := tracing.New(ctx, tracingConfig, logger)
+	if err != nil {
+		return fmt.Errorf("failed to initialize tracing: %w", err)
+	}
+	defer func() {
+		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := tracer.Shutdown(shutdownCtx); err != nil {
+			logger.Error("failed to shutdown tracing", "error", err)
+		}
+	}()
+
 	provider, err := setupProvider(ctx, logger, conf.Provider)
 	if err != nil {
 		return fmt.Errorf("cannot setup provider: %w", err)
diff --git a/pkg/tracing/metrics.go b/pkg/tracing/metrics.go
new file mode 100644
index 0000000..0cec067
--- /dev/null
+++ b/pkg/tracing/metrics.go
@@ -0,0 +1,119 @@
+package tracing
+
+import (
+	"context"
+	"fmt"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+)
+
+// Metrics groups the OpenTelemetry instruments Sablier records into. Every
+// Record method is a no-op on a nil *Metrics.
+type Metrics struct {
+	sessionsActive   metric.Int64UpDownCounter
+	sessionsTotal    metric.Int64Counter
+	instancesStarted metric.Int64Counter
+	instancesStopped metric.Int64Counter
+	requestsDuration metric.Float64Histogram
+}
+
+// InitMetrics creates the Sablier instruments on the global meter named
+// "sablier". A failure is reported with the name of the instrument that could
+// not be created.
+func InitMetrics() (*Metrics, error) {
+	meter := otel.Meter("sablier")
+
+	sessionsActive, err := meter.Int64UpDownCounter("sablier.sessions.active",
+		metric.WithDescription("Number of currently active sessions"))
+	if err != nil {
+		return nil, fmt.Errorf("creating sablier.sessions.active instrument: %w", err)
+	}
+
+	sessionsTotal, err := meter.Int64Counter("sablier.sessions.total",
+		metric.WithDescription("Total number of sessions created"))
+	if err != nil {
+		return nil, fmt.Errorf("creating sablier.sessions.total instrument: %w", err)
+	}
+
+	instancesStarted, err := meter.Int64Counter("sablier.instances.started",
+		metric.WithDescription("Total number of instances started"))
+	if err != nil {
+		return nil, fmt.Errorf("creating sablier.instances.started instrument: %w", err)
+	}
+
+	instancesStopped, err := meter.Int64Counter("sablier.instances.stopped",
+		metric.WithDescription("Total number of instances stopped"))
+	if err != nil {
+		return nil, fmt.Errorf("creating sablier.instances.stopped instrument: %w", err)
+	}
+
+	requestsDuration, err := meter.Float64Histogram("sablier.requests.duration",
+		metric.WithDescription("Duration of requests in milliseconds"),
+		metric.WithUnit("ms"))
+	if err != nil {
+		return nil, fmt.Errorf("creating sablier.requests.duration instrument: %w", err)
+	}
+
+	return &Metrics{
+		sessionsActive:   sessionsActive,
+		sessionsTotal:    sessionsTotal,
+		instancesStarted: instancesStarted,
+		instancesStopped: instancesStopped,
+		requestsDuration: requestsDuration,
+	}, nil
+}
+
+// RecordSessionStart counts a newly created session for strategy: it
+// increments both the active-sessions and the total-sessions instruments.
+func (m *Metrics) RecordSessionStart(ctx context.Context, strategy string) {
+	if m == nil {
+		return
+	}
+	attrs := metric.WithAttributes(attribute.String("strategy", strategy))
+	m.sessionsActive.Add(ctx, 1, attrs)
+	m.sessionsTotal.Add(ctx, 1, attrs)
+}
+
+// RecordSessionEnd decrements the active-sessions instrument for strategy.
+func (m *Metrics) RecordSessionEnd(ctx context.Context, strategy string) {
+	if m == nil {
+		return
+	}
+	m.sessionsActive.Add(ctx, -1, metric.WithAttributes(
+		attribute.String("strategy", strategy),
+	))
+}
+
+// RecordInstanceStart counts one started instance for provider.
+func (m *Metrics) RecordInstanceStart(ctx context.Context, provider string) {
+	if m == nil {
+		return
+	}
+	m.instancesStarted.Add(ctx, 1, metric.WithAttributes(
+		attribute.String("provider", provider),
+	))
+}
+
+// RecordInstanceStop counts one stopped instance for provider.
+func (m *Metrics) RecordInstanceStop(ctx context.Context, provider string) {
+	if m == nil {
+		return
+	}
+	m.instancesStopped.Add(ctx, 1, metric.WithAttributes(
+		attribute.String("provider", provider),
+	))
+}
+
+// RecordRequestDuration records a request duration, in milliseconds, tagged
+// with the strategy and status attributes.
+func (m *Metrics) RecordRequestDuration(ctx context.Context, duration float64, strategy string, status string) {
+	if m == nil {
+		return
+	}
+	m.requestsDuration.Record(ctx, duration, metric.WithAttributes(
+		attribute.String("strategy", strategy),
+		attribute.String("status", status),
+	))
+}
diff --git a/pkg/tracing/tracing.go b/pkg/tracing/tracing.go
new file mode 100644
index 0000000..646b252
--- /dev/null
+++ b/pkg/tracing/tracing.go
@@ -0,0 +1,142 @@
+package tracing
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/resource"
+	"go.opentelemetry.io/otel/sdk/trace"
+	semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
+)
+
+// Config carries the settings New needs: the service name and version set on
+// the telemetry resource, the OTLP gRPC endpoint, and the enable switch.
+type Config struct {
+	ServiceName    string
+	ServiceVersion string
+	Endpoint       string
+	Enabled        bool
+}
+
+// Telemetry holds the tracer and meter providers created by New so they can be
+// shut down together. Both are nil when telemetry is disabled.
+type Telemetry struct {
+	tracerProvider *trace.TracerProvider
+	meterProvider  *metric.MeterProvider
+	logger         *slog.Logger
+}
+
+// New initializes OpenTelemetry tracing and metrics.
+//
+// When cfg.Enabled is false it returns a Telemetry without providers, whose
+// Shutdown does nothing. Otherwise it creates OTLP gRPC trace and metric
+// exporters for cfg.Endpoint (plaintext, no TLS) and registers the providers
+// and propagators globally. Registration happens only after every component
+// was created, and the tracer provider is shut down again if the metric
+// exporter cannot be created, so a failed call does not leave it behind.
+func New(ctx context.Context, cfg Config, logger *slog.Logger) (*Telemetry, error) {
+	if !cfg.Enabled {
+		logger.Info("OpenTelemetry disabled")
+		return &Telemetry{logger: logger}, nil
+	}
+
+	res, err := resource.New(ctx,
+		resource.WithAttributes(
+			semconv.ServiceName(cfg.ServiceName),
+			semconv.ServiceVersion(cfg.ServiceVersion),
+		),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create resource: %w", err)
+	}
+
+	// Setup Tracer Provider
+	traceExporter, err := otlptrace.New(ctx, otlptracegrpc.NewClient(
+		otlptracegrpc.WithEndpoint(cfg.Endpoint),
+		otlptracegrpc.WithInsecure(),
+	))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create trace exporter: %w", err)
+	}
+
+	tracerProvider := trace.NewTracerProvider(
+		trace.WithBatcher(traceExporter),
+		trace.WithResource(res),
+	)
+
+	// Setup Meter Provider
+	metricExporter, err := otlpmetricgrpc.New(ctx,
+		otlpmetricgrpc.WithEndpoint(cfg.Endpoint),
+		otlpmetricgrpc.WithInsecure(),
+	)
+	if err != nil {
+		err = fmt.Errorf("failed to create metric exporter: %w", err)
+		// Release the tracer provider built above instead of leaking it.
+		if shutdownErr := tracerProvider.Shutdown(ctx); shutdownErr != nil {
+			err = errors.Join(err, fmt.Errorf("failed to shutdown tracer provider: %w", shutdownErr))
+		}
+		return nil, err
+	}
+
+	meterProvider := metric.NewMeterProvider(
+		metric.WithReader(metric.NewPeriodicReader(metricExporter,
+			metric.WithInterval(10*time.Second))),
+		metric.WithResource(res),
+	)
+
+	// Publish the providers and propagators only now that setup succeeded.
+	otel.SetTracerProvider(tracerProvider)
+	otel.SetMeterProvider(meterProvider)
+	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
+		propagation.TraceContext{},
+		propagation.Baggage{},
+	))
+
+	logger.Info("OpenTelemetry initialized",
+		"service", cfg.ServiceName,
+		"version", cfg.ServiceVersion,
+		"endpoint", cfg.Endpoint)
+
+	return &Telemetry{
+		tracerProvider: tracerProvider,
+		meterProvider:  meterProvider,
+		logger:         logger,
+	}, nil
+}
+
+// Shutdown shuts down the tracer and meter providers.
+//
+// It does nothing on a nil receiver or when telemetry is disabled. Both
+// providers are always shut down; each failure is logged and all failures are
+// returned together via errors.Join, so one does not hide the other.
+func (t *Telemetry) Shutdown(ctx context.Context) error {
+	if t == nil {
+		return nil
+	}
+
+	var errs []error
+	if t.tracerProvider != nil {
+		if err := t.tracerProvider.Shutdown(ctx); err != nil {
+			t.logger.Error("failed to shutdown tracer provider", "error", err)
+			errs = append(errs, err)
+		}
+	}
+	if t.meterProvider != nil {
+		if err := t.meterProvider.Shutdown(ctx); err != nil {
+			t.logger.Error("failed to shutdown meter provider", "error", err)
+			errs = append(errs, err)
+		}
+	}
+
+	// errors.Join returns nil when no error was collected.
+	return errors.Join(errs...)
+}
diff --git a/sablier.sample.yaml b/sablier.sample.yaml
index ced527e..39841c5 100644
--- a/sablier.sample.yaml
+++ b/sablier.sample.yaml
@@ -10,6 +10,10 @@ sessions:
   expiration-interval: 20s
 logging:
   level: info
+# OpenTelemetry tracing and metrics configuration
+tracing:
+  enabled: false
+  endpoint: "localhost:4317" # OpenTelemetry collector endpoint
 strategy:
   dynamic:
     custom-themes-path: