From feaa462071da88bc352e58b5000d7ff911a972a1 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Fri, 12 Jan 2024 09:28:10 +0100 Subject: [PATCH] feat: Add tracing integration to profiling. (#11633) Same as https://github.com/grafana/tempo/pull/3276 this adds profiling integration to tracing instrumentation allowing to get profile for a single request removing the noise of everything else. --- CHANGELOG.md | 3 +- cmd/loki/main.go | 6 +- pkg/tracing/config.go | 4 +- .../grafana/dskit/spanprofiler/README.md | 104 +++++++++++++++++ .../dskit/spanprofiler/spanprofiler.go | 107 +++++++++++++++++ .../grafana/dskit/spanprofiler/tracer.go | 109 ++++++++++++++++++ vendor/modules.txt | 1 + 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/README.md create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/tracer.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 46e9a24daf1ef..0e723e64176c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ ##### Enhancements -* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester +* [11633](https://github.com/grafana/loki/pull/11633) **cyriltovena**: Add profiling integrations to tracing instrumentation. +* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester * [11477](https://github.com/grafana/loki/pull/11477) **MichelHollands**: support GET for /ingester/shutdown * [11363](https://github.com/grafana/loki/pull/11363) **kavirajk**: bugfix(memcached): Make memcached batch fetch truely context aware. * [11319](https://github.com/grafana/loki/pull/11319) **someStrangerFromTheAbyss**: Helm: Add extraContainers to the write pods. diff --git a/cmd/loki/main.go b/cmd/loki/main.go index 845104eee8de5..937a5c16fab80 100644 --- a/cmd/loki/main.go +++ b/cmd/loki/main.go @@ -10,7 +10,9 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/log" + "github.com/grafana/dskit/spanprofiler" "github.com/grafana/dskit/tracing" + "github.com/opentracing/opentracing-go" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/version" @@ -84,7 +86,9 @@ func main() { if err != nil { level.Error(util_log.Logger).Log("msg", "error in initializing tracing. tracing will not be enabled", "err", err) } - + if config.Tracing.ProfilingEnabled { + opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + } defer func() { if trace != nil { if err := trace.Close(); err != nil { diff --git a/pkg/tracing/config.go b/pkg/tracing/config.go index 1c97d88a845df..f9faefa6a7303 100644 --- a/pkg/tracing/config.go +++ b/pkg/tracing/config.go @@ -5,7 +5,8 @@ import ( ) type Config struct { - Enabled bool `yaml:"enabled"` + Enabled bool `yaml:"enabled"` + ProfilingEnabled bool `yaml:"profiling_enabled" category:"experimental" doc:"hidden"` } func (cfg *Config) RegisterFlags(f *flag.FlagSet) { @@ -14,4 +15,5 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.BoolVar(&cfg.Enabled, prefix+"tracing.enabled", true, "Set to false to disable tracing.") + f.BoolVar(&cfg.ProfilingEnabled, prefix+"tracing.profiling-enabled", true, "Set to true to enable profiling integration.") } diff --git a/vendor/github.com/grafana/dskit/spanprofiler/README.md b/vendor/github.com/grafana/dskit/spanprofiler/README.md new file mode 100644 index 0000000000000..a415985f6649e --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/README.md @@ -0,0 +1,104 @@ +# Span Profiler for OpenTracing-Go + +## Overview + +The Span Profiler for OpenTracing-Go is a package that seamlessly integrates `opentracing-go` instrumentation with +profiling through the use of pprof labels. + +Accessing trace span profiles is made convenient through the Grafana Explore view. You can find a complete example setup +with Grafana Tempo in the [Pyroscope repository](https://github.com/grafana/pyroscope/tree/main/examples/tracing/tempo): + +![image](https://github.com/grafana/otel-profiling-go/assets/12090599/31e33cd1-818b-4116-b952-c9ec7b1fb593) + +## Usage + +There are two primary ways to use the Span Profiler: + +### 1. Wrap the Global Tracer. + +You can wrap the global tracer using `spanprofiler.NewTracer`: + +```go +import ( + "github.com/opentracing/opentracing-go" + "github.com/grafana/dskit/spanprofiler" +) + +func main() { + // Initialize your OpenTracing tracer + tracer := opentracing.GlobalTracer() + // Wrap it with the tracer-profiler + wrappedTracer := spanprofiler.NewTracer(tracer) + // Use the wrapped tracer in your application + opentracing.SetGlobalTracer(wrappedTracer) + + // Or, as an oneliner: + // opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + + // Your application logic here +} +``` + +For efficiency, the tracer selectively records profiles for _root_ spans — the initial _local_ span in a process — since +a trace may encompass thousands of spans. All stack trace samples accumulated during the execution of their child spans +contribute to the root span's profile. In practical terms, this signifies that, for instance, an HTTP request results +in a singular profile, irrespective of the numerous spans within the trace. It's important to note that these profiles +don't extend beyond the boundaries of a single process. + +The limitation of this approach is that only spans created within the same goroutine, or its children, as the parent are +taken into account. Consequently, in scenarios involving asynchronous execution, where the parent span context is passed +to another goroutine, explicit profiling becomes necessary using `spanprofiler.StartSpanFromContext`. + +### 2. Profile individual spans. + +The `spanprofiler.StartSpanFromContext` function allows you to granularly control which spans to profile: + +```go +func YourOperationName(ctx context.Background()) { + // Start a span and enable profiling for it + span, ctx := spanprofiler.StartSpanFromContext(ctx, "YourOperationName", tracer) + defer span.Finish() // Finish the span when done + + // Use the span in your application logic +} +``` + +The function guarantees that the span is to be profiled. + +Both methods can be employed either in conjunction or independently. Our recommendation is to utilize the tracer for +seamless integration, reserving explicit span profiling only for cases where spans are spawned in detached goroutines. + +## Implementation details + +When a new trace span is created, and is eligible for profiling, the tracer sets `span_id` and `span_name` [pprof labels](https://github.com/google/pprof/blob/master/doc/README.md#tag-filtering) +that point to the respective span. These labels are stored in the goroutine's local storage and inherited by any +subsequent child goroutines. + +`span_name` is available as a regular label and can be used in the query expressions. For example, the following query +will show you profile for the code that is not covered with traces: +``` +{service_name="my-service",span_name=""} +``` + +Additionally, trace spans are identified by the `pyroscope.profile.id` attribute, indicating the associated profile. +This allows to find such spans in the trace view (in the screenshot) and fetch profiles for specific spans. + +It's important to note that the presence of this attribute does not guarantee profile availability; stack trace samples +might not be collected if the CPU time utilized falls below the sample interval (10ms). + +It is crucial to understand that this module doesn't directly control the pprof profiler; its initialization is still +necessary for profile collection. This initialization can be achieved through the `runtime/pprof` package, or using the +[Pyroscope client](https://github.com/grafana/pyroscope-go). + +Limitations: + - Only CPU profiling is fully supported at the moment. + - Only [Jaeger tracer](https://github.com/jaegertracing/jaeger-client-go) implementation is supported. + +## Performance implications + +The typical performance impact is generally imperceptible and primarily arises from the cost of pprof labeling. However, +intensive use of pprof labels may have negative impact on the profiled application. + +In the case of the tracer provided by this package, the `StartSpan` method wrapper introduces an approximate 20% increase +in CPU time compared to the original call. In vase majority of cases, the overhead constitutes less than 0.01% of the total +CPU time and is considered safe for deployment in production systems. diff --git a/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go new file mode 100644 index 0000000000000..8481d04498d5a --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go @@ -0,0 +1,107 @@ +package spanprofiler + +import ( + "context" + "runtime/pprof" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +// StartSpanFromContext starts and returns a Span with `operationName`, using +// any Span found within `ctx` as a ChildOfRef. If no such parent could be +// found, StartSpanFromContext creates a root (parentless) Span. +// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// The second return value is a context.Context object built around the +// returned Span. +// +// Example usage: +// +// SomeFunction(ctx context.Context, ...) { +// sp, ctx := opentracing.StartSpanFromContext(ctx, "SomeFunction") +// defer sp.Finish() +// ... +// } +func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + return StartSpanFromContextWithTracer(ctx, opentracing.GlobalTracer(), operationName, opts...) +} + +// StartSpanFromContextWithTracer starts and returns a span with `operationName` +// using a span found within the context as a ChildOfRef. If that doesn't exist +// it creates a root span. It also returns a context.Context object built +// around the returned span. +// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// It's behavior is identical to StartSpanFromContext except that it takes an explicit +// tracer as opposed to using the global tracer. +func StartSpanFromContextWithTracer(ctx context.Context, tracer opentracing.Tracer, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + span, ctx := opentracing.StartSpanFromContextWithTracer(ctx, tracer, operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if ok { + span = wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) + } + return span, ctx +} + +func wrapJaegerSpanWithGoroutineLabels( + parentCtx context.Context, + span opentracing.Span, + operationName string, + spanID string, +) *spanWrapper { + // Note that pprof labels are propagated through the goroutine's local + // storage and are always copied to child goroutines. This way, stack + // trace samples collected during execution of child spans will be taken + // into account at the root. + var ctx context.Context + if spanID != "" { + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName, + spanIDLabelName, spanID)) + } else { + // Even if the trace has not been sampled, we still need to keep track + // of samples that belong to the span (all spans with the given name). + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName)) + } + // Goroutine labels should be set as early as possible, + // in order to capture the overhead of the function call. + pprof.SetGoroutineLabels(ctx) + // We create a span wrapper to ensure we remove the newly attached pprof + // labels when span finishes. The need of this wrapper is questioned: + // as we do not have the original context, we could leave the goroutine + // labels – normally, span is finished at the very end of the goroutine's + // lifetime, so no significant side effects should take place. + w := spanWrapper{ + parentPprofCtx: parentCtx, + currentPprofCtx: ctx, + } + w.Span = span.SetTag(profileIDTagKey, spanID) + return &w +} + +type spanWrapper struct { + parentPprofCtx context.Context + currentPprofCtx context.Context + opentracing.Span +} + +func (s *spanWrapper) Finish() { + s.Span.Finish() + pprof.SetGoroutineLabels(s.parentPprofCtx) + s.currentPprofCtx = s.parentPprofCtx +} + +// sampledSpanID returns the span ID, if the span is sampled, +// otherwise an empty string is returned. +func sampledSpanID(spanCtx jaeger.SpanContext) string { + if spanCtx.IsSampled() { + return spanCtx.SpanID().String() + } + return "" +} diff --git a/vendor/github.com/grafana/dskit/spanprofiler/tracer.go b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go new file mode 100644 index 0000000000000..c28b52b11d444 --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go @@ -0,0 +1,109 @@ +package spanprofiler + +import ( + "context" + "unsafe" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +const ( + profileIDTagKey = "pyroscope.profile.id" + + spanIDLabelName = "span_id" + spanNameLabelName = "span_name" +) + +type tracer struct{ opentracing.Tracer } + +// NewTracer creates a new opentracing.Tracer with the span profiler integrated. +// +// For efficiency, the tracer selectively records profiles for _root_ spans +// — the initial _local_ span in a process — since a trace may encompass +// thousands of spans. All stack trace samples accumulated during the execution +// of their child spans contribute to the root span's profile. In practical +// terms, this signifies that, for instance, an HTTP request results in a +// singular profile, irrespective of the numerous spans within the trace. It's +// important to note that these profiles don't extend beyond the boundaries of +// a single process. +// +// The limitation of this approach is that only spans created within the same +// goroutine, or its children, as the parent are taken into account. +// Consequently, in scenarios involving asynchronous execution, where the parent +// span context is passed to another goroutine, explicit profiling becomes +// necessary using `spanprofiler.StartSpanFromContext`. +func NewTracer(tr opentracing.Tracer) opentracing.Tracer { return &tracer{tr} } + +func (t *tracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span { + span := t.Tracer.StartSpan(operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if !ok { + return span + } + // pprof labels are attached only once, at the span root level. + if !isRootSpan(opts...) { + return span + } + // The pprof label API assumes that pairs of labels are passed through the + // context. Unfortunately, the opentracing Tracer API doesn't match this + // concept: this makes it impossible to save an existing pprof context and + // all the original pprof labels associated with the goroutine. + ctx := context.Background() + return wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) +} + +// isRootSpan reports whether the span is a root span. +// +// There are only two valid cases: if the span is the first span in the trace, +// or is the first _local_ span in the trace. +// +// An exception is made for FollowsFrom reference: spans without an explicit +// parent are considered as root ones. +func isRootSpan(opts ...opentracing.StartSpanOption) bool { + parent, ok := parentSpanContextFromRef(opts...) + return !ok || isRemoteSpan(parent) +} + +// parentSpanContextFromRef returns the first parent reference. +func parentSpanContextFromRef(options ...opentracing.StartSpanOption) (sc jaeger.SpanContext, ok bool) { + var sso opentracing.StartSpanOptions + for _, option := range options { + option.Apply(&sso) + } + for _, ref := range sso.References { + if ref.Type == opentracing.ChildOfRef && ref.ReferencedContext != nil { + sc, ok = ref.ReferencedContext.(jaeger.SpanContext) + return sc, ok + } + } + return sc, ok +} + +// isRemoteSpan reports whether the span context represents a remote parent. +// +// NOTE(kolesnikovae): this is ugly, but the only reliable method I found. +// The opentracing-go package and Jaeger client are not meant to change as +// both are deprecated. +func isRemoteSpan(c jaeger.SpanContext) bool { + jaegerCtx := *(*jaegerSpanCtx)(unsafe.Pointer(&c)) + return jaegerCtx.remote +} + +// jaegerSpanCtx represents memory layout of the jaeger.SpanContext type. +type jaegerSpanCtx struct { + traceID [16]byte // TraceID + spanID [8]byte // SpanID + parentID [8]byte // SpanID + baggage uintptr // map[string]string + debugID [2]uintptr // string + + // samplingState is a pointer to a struct that has "localRootSpan" member, + // which we could probably use: that would allow omitting quite expensive + // parentSpanContextFromRef call. However, interpreting the pointer and + // the complex struct memory layout is more complicated and dangerous. + samplingState uintptr + + // remote indicates that span context represents a remote parent + remote bool +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 8ce4557f461ba..b69a2f1e5315d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -894,6 +894,7 @@ github.com/grafana/dskit/server github.com/grafana/dskit/services github.com/grafana/dskit/signals github.com/grafana/dskit/spanlogger +github.com/grafana/dskit/spanprofiler github.com/grafana/dskit/tenant github.com/grafana/dskit/test github.com/grafana/dskit/tracing