Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding new inferred sampler #12927

Closed
wants to merge 14 commits into from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Allow setting KEYSTORE_PASSWORD through env variable ([#12865](https://github.com/opensearch-project/OpenSearch/pull/12865))
- [Concurrent Segment Search] Perform buildAggregation concurrently and support Composite Aggregations ([#12697](https://github.com/opensearch-project/OpenSearch/pull/12697))
- [Concurrent Segment Search] Disable concurrent segment search for system indices and throttled requests ([#12954](https://github.com/opensearch-project/OpenSearch/pull/12954))
- Tracing Framework] Adds support for inferred sampling ([#12315](https://github.com/opensearch-project/OpenSearch/issues/12315))

### Dependencies
- Bump `org.apache.commons:commons-configuration2` from 2.10.0 to 2.10.1 ([#12896](https://github.com/opensearch-project/OpenSearch/pull/12896))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
import org.opensearch.core.common.io.stream.Writeable;
import org.opensearch.core.common.transport.TransportAddress;

import java.util.Collections;
import java.util.Map;

/**
* Message over the transport interface
*
Expand All @@ -45,14 +48,24 @@ public abstract class TransportMessage implements Writeable {

private TransportAddress remoteAddress;

private Map<String, String> header = Collections.emptyMap();

public void remoteAddress(TransportAddress remoteAddress) {
this.remoteAddress = remoteAddress;
}

public void setResponseHeaders(Map<String, String> header) {
this.header = header;
}

public TransportAddress remoteAddress() {
return remoteAddress;
}

public Map<String, String> getResponseHeaders() {
return header;
}

/**
* Constructs a new empty transport message
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package org.opensearch.telemetry.tracing;

import org.opensearch.common.annotation.InternalApi;
import org.opensearch.telemetry.tracing.attributes.SamplingAttributes;

import java.io.Closeable;
import java.io.IOException;
Expand Down Expand Up @@ -52,8 +53,9 @@
} else {
parentSpan = getCurrentSpanInternal();
}

Span span = createSpan(context, parentSpan);
addDefaultAttributes(span);
addDefaultAttributes(parentSpan, span);
return span;
}

Expand Down Expand Up @@ -97,14 +99,32 @@
* Adds default attributes in the span
* @param span the current active span
*/
protected void addDefaultAttributes(Span span) {
protected void addDefaultAttributes(Span parentSpan, Span span) {
copyInheritableParentAttributes(parentSpan, span);
span.addAttribute(THREAD_NAME, Thread.currentThread().getName());
}

@Override
public Span startSpan(SpanCreationContext spanCreationContext, Map<String, Collection<String>> headers) {
Optional<Span> propagatedSpan = tracingTelemetry.getContextPropagator().extractFromHeaders(headers);
addRequestAttributeToContext(spanCreationContext, headers);
return startSpan(spanCreationContext.parent(propagatedSpan.map(SpanContext::new).orElse(null)));
}

private void addRequestAttributeToContext(SpanCreationContext spanCreationContext, Map<String, Collection<String>> headers) {
if (headers != null && headers.containsKey(SamplingAttributes.SAMPLER.getValue())) {
spanCreationContext.getAttributes()
.addAttribute(SamplingAttributes.SAMPLER.getValue(), SamplingAttributes.INFERRED_SAMPLER.getValue());

Check warning on line 117 in libs/telemetry/src/main/java/org/opensearch/telemetry/tracing/DefaultTracer.java

View check run for this annotation

Codecov / codecov/patch

libs/telemetry/src/main/java/org/opensearch/telemetry/tracing/DefaultTracer.java#L116-L117

Added lines #L116 - L117 were not covered by tests
}
}

private void copyInheritableParentAttributes(Span parentSpan, Span currentSpan) {
// This work as common attribute propagator from parent to child
if (parentSpan != null) {
Optional<String> inferredAttribute = Optional.ofNullable(parentSpan.getAttributeString(SamplingAttributes.SAMPLER.getValue()));
if (inferredAttribute.isPresent()) {
currentSpan.addAttribute(SamplingAttributes.SAMPLER.getValue(), SamplingAttributes.INFERRED_SAMPLER.getValue());
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,31 @@ public interface Span {
*/
String getSpanId();

/**
* *
* @param key for which we need to look for value
* @return string attribute value
*/
String getAttributeString(String key);
nishchay21 marked this conversation as resolved.
Show resolved Hide resolved

/**
* *
* @param key for which we need to look for value
* @return Boolean attribute value
*/
Boolean getAttributeBoolean(String key);

/**
* *
* @param key for which we need to look for value
* @return Long attribute value
*/
Long getAttributeLong(String key);

/**
* *
* @param key for which we need to look for value
* @return Double attribute value
*/
Double getAttributeDouble(String key);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.telemetry.tracing.attributes;

import org.opensearch.common.annotation.InternalApi;

import java.util.Locale;

/**
* Enum for Inferred Sampling*
* @opensearch.internal*
*/
@InternalApi
public enum SamplingAttributes {

/**
* Attribute added if the span is sampled by inferred sampler*
*/
INFERRED_SAMPLER,

/**
* Attribute Added if the span is an outlier*
*/
SAMPLED,

/**
* Sampler Used in the framework*
*/
SAMPLER;

/**
* returns lower case enum value*
* @return String
*/
public String getValue() {
return name().toLowerCase(Locale.ROOT);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,24 @@ public String getTraceId() {
public String getSpanId() {
return "noop-span-id";
}

@Override
public String getAttributeString(String key) {
return "";
}

@Override
public Boolean getAttributeBoolean(String key) {
return false;
}

@Override
public Long getAttributeLong(String key) {
return 0L;
}

@Override
public Double getAttributeDouble(String key) {
return 0.0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,17 @@
import org.opensearch.core.action.ActionListener;
import org.opensearch.node.Node;
import org.opensearch.telemetry.tracing.attributes.Attributes;
import org.opensearch.telemetry.tracing.attributes.SamplingAttributes;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.telemetry.tracing.MockSpan;
import org.opensearch.test.telemetry.tracing.MockTracingTelemetry;
import org.opensearch.threadpool.ThreadPool;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;

Expand Down Expand Up @@ -424,4 +429,59 @@ private SpanCreationContext buildSpanCreationContext(String spanName, Attributes
}
return spanCreationContext;
}

public void testCreateSpanWithInferredAttributes() {
TracingTelemetry tracingTelemetry = new MockTracingTelemetry();
ThreadContext threadContext = new ThreadContext(Settings.EMPTY);
DefaultTracer defaultTracer = new DefaultTracer(
tracingTelemetry,
new ThreadContextBasedTracerContextStorage(threadContext, tracingTelemetry)
);

SpanCreationContext spanCreationContext = buildSpanCreationContext(
"span_name",
Attributes.create().addAttribute(SamplingAttributes.SAMPLER.getValue(), SamplingAttributes.INFERRED_SAMPLER.getValue()),
null
);

Span span = defaultTracer.startSpan(spanCreationContext);

assertThat(defaultTracer.getCurrentSpan(), is(nullValue()));
assertEquals(SamplingAttributes.INFERRED_SAMPLER.getValue(), ((MockSpan) span).getAttribute(SamplingAttributes.SAMPLER.getValue()));
span.endSpan();
}

@SuppressWarnings("unchecked")
public void testCreateSpanWithInferredSampledParent() {
TracingTelemetry tracingTelemetry = new MockTracingTelemetry();
DefaultTracer defaultTracer = new DefaultTracer(
tracingTelemetry,
new ThreadContextBasedTracerContextStorage(new ThreadContext(Settings.EMPTY), tracingTelemetry)
);

SpanCreationContext spanCreationContext = buildSpanCreationContext("span_name", null, null);

Span span = defaultTracer.startSpan(
spanCreationContext,
(Map<String, Collection<String>>) new HashMap<String, Collection<String>>().put(
SamplingAttributes.SAMPLER.getValue(),
Collections.singleton(SamplingAttributes.INFERRED_SAMPLER.getValue())
)
);

try (final SpanScope scope = defaultTracer.withSpanInScope(span)) {
SpanContext parentSpan = defaultTracer.getCurrentSpan();
SpanCreationContext spanCreationContext1 = buildSpanCreationContext("span_name_1", Attributes.EMPTY, parentSpan.getSpan());
Span span1 = defaultTracer.startSpan(spanCreationContext1);
assertEquals("span_name_1", span1.getSpanName());
assertEquals(parentSpan.getSpan(), span1.getParentSpan());
assertEquals(
SamplingAttributes.INFERRED_SAMPLER.getValue(),
((MockSpan) span1).getAttribute(SamplingAttributes.SAMPLER.getValue())
);
span1.endSpan();
} finally {
span.endSpan();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.opensearch.telemetry.TelemetrySettings;
import org.opensearch.telemetry.metrics.exporter.OTelMetricsExporterFactory;
import org.opensearch.telemetry.tracing.exporter.OTelSpanExporterFactory;
import org.opensearch.telemetry.tracing.processor.OTelSpanProcessor;
import org.opensearch.telemetry.tracing.sampler.OTelSamplerFactory;
import org.opensearch.telemetry.tracing.sampler.RequestSampler;

Expand Down Expand Up @@ -117,7 +118,11 @@ private static SdkTracerProvider createSdkTracerProvider(
.build();
}

private static BatchSpanProcessor spanProcessor(Settings settings, SpanExporter spanExporter) {
private static OTelSpanProcessor spanProcessor(Settings settings, SpanExporter spanExporter) {
return new OTelSpanProcessor(batchSpanProcessor(settings, spanExporter));
}

private static BatchSpanProcessor batchSpanProcessor(Settings settings, SpanExporter spanExporter) {
return BatchSpanProcessor.builder(spanExporter)
.setScheduleDelay(TRACER_EXPORTER_DELAY_SETTING.get(settings).getSeconds(), TimeUnit.SECONDS)
.setMaxExportBatchSize(TRACER_EXPORTER_BATCH_SIZE_SETTING.get(settings))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@

package org.opensearch.telemetry.tracing;

import org.opensearch.telemetry.tracing.attributes.SamplingAttributes;

import java.util.Optional;

import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.sdk.trace.ReadableSpan;

/**
* Default implementation of {@link Span} using Otel span. It keeps a reference of OpenTelemetry Span and handles span
Expand All @@ -32,9 +38,33 @@ public OTelSpan(String spanName, Span span, org.opensearch.telemetry.tracing.Spa

@Override
public void endSpan() {
if (isSpanOutlier()) {
Copy link
Collaborator

@reta reta Apr 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How that is suppose to work in case of async calls when parent is finished before the child? (example below)

       try (ScopedSpan startScopedSpan = tracer.startScopedSpan(...)) {
             threadPool.executor(...).execute( ... )
        }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the parent span object is still maintained in the scope of child even if the parent has ended. What we plan to do here is mark the current child's parent and the chain above as one of the parent within the chain would still be alive. As suggested here as well if the parent is still in the recording phase we will go ahead and add the attribute to the parent and mark the parent to be sampled as well. If the parent is not recording then the information about this parent will be stored in event of its parent so that we don't loose on the info of not-recording spans.

Adding the closed parent event to its parent span is still pending and will add in the next revision itself.

Copy link
Collaborator

@reta reta Apr 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the terminology here (what does it mean for span chain to be alive), sorry about that , let me try to illustrate that better:

 try (ScopedSpan startScopedSpan = tracer.startScopedSpan(...)) {
           threadPool.executor(...).execute( ... )
  }
  • since we sample 100%, all spans are sampled
  • the parent span is created (tracer.startScopedSpan) , onSpanStart is called
  • the threadPool captures context (current span as parent) and schedules the work
  • the parent span is ended, onSpanEnd is called

After onSpanEnd, the parent span is immutable, no matter if we still have reference to in from Java code or not - any modification to it have no effect but produce warning.

My question is: since we not be exporting the parent span (due to absence of the expected attributes), what would happen in this case?

Copy link
Contributor Author

@nishchay21 nishchay21 Apr 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So here is how this would work:

  • If the span is an outlier span we will add the sampled attribute to the span.
  • Once the span is marked to be sampled we will mark its parent to be sampled as well. This could now have two possibilities
    • One, the parent is still recording. If that is the case we will go ahead and add the attribute to the parent as well and the parent would be sampled as well.
    • Second, the parent is not recording/has ended in that case we mark its grand-parent if that is alive to be sampled and record this closed span information in the alive grand-parent itself. This way we will not get the span which was closed however the information about the span would still persist in its parent which will be sampled

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I have suspected - we are messing up the trace in unpredictable ways (another red flag), there should be only one possibility: the trace is recorded as it was sampled or not recorded.

Copy link
Contributor Author

@nishchay21 nishchay21 Apr 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes you are correct we will miss the span. However to maintain the lineage we will still maintain the information of closed span in its parent itself.

Copy link
Collaborator

@reta reta Apr 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So here is the thing folks, I my opinion, which I shared on RFC as well:

  • if we cannot preserve span hierarchy, this is no go (the mess would be on user to deal with)
  • if we cannot follow the rules of the library we are using, this is no go

OTEL cannot solve tail sampling on library level, this is documented and acknowledged:

There are other tracing instrumentations that try to implement sampling differently, including adaptive sampling that could tune the sampling rates based on the different factors:

Please consider those as a robust and reliable alternatives, that do not exhibit flaws of this solution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see where you're coming from regarding the potential gaps in the hierarchy, which users may need to address based on their choice of collector and visualization solutions. It's been recognized as a requirement to address such gaps in the Otel specification, and the community appears to be quite receptive to it. If you take a look at the Otel issues provided, you'll notice that even with these gaps, there are still ways to infer missing parent information through parent span events, aiding in understanding the code path and other relevant details.

open-telemetry/opentelemetry-specification#3205
open-telemetry/opentelemetry-specification#3867

In general, there are numerous benefits to this approach, particularly for customers dealing with larger clusters processing tens of thousands of requests per second and managing multiple clusters. It offers insights into problematic areas of code, and the availability of missing span information as part of the event in the parent span can be very helpful in connecting the dots.

Moreover, this won't be the default behavior and can be enabled as needed (though it's crucial to clearly document the shortcomings and benefits of such samplers). There have been requests from Otel users for deferred sampling support and reverse propagation of context, and it's hoped that these features will be available soon, at which point a switch can be made.

Copy link
Collaborator

@reta reta Apr 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gaganjuneja I shared my opinion on the subject many times, please let's get back to this when these features are available, if other maintainers see this is a way to go , I am fine with that, but strong -1 from me moving this further as it stands today.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One last thing, I think we can solve this issue by adding a link between the span and it's grand parent to persist the relationship. Since this is a follows_from span.

markParentForSampling();
nishchay21 marked this conversation as resolved.
Show resolved Hide resolved
}
delegateSpan.end();
}

/*
* This is added temporarily will remove this after the evaluation framework PR.
* This Framework will be used to evaluate a span if that is an outlier or not.
*/
private boolean isSpanOutlier() {
Optional<Boolean> isSpanSampled = Optional.ofNullable(getAttributeBoolean(SamplingAttributes.SAMPLED.getValue()));
Optional<String> isSpanInferredSampled = Optional.ofNullable(getAttributeString(SamplingAttributes.SAMPLER.getValue()));

return isSpanSampled.isPresent()
&& isSpanInferredSampled.isPresent()
&& isSpanInferredSampled.get().equals(SamplingAttributes.INFERRED_SAMPLER.getValue());
}

private void markParentForSampling() {
org.opensearch.telemetry.tracing.Span currentParent = getParentSpan();
while (currentParent != null && currentParent.getAttributeBoolean(SamplingAttributes.SAMPLED.getValue()) == null) {
currentParent.addAttribute(SamplingAttributes.SAMPLED.getValue(), true);
currentParent = currentParent.getParentSpan();
}
}

@Override
public void addAttribute(String key, String value) {
delegateSpan.setAttribute(key, value);
Expand Down Expand Up @@ -77,8 +107,43 @@ public String getSpanId() {
return delegateSpan.getSpanContext().getSpanId();
}

@Override
public String getAttributeString(String key) {
if (delegateSpan != null && delegateSpan instanceof ReadableSpan) return ((ReadableSpan) delegateSpan).getAttribute(
AttributeKey.stringKey(key)
);

return null;
}

@Override
public Boolean getAttributeBoolean(String key) {
if (delegateSpan != null && delegateSpan instanceof ReadableSpan) {
return ((ReadableSpan) delegateSpan).getAttribute(AttributeKey.booleanKey(key));
}

return null;
}

@Override
public Long getAttributeLong(String key) {
if (delegateSpan != null && delegateSpan instanceof ReadableSpan) return ((ReadableSpan) delegateSpan).getAttribute(
AttributeKey.longKey(key)
);

return null;
}

@Override
public Double getAttributeDouble(String key) {
if (delegateSpan != null && delegateSpan instanceof ReadableSpan) return ((ReadableSpan) delegateSpan).getAttribute(
AttributeKey.doubleKey(key)
);

return null;
}

io.opentelemetry.api.trace.Span getDelegateSpan() {
return delegateSpan;
}

}
Loading
Loading