Skip to content

This issue was moved to a discussion.

You can continue the conversation there. Go to discussion →

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MeterProvider: Excessive creation of metrics instruments #2820

Closed
2 of 3 tasks
aslatter opened this issue Oct 4, 2024 · 6 comments
Closed
2 of 3 tasks

MeterProvider: Excessive creation of metrics instruments #2820

aslatter opened this issue Oct 4, 2024 · 6 comments
Assignees
Labels
bug This issue is a bug.

Comments

@aslatter
Copy link

aslatter commented Oct 4, 2024

Acknowledgements

Describe the bug

When supplying a MeterProvider to the s3 client, the underlying retryer seems to be creating a metrics-instrument on every API call.

Following the OTEL model (and other similar metrics-libraries) I would have expected instruments to be creating once, cached, and then getting new observations on existing instruments on each API call.

For example, from the OTEL documentation on instrumenting a Go application: https://opentelemetry.io/docs/languages/go/instrumentation/#using-counters

Click here!
import (
	"net/http"

	"go.opentelemetry.io/otel/metric"
)

func init() {
	apiCounter, err := meter.Int64Counter(
		"api.counter",
		metric.WithDescription("Number of API calls."),
		metric.WithUnit("{call}"),
	)
	if err != nil {
		panic(err)
	}
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		apiCounter.Add(r.Context(), 1)

		// do some work in an API call
	})
}

Excerpt of my test code:

	ctx := context.Background()

	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		return fmt.Errorf("loading aws config: %s", err)
	}

	// setup demo meter provider (which logs calls)

	s3c := s3.NewFromConfig(cfg, func(o *s3.Options) {
		o.MeterProvider = &demoMeterProvider{}
	})

	// make a few API calls

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

Full code sample:

Click here!
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	"github.com/aws/smithy-go/metrics"
)

func main() {
	if err := mainErr(); err != nil {
		fmt.Fprintf(os.Stderr, "error: %s\n", err)
		os.Exit(1)
	}
}

func mainErr() error {

	ctx := context.Background()

	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		return fmt.Errorf("loading aws config: %s", err)
	}

	// setup demo meter provider (which logs calls)

	s3c := s3.NewFromConfig(cfg, func(o *s3.Options) {
		o.MeterProvider = &demoMeterProvider{}
	})

	// make a few API calls

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	return nil
}

type demoMeterProvider struct{}

// Meter implements metrics.MeterProvider.
func (d *demoMeterProvider) Meter(scope string, opts ...metrics.MeterOption) metrics.Meter {
	return &demoMeter{scope}
}

type demoMeter struct {
	scope string
}

// Float64AsyncCounter implements metrics.Meter.
func (d *demoMeter) Float64AsyncCounter(name string, callback metrics.Float64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Float64AsyncCounter", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64AsyncGauge implements metrics.Meter.
func (d *demoMeter) Float64AsyncGauge(name string, callback metrics.Float64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Float64AsyncGauge", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64AsyncUpDownCounter implements metrics.Meter.
func (d *demoMeter) Float64AsyncUpDownCounter(name string, callback metrics.Float64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Float64AsyncUpDownCounter", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64Counter implements metrics.Meter.
func (d *demoMeter) Float64Counter(name string, opts ...metrics.InstrumentOption) (metrics.Float64Counter, error) {
	d.logInstrument("Float64Counter", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64Gauge implements metrics.Meter.
func (d *demoMeter) Float64Gauge(name string, opts ...metrics.InstrumentOption) (metrics.Float64Gauge, error) {
	d.logInstrument("Float64Gauge", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64Histogram implements metrics.Meter.
func (d *demoMeter) Float64Histogram(name string, opts ...metrics.InstrumentOption) (metrics.Float64Histogram, error) {
	d.logInstrument("Float64Histogram", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Float64UpDownCounter implements metrics.Meter.
func (d *demoMeter) Float64UpDownCounter(name string, opts ...metrics.InstrumentOption) (metrics.Float64UpDownCounter, error) {
	d.logInstrument("Float64UpDownCounter", name)
	return &demoFloatInstrument{d.scope, name}, nil
}

// Int64AsyncCounter implements metrics.Meter.
func (d *demoMeter) Int64AsyncCounter(name string, callback metrics.Int64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Int64AsyncCounter", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64AsyncGauge implements metrics.Meter.
func (d *demoMeter) Int64AsyncGauge(name string, callback metrics.Int64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Int64AsyncGauge", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64AsyncUpDownCounter implements metrics.Meter.
func (d *demoMeter) Int64AsyncUpDownCounter(name string, callback metrics.Int64Callback, opts ...metrics.InstrumentOption) (metrics.AsyncInstrument, error) {
	d.logInstrument("Int64AsyncUpDownCounter", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64Counter implements metrics.Meter.
func (d *demoMeter) Int64Counter(name string, opts ...metrics.InstrumentOption) (metrics.Int64Counter, error) {
	d.logInstrument("Int64Counter", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64Gauge implements metrics.Meter.
func (d *demoMeter) Int64Gauge(name string, opts ...metrics.InstrumentOption) (metrics.Int64Gauge, error) {
	d.logInstrument("Int64Gauge", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64Histogram implements metrics.Meter.
func (d *demoMeter) Int64Histogram(name string, opts ...metrics.InstrumentOption) (metrics.Int64Histogram, error) {
	d.logInstrument("Int64Histogram", name)
	return &demoIntInstrument{d.scope, name}, nil
}

// Int64UpDownCounter implements metrics.Meter.
func (d *demoMeter) Int64UpDownCounter(name string, opts ...metrics.InstrumentOption) (metrics.Int64UpDownCounter, error) {
	d.logInstrument("Int64UpDownCounter", name)
	return &demoIntInstrument{d.scope, name}, nil
}

func (d *demoMeter) logInstrument(typ, name string) {
	log.Printf("creating %s instrument: %s", typ, name)
}

type demoFloatInstrument struct {
	scope string
	name  string
}

// Record implements metrics.Float64Histogram.
func (d *demoFloatInstrument) Record(context.Context, float64, ...metrics.RecordMetricOption) {
	log.Printf("%s Record", d.name)
}

// Sample implements metrics.Float64Gauge.
func (d *demoFloatInstrument) Sample(context.Context, float64, ...metrics.RecordMetricOption) {
	log.Printf("%s Sample", d.name)
}

// Add implements metrics.Float64Counter.
func (d *demoFloatInstrument) Add(context.Context, float64, ...metrics.RecordMetricOption) {
	log.Printf("%s Add", d.name)
}

// Stop implements metrics.AsyncInstrument.
func (d *demoFloatInstrument) Stop() {
	log.Printf("%s Stop", d.name)
}

type demoIntInstrument struct {
	scope string
	name  string
}

// Record implements metrics.Int64Histogram.
func (d *demoIntInstrument) Record(context.Context, int64, ...metrics.RecordMetricOption) {
	log.Printf("%s Record", d.name)
}

// Sample implements metrics.Int64Gauge.
func (d *demoIntInstrument) Sample(context.Context, int64, ...metrics.RecordMetricOption) {
	log.Printf("%s Sample", d.name)
}

// Add implements metrics.Int64Counter.
func (d *demoIntInstrument) Add(context.Context, int64, ...metrics.RecordMetricOption) {
	log.Printf("%s Add", d.name)
}

// Stop implements metrics.AsyncInstrument.
func (d *demoIntInstrument) Stop() {
	log.Printf("%s Stop", d.name)
}

Regression Issue

  • Select this option if this issue appears to be a regression.

Expected Behavior

I expect my demo-meter-provider to log each instrument creation only once.

Current Behavior

The instruments were creating three times - once per API call I made to the S3 service:

Click here!
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.duration
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.serialization_duration
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.auth.resolve_identity_duration
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.resolve_endpoint_duration
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.auth.signing_duration
2024/10/04 13:32:26 creating Float64Histogram instrument: client.call.deserialization_duration
2024/10/04 13:32:26 client.call.serialization_duration Record
2024/10/04 13:32:27 client.call.auth.resolve_identity_duration Record
2024/10/04 13:32:27 client.call.resolve_endpoint_duration Record
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.attempts
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.errors
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.attempt_duration
2024/10/04 13:32:27 client.call.attempts Add
2024/10/04 13:32:27 client.call.auth.signing_duration Record
2024/10/04 13:32:27 client.call.deserialization_duration Record
2024/10/04 13:32:27 client.call.attempt_duration Record
2024/10/04 13:32:27 client.call.duration Record
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.serialization_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.auth.resolve_identity_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.resolve_endpoint_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.auth.signing_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.deserialization_duration
2024/10/04 13:32:27 client.call.serialization_duration Record
2024/10/04 13:32:27 client.call.auth.resolve_identity_duration Record
2024/10/04 13:32:27 client.call.resolve_endpoint_duration Record
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.attempts
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.errors
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.attempt_duration
2024/10/04 13:32:27 client.call.attempts Add
2024/10/04 13:32:27 client.call.auth.signing_duration Record
2024/10/04 13:32:27 client.call.deserialization_duration Record
2024/10/04 13:32:27 client.call.attempt_duration Record
2024/10/04 13:32:27 client.call.duration Record
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.serialization_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.auth.resolve_identity_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.resolve_endpoint_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.auth.signing_duration
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.deserialization_duration
2024/10/04 13:32:27 client.call.serialization_duration Record
2024/10/04 13:32:27 client.call.auth.resolve_identity_duration Record
2024/10/04 13:32:27 client.call.resolve_endpoint_duration Record
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.attempts
2024/10/04 13:32:27 creating Int64Counter instrument: client.call.errors
2024/10/04 13:32:27 creating Float64Histogram instrument: client.call.attempt_duration
2024/10/04 13:32:27 client.call.attempts Add
2024/10/04 13:32:27 client.call.auth.signing_duration Record
2024/10/04 13:32:27 client.call.deserialization_duration Record
2024/10/04 13:32:27 client.call.attempt_duration Record
2024/10/04 13:32:27 client.call.duration Record

Reproduction Steps

$ AWS_REGION=us-east-1 go run ./

Possible Solution

No response

Additional Information/Context

No response

AWS Go SDK V2 Module Versions Used

$ grep github.com/aws/ go.mod
	github.com/aws/aws-sdk-go-v2/config v1.27.40
	github.com/aws/aws-sdk-go-v2/service/s3 v1.64.1
	github.com/aws/aws-sdk-go-v2 v1.31.0 // indirect
	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 // indirect
	github.com/aws/aws-sdk-go-v2/credentials v1.17.38 // indirect
	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14 // indirect
	github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.18 // indirect
	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.18 // indirect
	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect
	github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.5 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.20 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18 // indirect
	github.com/aws/aws-sdk-go-v2/service/sso v1.23.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.27.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/sts v1.31.4 // indirect
	github.com/aws/smithy-go v1.21.0 // indirect

Compiler and Version used

go version go1.23.1 linux/amd64

Operating System and version

Arch Linux x86_64, Linux 6.10.10-arch1-1

@aslatter aslatter added bug This issue is a bug. needs-triage This issue or PR still needs to be triaged. labels Oct 4, 2024
@aslatter
Copy link
Author

aslatter commented Oct 4, 2024

To add more context, if I were writing an adapter for the Go Prometheus Client I would need to write this caching myself inside the adapter to avoid duplicate registration of metrics.

I don't know how heavy-weight the creation of instruments are in other client-SDKs for metrics, so that could also be a consideration.

@lucix-aws
Copy link
Contributor

I'm not familiar with Prometheus - does that implementation not cache instruments/meters by default?

The OTEL SDK for Go explicitly states that it does, which we released an adapter for out-of-the-box. So, since we need to generally support functional options that modify config on a per-request basis (e.g. setting a different one for a specific operation) we took advantage of this and made the operation workflow invoke Meter() (and instrument calls) every time.

Since our clients support functional options, I'm leaning towards taking a hard stance that Meter() and Instrument() calls through the smithy-go APIs should be idempotent on spec.

@aslatter
Copy link
Author

aslatter commented Oct 4, 2024

Using the Prometheus Go SDK, the flow is:

  • Create a metric (using a top-level constructor-function - there is no factory)
  • Register the metric with a registry (this allows observations related to the metric to be collected)
  • Use the metric to record observations
  • From time-to-time, ask the registry for current observations (the "scrape")

It's an error to register two metrics with the same unique-identifier (or rather, it's an error to register two metrics which would result in the same time-series, if that makes sense), and there is no out-of-the-box mechanism to look up previously registered metrics to avoid creating new ones - it is up to the application to keep a reference to the metric.

If we knew that for a given scope + name we would never get different configs (which sounds like what you're suggesting), a hypothetical Prometheus meter-provider-adapter could keep these metrics persistent in-between calls.

@aslatter
Copy link
Author

aslatter commented Oct 4, 2024

It also looks like the OTEL SDK has an adapter to register itself with a Prometheus registry - that might be the smoothest option if you're primarily targeting and testing against the OTEL SDKs.

@aslatter
Copy link
Author

aslatter commented Oct 4, 2024

I'm okay closing this if your primary target is OTEL metrics - the OTEL project has a Prometheus adapter which seems to work okay. For reference, a sketch of what that looks like is below. That assumes the caller is okay hauling around the OTEL SDK for these purposes.

A prometheus adapter plugged directly into the AWS SDK which caches instruments wouldn't be too hard, and might avoid some allocations per SDK-call.

I'm not sure about what I'm doing with histogram-buckets - that might not actually be necessary.

Click here!
package main

import (
	"context"
	"fmt"
	"io"
	"net/http/httptest"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	"github.com/aws/smithy-go/metrics/smithyotelmetrics"

	"go.opentelemetry.io/otel"
	otelprom "go.opentelemetry.io/otel/exporters/prometheus"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	if err := mainErr(); err != nil {
		fmt.Fprintf(os.Stderr, "error: %s\n", err)
		os.Exit(1)
	}
}

func mainErr() error {
	// set up our metric-exporter
	setupOTELExporter()
	// for demo purposes, scrape all prom metrics and dump to stdout
	defer scrapePromMetrics()

	ctx := context.Background()

	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		return fmt.Errorf("loading aws config: %s", err)
	}

	s3c := s3.NewFromConfig(cfg, func(o *s3.Options) {
		o.MeterProvider = smithyotelmetrics.Adapt(otel.GetMeterProvider())
	})

	// make a few API calls

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	_, err = s3c.ListBuckets(ctx, &s3.ListBucketsInput{})
	if err != nil {
		return fmt.Errorf("list buckets: %s", err)
	}

	return nil
}

func setupOTELExporter() {
	// create an otel metric-exporter associated with the
	// default prometheus registry
	metricExporter, err := otelprom.New(
		otelprom.WithNamespace("aws"),
		otelprom.WithoutScopeInfo(),

		// OTEL default buckets assume you're using milliseconds. Substitute defaults
		// appropriate for units of seconds.
		//
		// https://github.com/open-telemetry/opentelemetry-go/issues/5821
		otelprom.WithAggregationSelector(func(ik sdkmetric.InstrumentKind) sdkmetric.Aggregation {
			switch ik {
			case sdkmetric.InstrumentKindHistogram:
				return sdkmetric.AggregationExplicitBucketHistogram{
					Boundaries: prometheus.DefBuckets,
					NoMinMax:   false,
				}
			default:
				return sdkmetric.DefaultAggregationSelector(ik)
			}
		}),
	)
	if err != nil {
		panic(err)
	}

	// create a meter-provider associated with the exporter
	meterProvider := sdkmetric.NewMeterProvider(
		sdkmetric.WithReader(metricExporter),
	)

	otel.SetMeterProvider(meterProvider)
}

// for demo purposes, dump all prom metrics to stdout
func scrapePromMetrics() {
	req := httptest.NewRequest("GET", "/", nil)
	rw := httptest.NewRecorder()
	promhttp.Handler().ServeHTTP(rw, req)
	io.Copy(os.Stdout, rw.Result().Body)
}

@aslatter
Copy link
Author

aslatter commented Oct 5, 2024

Another difference between OTEL and Prometheus is that Prometheus requires that the time-series labels be defined during instrument-creation, but it looks like the APIs provided by smithy-metrics defer that to observation, which makes the direct Prometheus adapter more awkward.

Anyway, I have a basic path forward. If I'm okay with the OTEL SDK as a dependency the code is pretty straightforward. If I want a native Prometheus-client integration, implementing my own middleware is likely going to be less work that fitting the smithy-metrics interfaces.

@RanVaknin RanVaknin self-assigned this Oct 7, 2024
@RanVaknin RanVaknin removed the needs-triage This issue or PR still needs to be triaged. label Oct 7, 2024
@aws aws locked and limited conversation to collaborators Oct 7, 2024
@RanVaknin RanVaknin converted this issue into discussion #2823 Oct 7, 2024

This issue was moved to a discussion.

You can continue the conversation there. Go to discussion →

Labels
bug This issue is a bug.
Projects
None yet
Development

No branches or pull requests

3 participants