Skip to content

Commit

Permalink
Migrate value.* metrics to gauges (#6476)
Browse files Browse the repository at this point in the history
  • Loading branch information
goto-bus-stop authored Dec 20, 2024
1 parent e28ab3b commit b362206
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 18 deletions.
5 changes: 5 additions & 0 deletions .changesets/maint_renee_migrate_metrics_values.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476))

Updates gauge metrics using the legacy `tracing::info!(value.*)` syntax to OTel instruments.

By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476
29 changes: 25 additions & 4 deletions apollo-router/src/axum_factory/axum_http_server_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,15 @@ use hyper::server::conn::Http;
use hyper::Body;
use itertools::Itertools;
use multimap::MultiMap;
use opentelemetry_api::metrics::MeterProvider as _;
use opentelemetry_api::metrics::ObservableGauge;
use serde::Serialize;
use serde_json::json;
#[cfg(unix)]
use tokio::net::UnixListener;
use tokio::sync::mpsc;
use tokio_rustls::TlsAcceptor;
use tower::layer::layer_fn;
use tower::service_fn;
use tower::BoxError;
use tower::ServiceBuilder;
Expand All @@ -60,6 +63,7 @@ use crate::graphql;
use crate::http_server_factory::HttpServerFactory;
use crate::http_server_factory::HttpServerHandle;
use crate::http_server_factory::Listener;
use crate::metrics::meter_provider;
use crate::plugins::telemetry::SpanMode;
use crate::router::ApolloRouterError;
use crate::router_factory::Endpoint;
Expand All @@ -73,20 +77,29 @@ use crate::Context;

static ACTIVE_SESSION_COUNT: AtomicU64 = AtomicU64::new(0);

fn session_count_instrument() -> ObservableGauge<u64> {
let meter = meter_provider().meter("apollo/router");
meter
.u64_observable_gauge("apollo_router_session_count_active")
.with_description("Amount of in-flight sessions")
.with_callback(|gauge| {
gauge.observe(ACTIVE_SESSION_COUNT.load(Ordering::Relaxed), &[]);
})
.init()
}

struct SessionCountGuard;

impl SessionCountGuard {
fn start() -> Self {
let session_count = ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire) + 1;
tracing::info!(value.apollo_router_session_count_active = session_count,);
ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire);
Self
}
}

impl Drop for SessionCountGuard {
fn drop(&mut self) {
let session_count = ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire) - 1;
tracing::info!(value.apollo_router_session_count_active = session_count,);
ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire);
}
}

Expand Down Expand Up @@ -625,6 +638,14 @@ where
);
}

// Tie the lifetime of the session count instrument to the lifetime of the router
// by moving it into a no-op layer.
let instrument = session_count_instrument();
router = router.layer(layer_fn(move |service| {
let _ = &instrument;
service
}));

router
}

Expand Down
27 changes: 14 additions & 13 deletions apollo-router/src/configuration/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,20 @@ impl InstrumentData {
}

pub(crate) fn populate_config_instruments(&mut self, yaml: &serde_json::Value) {
// This macro will query the config json for a primary metric and optionally metric attributes.

// The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned.
// We should consider converting the rest of the codebase to use jsonpath_rust.

// Example usage:
// populate_usage_instrument!(
// value.apollo.router.config.authorization, // The metric name
// "$.authorization", // The path into the config
// opt.require_authentication, // The name of the attribute
// "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric
// );

/// This macro will query the config json for a primary metric and optionally metric attributes.
///
/// The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned.
/// We should consider converting the rest of the codebase to use jsonpath_rust.
///
/// Example usage:
/// ```rust,ignore
/// populate_config_instrument!(
/// apollo.router.config.authorization, // The metric name
/// "$.authorization", // The path into the config
/// opt.require_authentication, // The name of the attribute
/// "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric
/// );
/// ```
macro_rules! populate_config_instrument {
($($metric:ident).+, $path:literal) => {
let instrument_name = stringify!($($metric).+).to_string();
Expand Down
56 changes: 55 additions & 1 deletion apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Cursor;
use std::num::NonZeroUsize;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use std::time::SystemTime;
use std::time::SystemTimeError;
Expand All @@ -28,12 +29,15 @@ use opentelemetry::trace::TraceId;
use opentelemetry::Key;
use opentelemetry::KeyValue;
use opentelemetry::Value;
use opentelemetry_api::metrics::MeterProvider as _;
use opentelemetry_api::metrics::ObservableGauge;
use prost::Message;
use rand::Rng;
use serde::de::DeserializeOwned;
use thiserror::Error;
use url::Url;

use crate::metrics::meter_provider;
use crate::plugins::telemetry;
use crate::plugins::telemetry::apollo::ErrorConfiguration;
use crate::plugins::telemetry::apollo::ErrorsConfiguration;
Expand Down Expand Up @@ -245,6 +249,47 @@ impl LightSpanData {
}
}

/// An externally updateable gauge for "apollo_router_span_lru_size".
///
/// When observed, it reports the most recently stored value (give or take atomicity looseness).
///
/// This *could* be generalised to any kind of gauge, but we should ideally have gauges that can just
/// observe their accurate value whenever requested. The externally updateable approach is kind of
/// a hack that happens to work here because we only have one place where the value can change, and
/// otherwise we might have to use an inconvenient Mutex or RwLock around the entire LRU cache.
#[derive(Debug)]
struct SpanLruSizeInstrument {
value: Arc<AtomicU64>,
_gauge: ObservableGauge<u64>,
}

impl SpanLruSizeInstrument {
fn new() -> Self {
let value = Arc::new(AtomicU64::new(0));

let meter = meter_provider().meter("apollo/router");
let gauge = meter
.u64_observable_gauge("apollo_router_span_lru_size")
.with_callback({
let value = Arc::clone(&value);
move |gauge| {
gauge.observe(value.load(std::sync::atomic::Ordering::Relaxed), &[]);
}
})
.init();

Self {
value,
_gauge: gauge,
}
}

fn update(&self, value: u64) {
self.value
.store(value, std::sync::atomic::Ordering::Relaxed);
}
}

/// A [`SpanExporter`] that writes to [`Reporter`].
///
/// [`SpanExporter`]: super::SpanExporter
Expand All @@ -253,6 +298,7 @@ impl LightSpanData {
#[derivative(Debug)]
pub(crate) struct Exporter {
spans_by_parent_id: LruCache<SpanId, LruCache<usize, LightSpanData>>,
span_lru_size_instrument: SpanLruSizeInstrument,
#[derivative(Debug = "ignore")]
report_exporter: Option<Arc<ApolloExporter>>,
#[derivative(Debug = "ignore")]
Expand Down Expand Up @@ -325,8 +371,12 @@ impl Exporter {
Sampler::AlwaysOff => 0f64,
},
};

let span_lru_size_instrument = SpanLruSizeInstrument::new();

Ok(Self {
spans_by_parent_id: LruCache::new(buffer_size),
span_lru_size_instrument,
report_exporter: if otlp_tracing_ratio < 1f64 {
Some(Arc::new(ApolloExporter::new(
endpoint,
Expand Down Expand Up @@ -1082,7 +1132,11 @@ impl SpanExporter for Exporter {
);
}
}
tracing::info!(value.apollo_router_span_lru_size = self.spans_by_parent_id.len() as u64,);

// Note this won't be correct anymore if there is any way outside of `.export()`
// to affect the size of the cache.
self.span_lru_size_instrument
.update(self.spans_by_parent_id.len() as u64);

#[allow(clippy::manual_map)] // https://github.com/rust-lang/rust-clippy/issues/8346
let report_exporter = match self.report_exporter.as_ref() {
Expand Down

0 comments on commit b362206

Please sign in to comment.