Skip to content

Commit

Permalink
[indexer-alt] Add checkpoint ingestion lag metrics (#20191)
Browse files Browse the repository at this point in the history
## Description 

This PR adds two ingestion lag metrics: a gauge that tracks the lag of the
latest fetched checkpoint, and a histogram that tracks the distribution of
lag across all fetched checkpoints. Together, these give us the full picture
of the ingestion lag.

## Test plan 

CI.

---

## Release notes

Check each box that your changes affect. If none of the boxes relate to
your changes, release notes aren't required.

For each box you select, include information after the relevant heading
that describes the impact of your changes that a user might notice and
any actions they must take to implement updates.

- [ ] Protocol: 
- [ ] Nodes (Validators and Full nodes): 
- [ ] Indexer: 
- [ ] JSON-RPC: 
- [ ] GraphQL: 
- [ ] CLI: 
- [ ] Rust SDK:
- [ ] REST API:
  • Loading branch information
lxfind authored Nov 12, 2024
1 parent 4ba4b29 commit c4afb26
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
9 changes: 9 additions & 0 deletions crates/sui-indexer-alt/src/ingestion/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,21 @@ impl IngestionClient {
"Fetched checkpoint"
);

let lag =
chrono::Utc::now().timestamp_millis() - data.checkpoint_summary.timestamp_ms as i64;
self.metrics
.ingested_checkpoint_timestamp_lag
.observe((lag as f64) / 1000.0);

let new_seq = data.checkpoint_summary.sequence_number;
let old_seq = self
.latest_ingested_checkpoint
.fetch_max(new_seq, Ordering::Relaxed);
if new_seq > old_seq {
self.metrics.latest_ingested_checkpoint.set(new_seq as i64);
self.metrics
.latest_ingested_checkpoint_timestamp_lag_ms
.set(lag);
}

self.metrics.total_ingested_checkpoints.inc();
Expand Down
24 changes: 24 additions & 0 deletions crates/sui-indexer-alt/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ const INGESTION_LATENCY_SEC_BUCKETS: &[f64] = &[
0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0,
];

/// Bucket boundaries, in seconds, for the ingestion lag histogram. Ingestion lag is the difference
/// between the system time and the timestamp in the checkpoint.
const INGESTION_LAG_SEC_BUCKETS: &[f64] = &[
    // 0.1s up to 1.0s, in 0.05s steps.
    0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55,
    0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0,
    // Whole seconds up to 5s.
    2.0, 3.0, 4.0, 5.0,
    // Progressively wider buckets for large lags.
    10.0, 20.0, 50.0, 100.0, 1000.0,
];

/// Histogram buckets for the distribution of latencies for processing a checkpoint in the indexer
/// (without having to call out to other services).
const PROCESSING_LATENCY_SEC_BUCKETS: &[f64] = &[
Expand Down Expand Up @@ -62,6 +69,8 @@ pub struct IndexerMetrics {
pub total_ingested_not_found_retries: IntCounter,

pub latest_ingested_checkpoint: IntGauge,
pub latest_ingested_checkpoint_timestamp_lag_ms: IntGauge,
pub ingested_checkpoint_timestamp_lag: Histogram,

pub ingested_checkpoint_latency: Histogram,

Expand Down Expand Up @@ -208,6 +217,21 @@ impl IndexerMetrics {
registry,
)
.unwrap(),
latest_ingested_checkpoint_timestamp_lag_ms: register_int_gauge_with_registry!(
"latest_ingested_checkpoint_timestamp_lag_ms",
"Difference between the system timestamp when the latest checkpoint was fetched and the \
timestamp in the checkpoint, in milliseconds",
registry,
)
.unwrap(),
ingested_checkpoint_timestamp_lag: register_histogram_with_registry!(
"indexer_ingested_checkpoint_timestamp_lag",
"Difference between the system timestamp when a checkpoint was fetched and the \
timestamp in each checkpoint, in seconds",
INGESTION_LAG_SEC_BUCKETS.to_vec(),
registry,
)
.unwrap(),
ingested_checkpoint_latency: register_histogram_with_registry!(
"indexer_ingested_checkpoint_latency",
"Time taken to fetch a checkpoint from the remote store, including retries",
Expand Down

0 comments on commit c4afb26

Please sign in to comment.