From dce741f91bc344eaae73345de9eb52fafdf27800 Mon Sep 17 00:00:00 2001
From: Julien Enoch <julien.e@zettascale.tech>
Date: Tue, 19 Mar 2024 17:28:03 +0100
Subject: [PATCH 1/3] Add congestion_control/wait_before_drop config

---
 DEFAULT_CONFIG.json5                          |   7 ++
 commons/zenoh-config/src/defaults.rs          |   9 ++
 commons/zenoh-config/src/lib.rs               |   7 ++
 io/zenoh-transport/src/common/pipeline.rs     | 109 +++++++++++-------
 io/zenoh-transport/src/manager.rs             |  14 +++
 io/zenoh-transport/src/multicast/link.rs      |   1 +
 .../src/unicast/universal/link.rs             |   1 +
 7 files changed, 104 insertions(+), 44 deletions(-)

diff --git a/DEFAULT_CONFIG.json5 b/DEFAULT_CONFIG.json5
index 1f9094efa6..6797d0075d 100644
--- a/DEFAULT_CONFIG.json5
+++ b/DEFAULT_CONFIG.json5
@@ -261,6 +261,13 @@
             data_low: 4,
             background: 4,
           },
+          /// Congestion occurs when the queue is empty (no available batch).
+          /// Using CongestionControl::Block the caller is blocked until a batch is available and re-insterted into the queue.
+          /// Using CongestionControl::Drop the message might be dropped, depending on conditions configured here.
+          congestion_control: {
+            /// The maximum time in nanosecond to wait for an available batch before dropping the message if still no batch is available.
+            wait_before_drop: 1000000
+          },
           /// The initial exponential backoff time in nanoseconds to allow the batching to eventually progress.
           /// Higher values lead to a more aggressive batching but it will introduce additional latency.
           backoff: 100,
diff --git a/commons/zenoh-config/src/defaults.rs b/commons/zenoh-config/src/defaults.rs
index 7c9184702f..6737d30415 100644
--- a/commons/zenoh-config/src/defaults.rs
+++ b/commons/zenoh-config/src/defaults.rs
@@ -170,6 +170,7 @@ impl Default for QueueConf {
     fn default() -> Self {
         Self {
             size: QueueSizeConf::default(),
+            congestion_control: CongestionControlConf::default(),
             backoff: 100,
         }
     }
@@ -195,6 +196,14 @@ impl Default for QueueSizeConf {
     }
 }
 
+impl Default for CongestionControlConf {
+    fn default() -> Self {
+        Self {
+            wait_before_drop: 1000000,
+        }
+    }
+}
+
 impl Default for LinkRxConf {
     fn default() -> Self {
         Self {
diff --git a/commons/zenoh-config/src/lib.rs b/commons/zenoh-config/src/lib.rs
index 3a0a9c3c20..1e8270b2d8 100644
--- a/commons/zenoh-config/src/lib.rs
+++ b/commons/zenoh-config/src/lib.rs
@@ -352,6 +352,13 @@ validated_struct::validator! {
                             data_low: usize,
                             background: usize,
                         } where (queue_size_validator),
+                        /// Congestion occurs when the queue is empty (no available batch).
+                        /// Using CongestionControl::Block the caller is blocked until a batch is available and re-insterted into the queue.
+                        /// Using CongestionControl::Drop the message might be dropped, depending on conditions configured here.
+                        pub congestion_control: CongestionControlConf {
+                            /// The maximum time in nanosecond to wait for an available batch before dropping the message if still no batch is available.
+                            pub wait_before_drop: u64,
+                        },
                         /// The initial exponential backoff time in nanoseconds to allow the batching to eventually progress.
                         /// Higher values lead to a more aggressive batching but it will introduce additional latency.
                         backoff: u64,
diff --git a/io/zenoh-transport/src/common/pipeline.rs b/io/zenoh-transport/src/common/pipeline.rs
index 6ef385276f..85ca7ec507 100644
--- a/io/zenoh-transport/src/common/pipeline.rs
+++ b/io/zenoh-transport/src/common/pipeline.rs
@@ -19,10 +19,12 @@ use super::{
 };
 use flume::{bounded, Receiver, Sender};
 use ringbuffer_spsc::{RingBuffer, RingBufferReader, RingBufferWriter};
-use std::sync::atomic::{AtomicBool, AtomicU16, Ordering};
 use std::sync::{Arc, Mutex, MutexGuard};
-use std::thread;
 use std::time::Duration;
+use std::{
+    sync::atomic::{AtomicBool, AtomicU16, Ordering},
+    time::Instant,
+};
 use zenoh_buffers::{
     reader::{HasReader, Reader},
     writer::HasWriter,
@@ -63,6 +65,10 @@ impl StageInRefill {
     fn wait(&self) -> bool {
         self.n_ref_r.recv().is_ok()
     }
+
+    fn wait_deadline(&self, instant: Instant) -> bool {
+        self.n_ref_r.recv_deadline(instant).is_ok()
+    }
 }
 
 // Inner structure to link the initial stage with the final stage of the pipeline
@@ -121,13 +127,15 @@ struct StageIn {
 }
 
 impl StageIn {
-    fn push_network_message(&mut self, msg: &mut NetworkMessage, priority: Priority) -> bool {
+    fn push_network_message(
+        &mut self,
+        msg: &mut NetworkMessage,
+        priority: Priority,
+        deadline_before_drop: Option<Instant>,
+    ) -> bool {
         // Lock the current serialization batch.
         let mut c_guard = self.mutex.current();
 
-        // Check congestion control
-        let is_droppable = msg.is_droppable();
-
         macro_rules! zgetbatch_rets {
             ($fragment:expr, $restore_sn:expr) => {
                 loop {
@@ -140,19 +148,24 @@ impl StageIn {
                             }
                             None => {
                                 drop(c_guard);
-                                if !$fragment && is_droppable {
-                                    // Restore the sequence number
-                                    $restore_sn;
-                                    // We are in the congestion scenario
-                                    // The yield is to avoid the writing task to spin
-                                    // indefinitely and monopolize the CPU usage.
-                                    thread::yield_now();
-                                    return false;
-                                } else {
-                                    if !self.s_ref.wait() {
+                                match deadline_before_drop {
+                                    Some(deadline) if !$fragment => {
                                         // Restore the sequence number
                                         $restore_sn;
-                                        return false;
+                                        // We are in the congestion scenario and message is droppable
+                                        // Wait for an available batch until deadline
+                                        if !self.s_ref.wait_deadline(deadline) {
+                                            // Still no available batch - Drop message
+                                            return false
+                                        }
+                                    }
+                                    _ => {
+                                        // Block waiting for an available batch
+                                        if !self.s_ref.wait() {
+                                            // Restore the sequence number
+                                            $restore_sn;
+                                            return false;
+                                        }
                                     }
                                 }
                                 c_guard = self.mutex.current();
@@ -487,24 +500,10 @@ impl StageOut {
 pub(crate) struct TransmissionPipelineConf {
     pub(crate) batch: BatchConfig,
     pub(crate) queue_size: [usize; Priority::NUM],
+    pub(crate) wait_before_drop: Duration,
     pub(crate) backoff: Duration,
 }
 
-impl Default for TransmissionPipelineConf {
-    fn default() -> Self {
-        Self {
-            batch: BatchConfig {
-                mtu: BatchSize::MAX,
-                is_streamed: false,
-                #[cfg(feature = "transport_compression")]
-                is_compression: false,
-            },
-            queue_size: [1; Priority::NUM],
-            backoff: Duration::from_micros(1),
-        }
-    }
-}
-
 // A 2-stage transmission pipeline
 pub(crate) struct TransmissionPipeline;
 impl TransmissionPipeline {
@@ -579,6 +578,7 @@ impl TransmissionPipeline {
         let producer = TransmissionPipelineProducer {
             stage_in: stage_in.into_boxed_slice().into(),
             active: active.clone(),
+            wait_before_drop: config.wait_before_drop,
         };
         let consumer = TransmissionPipelineConsumer {
             stage_out: stage_out.into_boxed_slice(),
@@ -595,6 +595,7 @@ pub(crate) struct TransmissionPipelineProducer {
     // Each priority queue has its own Mutex
     stage_in: Arc<[Mutex<StageIn>]>,
     active: Arc<AtomicBool>,
+    wait_before_drop: Duration,
 }
 
 impl TransmissionPipelineProducer {
@@ -607,9 +608,15 @@ impl TransmissionPipelineProducer {
         } else {
             (0, Priority::default())
         };
+        // If message is droppable, compute a deadline after which the sample could be dropped
+        let deadline_before_drop = if msg.is_droppable() {
+            Some(Instant::now() + self.wait_before_drop)
+        } else {
+            None
+        };
         // Lock the channel. We are the only one that will be writing on it.
         let mut queue = zlock!(self.stage_in[idx]);
-        queue.push_network_message(&mut msg, priority)
+        queue.push_network_message(&mut msg, priority, deadline_before_drop)
     }
 
     #[inline]
@@ -732,7 +739,7 @@ mod tests {
     const SLEEP: Duration = Duration::from_millis(100);
     const TIMEOUT: Duration = Duration::from_secs(60);
 
-    const CONFIG: TransmissionPipelineConf = TransmissionPipelineConf {
+    const CONFIG_STREAMED: TransmissionPipelineConf = TransmissionPipelineConf {
         batch: BatchConfig {
             mtu: BatchSize::MAX,
             is_streamed: true,
@@ -740,6 +747,19 @@ mod tests {
             is_compression: true,
         },
         queue_size: [1; Priority::NUM],
+        wait_before_drop: Duration::from_millis(1),
+        backoff: Duration::from_micros(1),
+    };
+
+    const CONFIG_NOT_STREAMED: TransmissionPipelineConf = TransmissionPipelineConf {
+        batch: BatchConfig {
+            mtu: BatchSize::MAX,
+            is_streamed: false,
+            #[cfg(feature = "transport_compression")]
+            is_compression: false,
+        },
+        queue_size: [1; Priority::NUM],
+        wait_before_drop: Duration::from_millis(1),
         backoff: Duration::from_micros(1),
     };
 
@@ -847,10 +867,8 @@ mod tests {
             // Compute the number of messages to send
             let num_msg = max_msgs.min(bytes / ps);
 
-            let (producer, consumer) = TransmissionPipeline::make(
-                TransmissionPipelineConf::default(),
-                priorities.as_slice(),
-            );
+            let (producer, consumer) =
+                TransmissionPipeline::make(CONFIG_NOT_STREAMED, priorities.as_slice());
 
             let t_c = task::spawn(async move {
                 consume(consumer, num_msg).await;
@@ -874,7 +892,7 @@ mod tests {
             // Make sure to put only one message per batch: set the payload size
             // to half of the batch in such a way the serialized zenoh message
             // will be larger then half of the batch size (header + payload).
-            let payload_size = (CONFIG.batch.mtu / 2) as usize;
+            let payload_size = (CONFIG_STREAMED.batch.mtu / 2) as usize;
 
             // Send reliable messages
             let key = "test".into();
@@ -900,7 +918,7 @@ mod tests {
 
             // The last push should block since there shouldn't any more batches
             // available for serialization.
-            let num_msg = 1 + CONFIG.queue_size[0];
+            let num_msg = 1 + CONFIG_STREAMED.queue_size[0];
             for i in 0..num_msg {
                 println!(
                     "Pipeline Blocking [>>>]: ({id}) Scheduling message #{i} with payload size of {payload_size} bytes"
@@ -919,7 +937,7 @@ mod tests {
         let tct = TransportPriorityTx::make(Bits::from(TransportSn::MAX))?;
         let priorities = vec![tct];
         let (producer, mut consumer) =
-            TransmissionPipeline::make(TransmissionPipelineConf::default(), priorities.as_slice());
+            TransmissionPipeline::make(CONFIG_NOT_STREAMED, priorities.as_slice());
 
         let counter = Arc::new(AtomicUsize::new(0));
 
@@ -937,10 +955,12 @@ mod tests {
         // Wait to have sent enough messages and to have blocked
         println!(
             "Pipeline Blocking [---]: waiting to have {} messages being scheduled",
-            CONFIG.queue_size[Priority::MAX as usize]
+            CONFIG_STREAMED.queue_size[Priority::MAX as usize]
         );
         let check = async {
-            while counter.load(Ordering::Acquire) < CONFIG.queue_size[Priority::MAX as usize] {
+            while counter.load(Ordering::Acquire)
+                < CONFIG_STREAMED.queue_size[Priority::MAX as usize]
+            {
                 tokio::time::sleep(SLEEP).await;
             }
         };
@@ -972,7 +992,8 @@ mod tests {
         // Queue
         let tct = TransportPriorityTx::make(Bits::from(TransportSn::MAX)).unwrap();
         let priorities = vec![tct];
-        let (producer, mut consumer) = TransmissionPipeline::make(CONFIG, priorities.as_slice());
+        let (producer, mut consumer) =
+            TransmissionPipeline::make(CONFIG_STREAMED, priorities.as_slice());
         let count = Arc::new(AtomicUsize::new(0));
         let size = Arc::new(AtomicUsize::new(0));
 
diff --git a/io/zenoh-transport/src/manager.rs b/io/zenoh-transport/src/manager.rs
index 9bf40e5fd3..ae6a4f64e8 100644
--- a/io/zenoh-transport/src/manager.rs
+++ b/io/zenoh-transport/src/manager.rs
@@ -94,6 +94,7 @@ pub struct TransportManagerConfig {
     pub whatami: WhatAmI,
     pub resolution: Resolution,
     pub batch_size: u16,
+    pub wait_before_drop: Duration,
     pub queue_size: [usize; Priority::NUM],
     pub queue_backoff: Duration,
     pub defrag_buff_size: usize,
@@ -122,6 +123,7 @@ pub struct TransportManagerBuilder {
     whatami: WhatAmI,
     resolution: Resolution,
     batch_size: u16,
+    wait_before_drop: Duration,
     queue_size: QueueSizeConf,
     queue_backoff: Duration,
     defrag_buff_size: usize,
@@ -154,6 +156,11 @@ impl TransportManagerBuilder {
         self
     }
 
+    pub fn wait_before_drop(mut self, wait_before_drop: Duration) -> Self {
+        self.wait_before_drop = wait_before_drop;
+        self
+    }
+
     pub fn queue_size(mut self, queue_size: QueueSizeConf) -> Self {
         self.queue_size = queue_size;
         self
@@ -212,7 +219,11 @@ impl TransportManagerBuilder {
         self = self.batch_size(*link.tx().batch_size());
         self = self.defrag_buff_size(*link.rx().max_message_size());
         self = self.link_rx_buffer_size(*link.rx().buffer_size());
+        self = self.wait_before_drop(Duration::from_nanos(
+            *link.tx().queue().congestion_control().wait_before_drop(),
+        ));
         self = self.queue_size(link.tx().queue().size().clone());
+        self = self.queue_backoff(Duration::from_nanos(*link.tx().queue().backoff()));
         self = self.tx_threads(*link.tx().threads());
         self = self.protocols(link.protocols().clone());
 
@@ -259,6 +270,7 @@ impl TransportManagerBuilder {
             whatami: self.whatami,
             resolution: self.resolution,
             batch_size: self.batch_size,
+            wait_before_drop: self.wait_before_drop,
             queue_size,
             queue_backoff: self.queue_backoff,
             defrag_buff_size: self.defrag_buff_size,
@@ -292,12 +304,14 @@ impl Default for TransportManagerBuilder {
         let link_rx = LinkRxConf::default();
         let queue = QueueConf::default();
         let backoff = *queue.backoff();
+        let wait_before_drop = *queue.congestion_control().wait_before_drop();
         Self {
             version: VERSION,
             zid: ZenohId::rand(),
             whatami: zenoh_config::defaults::mode,
             resolution: Resolution::default(),
             batch_size: BatchSize::MAX,
+            wait_before_drop: Duration::from_nanos(wait_before_drop),
             queue_size: queue.size,
             queue_backoff: Duration::from_nanos(backoff),
             defrag_buff_size: *link_rx.max_message_size(),
diff --git a/io/zenoh-transport/src/multicast/link.rs b/io/zenoh-transport/src/multicast/link.rs
index 3565f747a0..bfbdd3af61 100644
--- a/io/zenoh-transport/src/multicast/link.rs
+++ b/io/zenoh-transport/src/multicast/link.rs
@@ -320,6 +320,7 @@ impl TransportLinkMulticastUniversal {
             let tpc = TransmissionPipelineConf {
                 batch: self.link.config.batch,
                 queue_size: self.transport.manager.config.queue_size,
+                wait_before_drop: self.transport.manager.config.wait_before_drop,
                 backoff: self.transport.manager.config.queue_backoff,
             };
             // The pipeline
diff --git a/io/zenoh-transport/src/unicast/universal/link.rs b/io/zenoh-transport/src/unicast/universal/link.rs
index 5c07b69738..93a6c717dd 100644
--- a/io/zenoh-transport/src/unicast/universal/link.rs
+++ b/io/zenoh-transport/src/unicast/universal/link.rs
@@ -59,6 +59,7 @@ impl TransportLinkUnicastUniversal {
                 is_compression: link.config.batch.is_compression,
             },
             queue_size: transport.manager.config.queue_size,
+            wait_before_drop: transport.manager.config.wait_before_drop,
             backoff: transport.manager.config.queue_backoff,
         };
 

From 21f6830ac7b4e9edca6d6616bc7dbb0c2e0afe42 Mon Sep 17 00:00:00 2001
From: Julien Enoch <julien.e@zettascale.tech>
Date: Wed, 20 Mar 2024 12:17:34 +0100
Subject: [PATCH 2/3] Fix sequence number restoration

---
 io/zenoh-transport/src/common/pipeline.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/io/zenoh-transport/src/common/pipeline.rs b/io/zenoh-transport/src/common/pipeline.rs
index 85ca7ec507..9df7632f7a 100644
--- a/io/zenoh-transport/src/common/pipeline.rs
+++ b/io/zenoh-transport/src/common/pipeline.rs
@@ -150,19 +150,20 @@ impl StageIn {
                                 drop(c_guard);
                                 match deadline_before_drop {
                                     Some(deadline) if !$fragment => {
-                                        // Restore the sequence number
-                                        $restore_sn;
                                         // We are in the congestion scenario and message is droppable
                                         // Wait for an available batch until deadline
                                         if !self.s_ref.wait_deadline(deadline) {
-                                            // Still no available batch - Drop message
+                                            // Still no available batch.
+                                            // Restore the sequence number and drop the message
+                                            $restore_sn;
                                             return false
                                         }
                                     }
                                     _ => {
                                         // Block waiting for an available batch
                                         if !self.s_ref.wait() {
-                                            // Restore the sequence number
+                                            // Some error prevented the queue to wait and give back an available batch
+                                            // Restore the sequence number and drop the message
                                             $restore_sn;
                                             return false;
                                         }

From 3fc210349127b39136f282d78b160afe7062c4e8 Mon Sep 17 00:00:00 2001
From: Julien Enoch <julien.e@zettascale.tech>
Date: Wed, 20 Mar 2024 14:34:03 +0100
Subject: [PATCH 3/3] Change wait_before_drop unit from nanos to micros

---
 DEFAULT_CONFIG.json5                 | 4 ++--
 commons/zenoh-config/src/defaults.rs | 2 +-
 commons/zenoh-config/src/lib.rs      | 2 +-
 io/zenoh-transport/src/manager.rs    | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/DEFAULT_CONFIG.json5 b/DEFAULT_CONFIG.json5
index 6797d0075d..4fdaa34e88 100644
--- a/DEFAULT_CONFIG.json5
+++ b/DEFAULT_CONFIG.json5
@@ -265,8 +265,8 @@
           /// Using CongestionControl::Block the caller is blocked until a batch is available and re-insterted into the queue.
           /// Using CongestionControl::Drop the message might be dropped, depending on conditions configured here.
           congestion_control: {
-            /// The maximum time in nanosecond to wait for an available batch before dropping the message if still no batch is available.
-            wait_before_drop: 1000000
+            /// The maximum time in microseconds to wait for an available batch before dropping the message if still no batch is available.
+            wait_before_drop: 1000
           },
           /// The initial exponential backoff time in nanoseconds to allow the batching to eventually progress.
           /// Higher values lead to a more aggressive batching but it will introduce additional latency.
diff --git a/commons/zenoh-config/src/defaults.rs b/commons/zenoh-config/src/defaults.rs
index 6737d30415..f1973d2c0d 100644
--- a/commons/zenoh-config/src/defaults.rs
+++ b/commons/zenoh-config/src/defaults.rs
@@ -199,7 +199,7 @@ impl Default for QueueSizeConf {
 impl Default for CongestionControlConf {
     fn default() -> Self {
         Self {
-            wait_before_drop: 1000000,
+            wait_before_drop: 1000,
         }
     }
 }
diff --git a/commons/zenoh-config/src/lib.rs b/commons/zenoh-config/src/lib.rs
index 1e8270b2d8..2b5485fa6b 100644
--- a/commons/zenoh-config/src/lib.rs
+++ b/commons/zenoh-config/src/lib.rs
@@ -356,7 +356,7 @@ validated_struct::validator! {
                         /// Using CongestionControl::Block the caller is blocked until a batch is available and re-insterted into the queue.
                         /// Using CongestionControl::Drop the message might be dropped, depending on conditions configured here.
                         pub congestion_control: CongestionControlConf {
-                            /// The maximum time in nanosecond to wait for an available batch before dropping the message if still no batch is available.
+                            /// The maximum time in microseconds to wait for an available batch before dropping the message if still no batch is available.
                             pub wait_before_drop: u64,
                         },
                         /// The initial exponential backoff time in nanoseconds to allow the batching to eventually progress.
diff --git a/io/zenoh-transport/src/manager.rs b/io/zenoh-transport/src/manager.rs
index ae6a4f64e8..f16a68cfba 100644
--- a/io/zenoh-transport/src/manager.rs
+++ b/io/zenoh-transport/src/manager.rs
@@ -219,7 +219,7 @@ impl TransportManagerBuilder {
         self = self.batch_size(*link.tx().batch_size());
         self = self.defrag_buff_size(*link.rx().max_message_size());
         self = self.link_rx_buffer_size(*link.rx().buffer_size());
-        self = self.wait_before_drop(Duration::from_nanos(
+        self = self.wait_before_drop(Duration::from_micros(
             *link.tx().queue().congestion_control().wait_before_drop(),
         ));
         self = self.queue_size(link.tx().queue().size().clone());
@@ -311,7 +311,7 @@ impl Default for TransportManagerBuilder {
             whatami: zenoh_config::defaults::mode,
             resolution: Resolution::default(),
             batch_size: BatchSize::MAX,
-            wait_before_drop: Duration::from_nanos(wait_before_drop),
+            wait_before_drop: Duration::from_micros(wait_before_drop),
             queue_size: queue.size,
             queue_backoff: Duration::from_nanos(backoff),
             defrag_buff_size: *link_rx.max_message_size(),