From 55df53faa9ee6c5faad83475f28523615b539162 Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 08:51:29 -0500 Subject: [PATCH 1/3] Expanding lam to lambda in arguments passed to scripts --- research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/ditto/server.py | 2 +- .../cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/fedprox/server.py | 2 +- .../cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/fenda_ditto/server.py | 2 +- research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/mrmtl/server.py | 2 +- research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm | 2 +- research/cifar10/fed_dgga_pfl/ditto/server.py | 2 +- .../cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm | 2 +- research/cifar10/fed_dgga_pfl/fenda_ditto/server.py | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm index f170822b2..aa8e256a0 100644 --- a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm @@ -132,7 +132,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -140,7 +140,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/ditto/server.py b/research/cifar10/adaptive_pfl/ditto/server.py index e3500627b..9d82c9461 100644 --- a/research/cifar10/adaptive_pfl/ditto/server.py +++ b/research/cifar10/adaptive_pfl/ditto/server.py @@ -101,7 +101,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm index 59f77dff8..f95dae6c6 100644 --- a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm @@ -134,7 +134,7 @@ do --run_name ${RUN_NAME} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -144,7 +144,7 @@ do --run_name ${RUN_NAME} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/fedprox/server.py b/research/cifar10/adaptive_pfl/fedprox/server.py index 5032342a0..ecc177203 100644 --- a/research/cifar10/adaptive_pfl/fedprox/server.py +++ b/research/cifar10/adaptive_pfl/fedprox/server.py @@ -136,7 +136,7 @@ def main( required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FedProx loss weight for local model training", default=0.01 + 
"--lambda", action="store", type=float, help="FedProx loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm index 58d39133a..ac4909432 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm @@ -135,7 +135,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -143,7 +143,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/server.py b/research/cifar10/adaptive_pfl/fenda_ditto/server.py index 3a45d46cd..b576852f1 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/server.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/server.py @@ -101,7 +101,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm index b216abf2b..2f708bce0 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm @@ -132,7 +132,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -140,7 +140,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/mrmtl/server.py b/research/cifar10/adaptive_pfl/mrmtl/server.py index f965aa813..f3a5e8c60 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/server.py +++ b/research/cifar10/adaptive_pfl/mrmtl/server.py @@ -125,7 +125,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm index 37ecfafd1..7d95fe7e4 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm @@ -130,7 +130,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --step_size ${STEP_SIZE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & diff --git a/research/cifar10/fed_dgga_pfl/ditto/server.py 
b/research/cifar10/fed_dgga_pfl/ditto/server.py index fa8b46a7c..e95b8543c 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/ditto/server.py @@ -118,7 +118,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--step_size", diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm index 0dfa94b8f..265cf4eee 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm @@ -134,7 +134,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --step_size ${STEP_SIZE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py index 2f373d895..3a3452cbd 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py @@ -118,7 +118,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--step_size", From edc846307f8e74230277de2b7b6213fc4ae7d6fb Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:00:11 -0500 Subject: [PATCH 2/3] Changes to make max retries a touch more clear and cleaner --- fl4health/utils/partitioners.py | 17 +++++++++++------ research/cifar10/preprocess.py | 6 +++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fl4health/utils/partitioners.py b/fl4health/utils/partitioners.py index c49c934f9..b9b2492a2 100644 --- a/fl4health/utils/partitioners.py +++ b/fl4health/utils/partitioners.py @@ -148,7 +148,9 @@ def partition_label_indices( # Dropping the last partition as they are "excess" indices return partitioned_indices[:-1], min_samples, partition_allocations - def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[List[D], Dict[T, np.ndarray]]: + def partition_dataset( + self, original_dataset: D, max_retries: Optional[int] = 5 + ) -> Tuple[List[D], Dict[T, np.ndarray]]: """ Attempts partitioning of the original dataset up to max_retries times. Retries are potentially required if the user requests a minimum number of labels be assigned to each of the partitions. If the drawn Dirichlet @@ -157,16 +159,19 @@ def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[ Args: original_dataset (D): The dataset to be partitioned - max_retries (int, optional): Number of times to attempt to satisfy a user provided minimum - label-associated data points per partition. Defaults to 5. + max_retries (Optional[int], optional): Number of times to attempt to satisfy a user provided minimum + label-associated data points per partition. Set this value to None if you want to retry indefinitely. 
+ Defaults to 5. Raises: ValueError: Throws this error if the retries have been exhausted and the user provided minimum is not met. Returns: - List[D]: The partitioned datasets, length should correspond to self.number_of_partitions - Dict[T, np.ndarray]: The Dirichlet distribution used to partition the data points for each label. + Tuple[List[D], Dict[T, np.ndarray]]: List[D] is the partitioned datasets, length should correspond to + self.number_of_partitions. Dict[T, np.ndarray] is the Dirichlet distribution used to partition the data + points for each label. """ + targets = original_dataset.targets assert targets is not None, "A label-based partitioner requires targets but this dataset has no targets" partitioned_indices = [torch.Tensor([]).int() for _ in range(self.number_of_partitions)] @@ -195,7 +200,7 @@ def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[ f"minimum requested was {self.min_label_examples}. Resampling the partition..." ), ) - if partition_attempts == max_retries: + if max_retries is not None and partition_attempts >= max_retries: raise ValueError( ( f"Max Retries: {max_retries} reached. Partitioning failed to " diff --git a/research/cifar10/preprocess.py b/research/cifar10/preprocess.py index 8e3f9632f..58a478a1e 100644 --- a/research/cifar10/preprocess.py +++ b/research/cifar10/preprocess.py @@ -103,7 +103,7 @@ def preprocess_data( number_of_partitions=num_clients, unique_labels=list(range(10)), beta=beta, min_label_examples=1 ) train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset( - training_set, max_retries=-1 + training_set, max_retries=None ) # Partition validation and test data @@ -111,9 +111,9 @@ def preprocess_data( number_of_partitions=num_clients, unique_labels=list(range(10)), prior_distribution=train_partitioned_dist ) validation_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset( - validation_set, max_retries=-1 + validation_set, max_retries=None ) - test_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset(test_set, max_retries=-1) + test_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset(test_set, max_retries=None) return train_partitioned_datasets, validation_partitioned_datasets, test_partitioned_datasets From 93988c79c5349a62e457ffa21bcb6feef1d574ea Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:19:47 -0500 Subject: [PATCH 3/3] Moving the torch specific determinism to separate optional arguments so as not to disrupt any other workflows that don't need them. Added some documentation. Adding the setting of these to the appropriate places for the pFL experiments. Reducing the hidden size of the cnns used in pFL experiments, as they are quite large. 
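For reference, a minimal sketch of the intended call pattern after this change (the import statement is an assumption based on the module location and is not itself part of the diff below):

    from fl4health.utils.random import set_all_random_seeds

    # Both new flags default to False, so existing workflows that only need seeding behave as before.
    set_all_random_seeds(seed=42)

    # The pFL experiment entry points opt in to stricter determinism. Their slrm scripts additionally
    # export CUBLAS_WORKSPACE_CONFIG=:4096:8 so that forcing deterministic algorithms does not raise CUDA errors.
    set_all_random_seeds(seed=42, use_deterministic_torch_algos=True, disable_torch_benchmarking=True)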
--- fl4health/utils/random.py | 34 +++++++++++++++---- research/cifar10/adaptive_pfl/ditto/client.py | 4 +-- .../ditto/run_fold_experiment.slrm | 4 +++ research/cifar10/adaptive_pfl/ditto/server.py | 4 +-- .../cifar10/adaptive_pfl/fedprox/client.py | 4 +-- .../fedprox/run_fold_experiment.slrm | 4 +++ .../cifar10/adaptive_pfl/fedprox/server.py | 4 +-- .../adaptive_pfl/fenda_ditto/client.py | 4 +-- .../fenda_ditto/run_fold_experiment.slrm | 4 +++ .../adaptive_pfl/fenda_ditto/server.py | 4 +-- research/cifar10/adaptive_pfl/mrmtl/client.py | 4 +-- .../mrmtl/run_fold_experiment.slrm | 4 +++ research/cifar10/adaptive_pfl/mrmtl/server.py | 4 +-- research/cifar10/fed_dgga_pfl/ditto/client.py | 4 +-- .../ditto/run_fold_experiment.slrm | 4 +++ research/cifar10/fed_dgga_pfl/ditto/server.py | 4 +-- research/cifar10/fed_dgga_pfl/fenda/client.py | 4 +-- .../fenda/run_fold_experiment.slrm | 4 +++ research/cifar10/fed_dgga_pfl/fenda/server.py | 4 +-- .../fed_dgga_pfl/fenda_ditto/client.py | 6 ++-- .../fenda_ditto/run_fold_experiment.slrm | 4 +++ .../fed_dgga_pfl/fenda_ditto/server.py | 4 +-- 22 files changed, 84 insertions(+), 36 deletions(-) diff --git a/fl4health/utils/random.py b/fl4health/utils/random.py index 22e70e156..515889a67 100644 --- a/fl4health/utils/random.py +++ b/fl4health/utils/random.py @@ -8,22 +8,43 @@ from flwr.common.logger import log -def set_all_random_seeds(seed: Optional[int] = 42) -> None: - """Set seeds for python random, numpy random, and pytorch random. +def set_all_random_seeds( + seed: Optional[int] = 42, use_deterministic_torch_algos: bool = False, disable_torch_benchmarking: bool = False +) -> None: + """ + Set seeds for python random, numpy random, and pytorch random. It also offers the option to force pytorch to use + deterministic algorithms for certain methods and layers (see: + https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html for more details). Finally, it + allows one to disable cuda benchmarking, which can also affect the determinism of pytorch training outside of + random seeding. For more information on reproducibility in pytorch see: + https://pytorch.org/docs/stable/notes/randomness.html - Will no-op if seed is `None`. + NOTE: If the use_deterministic_torch_algos flag is set to True, you may need to set the environment variable + CUBLAS_WORKSPACE_CONFIG to something like :4096:8, to avoid CUDA errors. Additional documentation may be found + here: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility Args: - seed (int): The seed value to be used for random number generators. Default is 42. + seed (Optional[int], optional): The seed value to be used for random number generators. Default is 42. Seed + setting will no-op if the seed is explicitly set to None. + use_deterministic_torch_algos (bool, optional): Whether or not to set torch.use_deterministic_algorithms to + True. Defaults to False. + disable_torch_benchmarking (bool, optional): Whether to explicitly disable cuda benchmarking in + torch processes. Defaults to False. """ if seed is None: log(INFO, "No seed provided.
Using random seed.") else: - log(INFO, f"Setting seed to {seed} and fixing torch determinism") + log(INFO, f"Setting random seeds to {seed}.") random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.use_deterministic_algorithms(True) + if use_deterministic_torch_algos: + log(INFO, "Setting torch.use_deterministic_algorithms to True.") + # warn_only is set to true so that layers and components without deterministic algorithms available will + # warn the user that they don't exist, but won't take down the process with an exception. + torch.use_deterministic_algorithms(True, warn_only=True) + if disable_torch_benchmarking: + log(INFO, "Disabling CUDA algorithm benchmarking.") torch.backends.cudnn.benchmark = False @@ -37,7 +58,6 @@ def unset_all_random_seeds() -> None: np.random.seed(None) torch.seed() torch.use_deterministic_algorithms(False) - torch.backends.cudnn.benchmark = True def generate_hash(length: int = 8) -> str: diff --git a/research/cifar10/adaptive_pfl/ditto/client.py b/research/cifar10/adaptive_pfl/ditto/client.py index 872e1f481..a2e510a7c 100644 --- a/research/cifar10/adaptive_pfl/ditto/client.py +++ b/research/cifar10/adaptive_pfl/ditto/client.py @@ -65,7 +65,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -130,7 +130,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm index aa8e256a0..9cc88ce05 100644 --- a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use determinsitic algorithms. 
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/ditto/server.py b/research/cifar10/adaptive_pfl/ditto/server.py index 9d82c9461..51c4c4a24 100644 --- a/research/cifar10/adaptive_pfl/ditto/server.py +++ b/research/cifar10/adaptive_pfl/ditto/server.py @@ -45,7 +45,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -118,6 +118,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/fedprox/client.py b/research/cifar10/adaptive_pfl/fedprox/client.py index 18be6b303..624e7e639 100644 --- a/research/cifar10/adaptive_pfl/fedprox/client.py +++ b/research/cifar10/adaptive_pfl/fedprox/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -134,7 +134,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm index f95dae6c6..56dec6e51 100644 --- a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/fedprox/server.py b/research/cifar10/adaptive_pfl/fedprox/server.py index ecc177203..d8859e9f5 100644 --- a/research/cifar10/adaptive_pfl/fedprox/server.py +++ b/research/cifar10/adaptive_pfl/fedprox/server.py @@ -61,7 +61,7 @@ def main( client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -153,6 +153,6 @@ def main( log(INFO, "Adapting the loss weight for model drift via model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.artifact_dir, args.run_name, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/client.py b/research/cifar10/adaptive_pfl/fenda_ditto/client.py index e5682050f..421fba250 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/client.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/client.py @@ -68,7 +68,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) def get_global_model(self, config: Config) -> SequentiallySplitModel: return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) @@ -144,7 +144,7 @@ def get_global_model(self, config: Config) -> SequentiallySplitModel: log(INFO, "Freezing the global feature extractor of the FENDA model") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm index ac4909432..e30981f53 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm @@ -64,6 +64,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/server.py b/research/cifar10/adaptive_pfl/fenda_ditto/server.py index b576852f1..464491f64 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/server.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/server.py @@ -45,7 +45,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -118,6 +118,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/mrmtl/client.py b/research/cifar10/adaptive_pfl/mrmtl/client.py index 69a5e430d..0cc5e1939 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/client.py +++ b/research/cifar10/adaptive_pfl/mrmtl/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.SGD(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -128,7 +128,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm index 2f708bce0..42056ec0f 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/mrmtl/server.py b/research/cifar10/adaptive_pfl/mrmtl/server.py index f3a5e8c60..6a3b57d57 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/server.py +++ b/research/cifar10/adaptive_pfl/mrmtl/server.py @@ -69,7 +69,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -142,6 +142,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/fed_dgga_pfl/ditto/client.py b/research/cifar10/fed_dgga_pfl/ditto/client.py index 872e1f481..a2e510a7c 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/client.py +++ b/research/cifar10/fed_dgga_pfl/ditto/client.py @@ -65,7 +65,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -130,7 +130,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm index 7d95fe7e4..e1e76f248 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm @@ -61,6 +61,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/ditto/server.py b/research/cifar10/fed_dgga_pfl/ditto/server.py index e95b8543c..4bb2d5a6d 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/ditto/server.py @@ -56,7 +56,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the global Ditto model as that is the one being # aggregated by the server. @@ -135,6 +135,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.step_size) diff --git a/research/cifar10/fed_dgga_pfl/fenda/client.py b/research/cifar10/fed_dgga_pfl/fenda/client.py index 1d1abd03e..7c8828465 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/client.py +++ b/research/cifar10/fed_dgga_pfl/fenda/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -128,7 +128,7 @@ def get_model(self, config: Config) -> FendaModel: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm index 9ca21399c..afd97e40b 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm @@ -60,6 +60,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/fenda/server.py b/research/cifar10/fed_dgga_pfl/fenda/server.py index b9408af6d..a880df617 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda/server.py @@ -55,7 +55,7 @@ def main(config: Dict[str, Any], server_address: str, step_size: float) -> None: # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the whole FENDA model fenda_fairness_metric = FairnessMetric(FairnessMetricType.LOSS) @@ -128,6 +128,6 @@ def main(config: Dict[str, Any], server_address: str, step_size: float) -> None: log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.step_size) diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py index 298525e71..792178240 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py @@ -68,10 +68,10 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) def get_global_model(self, config: Config) -> SequentiallySplitModel: - return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -144,7 +144,7 @@ def get_global_model(self, config: Config) -> SequentiallySplitModel: log(INFO, "Freezing the global feature extractor of the FENDA model") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm index 265cf4eee..185032b6f 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm @@ -64,6 +64,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py index 3a3452cbd..2104c559a 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py @@ -56,7 +56,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the global Ditto model as that is the one being # aggregated by the server. @@ -135,6 +135,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.step_size)
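For reference, a minimal sketch of how the updated partition_dataset from patch 2/3 is now called (variable names mirror the research/cifar10/preprocess.py hunk above; this snippet is an illustration and is not part of any diff):

    # Bounded retries: if the minimum label count per partition is still unmet after max_retries draws,
    # partition_dataset raises a ValueError.
    train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset(
        training_set, max_retries=5
    )

    # Unbounded retries: None replaces the old -1 sentinel and disables the retry cap entirely.
    train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset(
        training_set, max_retries=None
    )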