From 55df53faa9ee6c5faad83475f28523615b539162 Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 08:51:29 -0500 Subject: [PATCH 1/3] Expanding lam to lambda in arguments passed to scripts --- research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/ditto/server.py | 2 +- .../cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/fedprox/server.py | 2 +- .../cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/fenda_ditto/server.py | 2 +- research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm | 4 ++-- research/cifar10/adaptive_pfl/mrmtl/server.py | 2 +- research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm | 2 +- research/cifar10/fed_dgga_pfl/ditto/server.py | 2 +- .../cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm | 2 +- research/cifar10/fed_dgga_pfl/fenda_ditto/server.py | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm index f170822b2..aa8e256a0 100644 --- a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm @@ -132,7 +132,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -140,7 +140,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/ditto/server.py b/research/cifar10/adaptive_pfl/ditto/server.py index e3500627b..9d82c9461 100644 --- a/research/cifar10/adaptive_pfl/ditto/server.py +++ b/research/cifar10/adaptive_pfl/ditto/server.py @@ -101,7 +101,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm index 59f77dff8..f95dae6c6 100644 --- a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm @@ -134,7 +134,7 @@ do --run_name ${RUN_NAME} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -144,7 +144,7 @@ do --run_name ${RUN_NAME} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/fedprox/server.py b/research/cifar10/adaptive_pfl/fedprox/server.py index 5032342a0..ecc177203 100644 --- a/research/cifar10/adaptive_pfl/fedprox/server.py +++ b/research/cifar10/adaptive_pfl/fedprox/server.py @@ -136,7 +136,7 @@ def main( required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FedProx loss weight for local model training", default=0.01 + 
"--lambda", action="store", type=float, help="FedProx loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm index 58d39133a..ac4909432 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm @@ -135,7 +135,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -143,7 +143,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/server.py b/research/cifar10/adaptive_pfl/fenda_ditto/server.py index 3a45d46cd..b576852f1 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/server.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/server.py @@ -101,7 +101,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm index b216abf2b..2f708bce0 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm @@ -132,7 +132,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --use_adaptation \ > ${SERVER_OUTPUT_FILE} 2>&1 & else @@ -140,7 +140,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & fi diff --git a/research/cifar10/adaptive_pfl/mrmtl/server.py b/research/cifar10/adaptive_pfl/mrmtl/server.py index f965aa813..f3a5e8c60 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/server.py +++ b/research/cifar10/adaptive_pfl/mrmtl/server.py @@ -125,7 +125,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--use_adaptation", diff --git a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm index 37ecfafd1..7d95fe7e4 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm @@ -130,7 +130,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --step_size ${STEP_SIZE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & diff --git a/research/cifar10/fed_dgga_pfl/ditto/server.py 
b/research/cifar10/fed_dgga_pfl/ditto/server.py index fa8b46a7c..e95b8543c 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/ditto/server.py @@ -118,7 +118,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--step_size", diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm index 0dfa94b8f..265cf4eee 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm @@ -134,7 +134,7 @@ do --config_path ${SERVER_CONFIG_PATH} \ --server_address ${SERVER_ADDRESS} \ --seed ${SEED} \ - --lam ${LAM_VALUE} \ + --lambda ${LAM_VALUE} \ --step_size ${STEP_SIZE} \ > ${SERVER_OUTPUT_FILE} 2>&1 & diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py index 2f373d895..3a3452cbd 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py @@ -118,7 +118,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo required=False, ) parser.add_argument( - "--lam", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 + "--lambda", action="store", type=float, help="FENDA Ditto loss weight for local model training", default=0.01 ) parser.add_argument( "--step_size", From edc846307f8e74230277de2b7b6213fc4ae7d6fb Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:00:11 -0500 Subject: [PATCH 2/3] Changes to make max retries a touch more clear and cleaner --- fl4health/utils/partitioners.py | 17 +++++++++++------ research/cifar10/preprocess.py | 6 +++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fl4health/utils/partitioners.py b/fl4health/utils/partitioners.py index c49c934f9..b9b2492a2 100644 --- a/fl4health/utils/partitioners.py +++ b/fl4health/utils/partitioners.py @@ -148,7 +148,9 @@ def partition_label_indices( # Dropping the last partition as they are "excess" indices return partitioned_indices[:-1], min_samples, partition_allocations - def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[List[D], Dict[T, np.ndarray]]: + def partition_dataset( + self, original_dataset: D, max_retries: Optional[int] = 5 + ) -> Tuple[List[D], Dict[T, np.ndarray]]: """ Attempts partitioning of the original dataset up to max_retries times. Retries are potentially required if the user requests a minimum number of labels be assigned to each of the partitions. If the drawn Dirichlet @@ -157,16 +159,19 @@ def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[ Args: original_dataset (D): The dataset to be partitioned - max_retries (int, optional): Number of times to attempt to satisfy a user provided minimum - label-associated data points per partition. Defaults to 5. + max_retries (Optional[int], optional): Number of times to attempt to satisfy a user provided minimum + label-associated data points per partition. Set this value to None if you want to retry indefinitely. 
+ Defaults to 5. Raises: ValueError: Throws this error if the retries have been exhausted and the user provided minimum is not met. Returns: - List[D]: The partitioned datasets, length should correspond to self.number_of_partitions - Dict[T, np.ndarray]: The Dirichlet distribution used to partition the data points for each label. + Tuple[List[D], Dict[T, np.ndarray]]: List[D] is the partitioned datasets, length should correspond to + self.number_of_partitions. Dict[T, np.ndarray] is the Dirichlet distribution used to partition the data + points for each label. """ + targets = original_dataset.targets assert targets is not None, "A label-based partitioner requires targets but this dataset has no targets" partitioned_indices = [torch.Tensor([]).int() for _ in range(self.number_of_partitions)] @@ -195,7 +200,7 @@ def partition_dataset(self, original_dataset: D, max_retries: int = 5) -> Tuple[ f"minimum requested was {self.min_label_examples}. Resampling the partition..." ), ) - if partition_attempts == max_retries: + if max_retries is not None and partition_attempts >= max_retries: raise ValueError( ( f"Max Retries: {max_retries} reached. Partitioning failed to " diff --git a/research/cifar10/preprocess.py b/research/cifar10/preprocess.py index 8e3f9632f..58a478a1e 100644 --- a/research/cifar10/preprocess.py +++ b/research/cifar10/preprocess.py @@ -103,7 +103,7 @@ def preprocess_data( number_of_partitions=num_clients, unique_labels=list(range(10)), beta=beta, min_label_examples=1 ) train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset( - training_set, max_retries=-1 + training_set, max_retries=None ) # Partition validation and test data @@ -111,9 +111,9 @@ def preprocess_data( number_of_partitions=num_clients, unique_labels=list(range(10)), prior_distribution=train_partitioned_dist ) validation_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset( - validation_set, max_retries=-1 + validation_set, max_retries=None ) - test_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset(test_set, max_retries=-1) + test_partitioned_datasets, _ = heterogeneous_partitioner_with_prior.partition_dataset(test_set, max_retries=None) return train_partitioned_datasets, validation_partitioned_datasets, test_partitioned_datasets From 93988c79c5349a62e457ffa21bcb6feef1d574ea Mon Sep 17 00:00:00 2001 From: David Emerson <43939939+emersodb@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:19:47 -0500 Subject: [PATCH 3/3] Moving the torch specific determinism to separate optional arguments so as not to disrupt any other workflows that don't need them. Added some documentation. Adding the setting of these to the appropriate places for the pFL experiments. Reducing the hidden size of the cnns used in pFL experiments, as they are quite large. 
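For reference, a minimal sketch of the intended call pattern after this change (the import statement is an assumption based on the module location and is not itself part of the diff below):

    from fl4health.utils.random import set_all_random_seeds

    # Both new flags default to False, so existing workflows that only need seeding behave as before.
    set_all_random_seeds(seed=42)

    # The pFL experiment entry points opt in to stricter determinism. Their slrm scripts additionally
    # export CUBLAS_WORKSPACE_CONFIG=:4096:8 so that forcing deterministic algorithms does not raise CUDA errors.
    set_all_random_seeds(seed=42, use_deterministic_torch_algos=True, disable_torch_benchmarking=True)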
--- fl4health/utils/random.py | 34 +++++++++++++++---- research/cifar10/adaptive_pfl/ditto/client.py | 4 +-- .../ditto/run_fold_experiment.slrm | 4 +++ research/cifar10/adaptive_pfl/ditto/server.py | 4 +-- .../cifar10/adaptive_pfl/fedprox/client.py | 4 +-- .../fedprox/run_fold_experiment.slrm | 4 +++ .../cifar10/adaptive_pfl/fedprox/server.py | 4 +-- .../adaptive_pfl/fenda_ditto/client.py | 4 +-- .../fenda_ditto/run_fold_experiment.slrm | 4 +++ .../adaptive_pfl/fenda_ditto/server.py | 4 +-- research/cifar10/adaptive_pfl/mrmtl/client.py | 4 +-- .../mrmtl/run_fold_experiment.slrm | 4 +++ research/cifar10/adaptive_pfl/mrmtl/server.py | 4 +-- research/cifar10/fed_dgga_pfl/ditto/client.py | 4 +-- .../ditto/run_fold_experiment.slrm | 4 +++ research/cifar10/fed_dgga_pfl/ditto/server.py | 4 +-- research/cifar10/fed_dgga_pfl/fenda/client.py | 4 +-- .../fenda/run_fold_experiment.slrm | 4 +++ research/cifar10/fed_dgga_pfl/fenda/server.py | 4 +-- .../fed_dgga_pfl/fenda_ditto/client.py | 6 ++-- .../fenda_ditto/run_fold_experiment.slrm | 4 +++ .../fed_dgga_pfl/fenda_ditto/server.py | 4 +-- 22 files changed, 84 insertions(+), 36 deletions(-) diff --git a/fl4health/utils/random.py b/fl4health/utils/random.py index 22e70e156..515889a67 100644 --- a/fl4health/utils/random.py +++ b/fl4health/utils/random.py @@ -8,22 +8,43 @@ from flwr.common.logger import log -def set_all_random_seeds(seed: Optional[int] = 42) -> None: - """Set seeds for python random, numpy random, and pytorch random. +def set_all_random_seeds( + seed: Optional[int] = 42, use_deterministic_torch_algos: bool = False, disable_torch_benchmarking: bool = False +) -> None: + """ + Set seeds for python random, numpy random, and pytorch random. It also offers the option to force pytorch to use + deterministic algorithms for certain methods and layers (see: + https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html for more details). Finally, it + allows one to disable cuda benchmarking, which can also affect the determinism of pytorch training outside of + random seeding. For more information on reproducibility in pytorch see: + https://pytorch.org/docs/stable/notes/randomness.html - Will no-op if seed is `None`. + NOTE: If the use_deterministic_torch_algos flag is set to True, you may need to set the environment variable + CUBLAS_WORKSPACE_CONFIG to something like :4096:8, to avoid CUDA errors. Additional documentation may be found + here: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility Args: - seed (int): The seed value to be used for random number generators. Default is 42. + seed (Optional[int], optional): The seed value to be used for random number generators. Default is 42. Seed + setting will no-op if the seed is explicitly set to None. + use_deterministic_torch_algos (bool, optional): Whether or not to set torch.use_deterministic_algorithms to + True. Defaults to False. + disable_torch_benchmarking (bool, optional): Whether to explicitly disable cuda benchmarking in + torch processes. Defaults to False. """ if seed is None: log(INFO, "No seed provided.
Using random seed.") else: - log(INFO, f"Setting seed to {seed} and fixing torch determinism") + log(INFO, f"Setting random seeds to {seed}.") random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.use_deterministic_algorithms(True) + if use_deterministic_torch_algos: + log(INFO, "Setting torch.use_deterministic_algorithms to True.") + # warn_only is set to true so that layers and components without deterministic algorithms available will + # warn the user that they don't exist, but won't take down the process with an exception. + torch.use_deterministic_algorithms(True, warn_only=True) + if disable_torch_benchmarking: + log(INFO, "Disabling CUDA algorithm benchmarking.") torch.backends.cudnn.benchmark = False @@ -37,7 +58,6 @@ def unset_all_random_seeds() -> None: np.random.seed(None) torch.seed() torch.use_deterministic_algorithms(False) - torch.backends.cudnn.benchmark = True def generate_hash(length: int = 8) -> str: diff --git a/research/cifar10/adaptive_pfl/ditto/client.py b/research/cifar10/adaptive_pfl/ditto/client.py index 872e1f481..a2e510a7c 100644 --- a/research/cifar10/adaptive_pfl/ditto/client.py +++ b/research/cifar10/adaptive_pfl/ditto/client.py @@ -65,7 +65,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -130,7 +130,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm index aa8e256a0..9cc88ce05 100644 --- a/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/ditto/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use determinsitic algorithms. 
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/ditto/server.py b/research/cifar10/adaptive_pfl/ditto/server.py index 9d82c9461..51c4c4a24 100644 --- a/research/cifar10/adaptive_pfl/ditto/server.py +++ b/research/cifar10/adaptive_pfl/ditto/server.py @@ -45,7 +45,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -118,6 +118,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/fedprox/client.py b/research/cifar10/adaptive_pfl/fedprox/client.py index 18be6b303..624e7e639 100644 --- a/research/cifar10/adaptive_pfl/fedprox/client.py +++ b/research/cifar10/adaptive_pfl/fedprox/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -134,7 +134,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm index f95dae6c6..56dec6e51 100644 --- a/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fedprox/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/fedprox/server.py b/research/cifar10/adaptive_pfl/fedprox/server.py index ecc177203..d8859e9f5 100644 --- a/research/cifar10/adaptive_pfl/fedprox/server.py +++ b/research/cifar10/adaptive_pfl/fedprox/server.py @@ -61,7 +61,7 @@ def main( client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -153,6 +153,6 @@ def main( log(INFO, "Adapting the loss weight for model drift via model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.artifact_dir, args.run_name, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/client.py b/research/cifar10/adaptive_pfl/fenda_ditto/client.py index e5682050f..421fba250 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/client.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/client.py @@ -68,7 +68,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) def get_global_model(self, config: Config) -> SequentiallySplitModel: return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) @@ -144,7 +144,7 @@ def get_global_model(self, config: Config) -> SequentiallySplitModel: log(INFO, "Freezing the global feature extractor of the FENDA model") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm index ac4909432..e30981f53 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/fenda_ditto/run_fold_experiment.slrm @@ -64,6 +64,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/fenda_ditto/server.py b/research/cifar10/adaptive_pfl/fenda_ditto/server.py index b576852f1..464491f64 100644 --- a/research/cifar10/adaptive_pfl/fenda_ditto/server.py +++ b/research/cifar10/adaptive_pfl/fenda_ditto/server.py @@ -45,7 +45,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -118,6 +118,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/adaptive_pfl/mrmtl/client.py b/research/cifar10/adaptive_pfl/mrmtl/client.py index 69a5e430d..0cc5e1939 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/client.py +++ b/research/cifar10/adaptive_pfl/mrmtl/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.SGD(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -128,7 +128,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm index 2f708bce0..42056ec0f 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm +++ b/research/cifar10/adaptive_pfl/mrmtl/run_fold_experiment.slrm @@ -62,6 +62,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms.
See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/adaptive_pfl/mrmtl/server.py b/research/cifar10/adaptive_pfl/mrmtl/server.py index f3a5e8c60..6a3b57d57 100644 --- a/research/cifar10/adaptive_pfl/mrmtl/server.py +++ b/research/cifar10/adaptive_pfl/mrmtl/server.py @@ -69,7 +69,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei client_manager = SimpleClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Server performs simple FedAveraging as its server-side optimization strategy strategy = FedAvgWithAdaptiveConstraint( min_fit_clients=config["n_clients"], @@ -142,6 +142,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, adapt_loss_wei log(INFO, "Adapting the loss weight for model drift via global model loss") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.use_adaptation) diff --git a/research/cifar10/fed_dgga_pfl/ditto/client.py b/research/cifar10/fed_dgga_pfl/ditto/client.py index 872e1f481..a2e510a7c 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/client.py +++ b/research/cifar10/fed_dgga_pfl/ditto/client.py @@ -65,7 +65,7 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> nn.Module: - return ConvNet(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -130,7 +130,7 @@ def get_model(self, config: Config) -> nn.Module: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm index 7d95fe7e4..e1e76f248 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/ditto/run_fold_experiment.slrm @@ -61,6 +61,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/ditto/server.py b/research/cifar10/fed_dgga_pfl/ditto/server.py index e95b8543c..4bb2d5a6d 100644 --- a/research/cifar10/fed_dgga_pfl/ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/ditto/server.py @@ -56,7 +56,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNet(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNet(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the global Ditto model as that is the one being # aggregated by the server. @@ -135,6 +135,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.step_size) diff --git a/research/cifar10/fed_dgga_pfl/fenda/client.py b/research/cifar10/fed_dgga_pfl/fenda/client.py index 1d1abd03e..7c8828465 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/client.py +++ b/research/cifar10/fed_dgga_pfl/fenda/client.py @@ -63,7 +63,7 @@ def get_optimizer(self, config: Config) -> Optimizer: return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate) def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -128,7 +128,7 @@ def get_model(self, config: Config) -> FendaModel: log(INFO, f"Beta: {args.beta}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm index 9ca21399c..afd97e40b 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda/run_fold_experiment.slrm @@ -60,6 +60,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/fenda/server.py b/research/cifar10/fed_dgga_pfl/fenda/server.py index b9408af6d..a880df617 100644 --- a/research/cifar10/fed_dgga_pfl/fenda/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda/server.py @@ -55,7 +55,7 @@ def main(config: Dict[str, Any], server_address: str, step_size: float) -> None: # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the whole FENDA model fenda_fairness_metric = FairnessMetric(FairnessMetricType.LOSS) @@ -128,6 +128,6 @@ def main(config: Dict[str, Any], server_address: str, step_size: float) -> None: log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.step_size) diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py index 298525e71..792178240 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/client.py @@ -68,10 +68,10 @@ def get_optimizer(self, config: Config) -> Dict[str, Optimizer]: return {"global": global_optimizer, "local": local_optimizer} def get_model(self, config: Config) -> FendaModel: - return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) def get_global_model(self, config: Config) -> SequentiallySplitModel: - return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1).to(self.device) + return ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512).to(self.device) if __name__ == "__main__": @@ -144,7 +144,7 @@ def get_global_model(self, config: Config) -> SequentiallySplitModel: log(INFO, "Freezing the global feature extractor of the FENDA model") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) # Adding extensive checkpointing for the client checkpoint_dir = os.path.join(args.artifact_dir, args.run_name) diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm index 265cf4eee..185032b6f 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/run_fold_experiment.slrm @@ -64,6 +64,10 @@ if [[ "${SLURM_JOB_PARTITION}" == "t4v2" ]] || \ export NCCL_SOCKET_IFNAME=bond0 fi +# This environment variable must be set in order to force torch to use deterministic algorithms. See documentation +# in fl4health/utils/random.py for more information +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + # Process Inputs SERVER_CONFIG_PATH=$1 diff --git a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py index 3a3452cbd..2104c559a 100644 --- a/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py +++ b/research/cifar10/fed_dgga_pfl/fenda_ditto/server.py @@ -56,7 +56,7 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo # is done right before fit_round.
client_manager = FixedSamplingClientManager() # Initializing the model on the server side - model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1) + model = ConvNetFendaDittoGlobalModel(in_channels=3, use_bn=False, dropout=0.1, hidden=512) # Define a fairness metric based on the loss associated with the global Ditto model as that is the one being # aggregated by the server. @@ -135,6 +135,6 @@ def main(config: Dict[str, Any], server_address: str, lam: float, step_size: flo log(INFO, f"Step Size: {args.step_size}") # Set the random seed for reproducibility - set_all_random_seeds(args.seed) + set_all_random_seeds(args.seed, use_deterministic_torch_algos=True, disable_torch_benchmarking=True) main(config, args.server_address, args.lam, args.step_size)
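For reference, a minimal sketch of how the updated partition_dataset from patch 2/3 is now called (variable names mirror the research/cifar10/preprocess.py hunk above; this snippet is an illustration and is not part of any diff):

    # Bounded retries: if the minimum label count per partition is still unmet after max_retries draws,
    # partition_dataset raises a ValueError.
    train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset(
        training_set, max_retries=5
    )

    # Unbounded retries: None replaces the old -1 sentinel and disables the retry cap entirely.
    train_partitioned_datasets, train_partitioned_dist = heterogeneous_partitioner.partition_dataset(
        training_set, max_retries=None
    )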